Repository: lxtGH/Video-K-Net Branch: main Commit: a69340321f47 Files: 237 Total size: 1.8 MB Directory structure: gitextract_0ef7ckct/ ├── .gitignore ├── DATASET.md ├── LICENSE ├── README.md ├── configs/ │ ├── det/ │ │ ├── _base_/ │ │ │ ├── datasets/ │ │ │ │ ├── cityscapes_panoptic.py │ │ │ │ ├── cityscapes_step.py │ │ │ │ ├── cityscapes_vps_clips.py │ │ │ │ ├── cityscapes_vps_clips_trainval.py │ │ │ │ ├── coco_instance.py │ │ │ │ ├── coco_panoptic.py │ │ │ │ ├── coco_panoptic_instance_annotations.py │ │ │ │ ├── kitti_step_dvps.py │ │ │ │ ├── kitti_step_vps.py │ │ │ │ ├── kitti_step_vps_trainval.py │ │ │ │ ├── mapillary_panoptic.py │ │ │ │ └── vipseg_dvps.py │ │ │ ├── default_runtime.py │ │ │ ├── models/ │ │ │ │ ├── knet_citystep_s3_r50_fpn.py │ │ │ │ ├── knet_kitti_step_s3_r50_fpn.py │ │ │ │ ├── knet_s3_r50_deformable_fpn.py │ │ │ │ ├── knet_s3_r50_fpn.py │ │ │ │ ├── knet_s3_r50_fpn_panoptic.py │ │ │ │ ├── knet_vipseg_s3_r50_fpn.py │ │ │ │ └── video_knet_s3_r50_fpn_panoptic.py │ │ │ └── schedules/ │ │ │ ├── schedule_10e.py │ │ │ └── schedule_1x.py │ │ ├── coco/ │ │ │ ├── knet_s3_r50_deformable_fpn_ms-3x_coco.py │ │ │ ├── knet_s3_r50_fpn_ms-3x_coco-panoptic.py │ │ │ ├── knet_s3_r50_fpn_ms-3x_coco.py │ │ │ └── knet_s3_swin-b_deformable_fpn_ms-3x_coco.py │ │ ├── common/ │ │ │ ├── lsj_coco_panoptic_50e.py │ │ │ ├── mstrain_3x_coco_instance.py │ │ │ ├── mstrain_3x_coco_panoptic_inst_anno.py │ │ │ ├── mstrain_3x_coco_panoptic_inst_anno_detr_aug.py │ │ │ └── mstrain_64e_city_panoptic.py │ │ ├── knet_cityscapes_step/ │ │ │ ├── knet_s3_r50_fpn.py │ │ │ ├── knet_s3_swin_b_fpn.py │ │ │ └── knet_s3_swin_l_fpn.py │ │ ├── video_knet_kitti_step/ │ │ │ ├── video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py │ │ │ ├── video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train_8e.py │ │ │ ├── video_knet_s3_swinb_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py │ │ │ ├── 
video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py │ │ │ └── video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_update_conv_short_track_fc.py │ │ └── video_knet_vipseg/ │ │ ├── video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py │ │ └── video_knet_s3_swin_b_rpn_vipseg_mask_embed_link_ffn_joint_train_8e.py │ └── video_knet_vis/ │ ├── _base_/ │ │ ├── datasets/ │ │ │ ├── coco_instance.py │ │ │ └── youtubevis_2019.py │ │ ├── default_runtime.py │ │ ├── models/ │ │ │ ├── knet_track_r50.py │ │ │ └── knet_track_r50_deformablefpn.py │ │ └── schedules/ │ │ ├── schedule_0.75x.py │ │ ├── schedule_1x.py │ │ └── schedule_8e.py │ ├── common/ │ │ └── mstrain_3x_coco_instance.py │ └── video_knet_vis/ │ ├── knet_track_r50_1x_youtubevis.py │ ├── knet_track_r50_deformable_fpn_1x_youtubevis.py │ ├── knet_track_swinb_1x_youtubevis_8e.py │ └── knet_track_swinb_deformable_1x_youtubevis.py ├── external/ │ ├── cityscape_panoptic.py │ ├── cityscapes_step.py │ ├── cityscapes_vps.py │ ├── coco_panoptic.py │ ├── dataset/ │ │ ├── dvps_pipelines/ │ │ │ ├── __init__.py │ │ │ ├── loading.py │ │ │ ├── transforms.py │ │ │ └── tricks.py │ │ ├── forecasting_pipelines/ │ │ │ ├── __init__.py │ │ │ ├── loading.py │ │ │ └── transforms.py │ │ ├── mIoU.py │ │ └── pipelines/ │ │ ├── __init__.py │ │ ├── formatting.py │ │ ├── loading.py │ │ ├── test_time_aug.py │ │ └── transforms.py │ ├── evalhooks.py │ ├── ext/ │ │ ├── mask.py │ │ └── ytvos.py │ ├── fcn_mask_head.py │ ├── kitti_step_dvps.py │ ├── panoptic_fpn.py │ ├── panoptic_head.py │ ├── semantic_seg_head.py │ ├── semkitti_dvps.py │ ├── test.py │ ├── train.py │ ├── utils.py │ └── vipseg_dvps.py ├── knet/ │ ├── __init__.py │ ├── cross_entropy_loss.py │ ├── det/ │ │ ├── dice_loss.py │ │ ├── kernel_head.py │ │ ├── kernel_iter_head.py │ │ ├── kernel_update_head.py │ │ ├── knet.py │ │ ├── mask_hungarian_assigner.py │ │ ├── mask_pseudo_sampler.py │ │ ├── msdeformattn_decoder.py │ │ ├── 
semantic_fpn_wrapper.py │ │ └── utils.py │ ├── kernel_updator.py │ └── video/ │ ├── __init__.py │ ├── dice_loss.py │ ├── kernel_head.py │ ├── kernel_iter_head.py │ ├── kernel_update_head.py │ ├── knet.py │ ├── knet_quansi_dense.py │ ├── knet_quansi_dense_embed_fc.py │ ├── knet_quansi_dense_embed_fc_joint_train.py │ ├── knet_quansi_dense_embed_fc_toy_exp.py │ ├── knet_quansi_dense_roi_gt_box.py │ ├── knet_quansi_dense_roi_gt_box_joint_train.py │ ├── knet_track_head.py │ ├── knet_track_head_roi_align.py │ ├── knet_uni_track.py │ ├── mask_hungarian_assigner.py │ ├── mask_pseudo_sampler.py │ ├── qdtrack/ │ │ ├── builder.py │ │ ├── losses/ │ │ │ ├── __init__.py │ │ │ ├── l2_loss.py │ │ │ └── multipos_cross_entropy_loss.py │ │ ├── track/ │ │ │ ├── __init__.py │ │ │ ├── similarity.py │ │ │ └── transforms.py │ │ └── trackers/ │ │ ├── __init__.py │ │ ├── quasi_dense_embed_tracker.py │ │ └── tao_tracker.py │ ├── track_heads.py │ ├── tracker.py │ └── util.py ├── knet_vis/ │ ├── __init__.py │ ├── det/ │ │ ├── __init__.py │ │ ├── kernel_head.py │ │ ├── kernel_iter_head.py │ │ ├── kernel_update_head.py │ │ ├── knet.py │ │ ├── mask_hungarian_assigner.py │ │ ├── mask_pseudo_sampler.py │ │ ├── semantic_fpn_wrapper.py │ │ └── utils.py │ ├── kernel_updator.py │ └── tracker/ │ ├── __init__.py │ ├── kernel_frame_head.py │ ├── kernel_frame_iter_head.py │ ├── kernel_head.py │ ├── kernel_iter_head.py │ ├── kernel_update_head.py │ ├── mask_hungarian_assigner.py │ ├── positional_encoding.py │ ├── semantic_fpn_wrapper3D.py │ └── track.py ├── mmtrack/ │ ├── datasets/ │ │ ├── coco_video_dataset.py │ │ ├── parsers/ │ │ │ ├── __init__.py │ │ │ └── coco_video_parser.py │ │ └── youtube_vis_dataset.py │ ├── pipelines/ │ │ ├── __init__.py │ │ ├── formatting.py │ │ ├── loading.py │ │ ├── test_time_aug.py │ │ └── transforms.py │ └── transform.py ├── scripts/ │ ├── kitti_step_prepare.py │ └── visualizer.py ├── swin/ │ ├── DetectRS.py │ ├── ckpt_convert.py │ ├── mix_transformer.py │ ├── 
swin_checkpoint.py │ ├── swin_transformer.py │ ├── swin_transformer_rfp.py │ └── transformer.py ├── tools/ │ ├── dataset/ │ │ ├── cityscapes_instance_idmap.py │ │ └── youtubevis2coco.py │ ├── dist_step_test.sh │ ├── dist_test.sh │ ├── dist_train.sh │ ├── dist_train_new.sh │ ├── dist_vps_test.sh │ ├── docker.sh │ ├── eval_dstq.py │ ├── eval_dstq_step.py │ ├── eval_dstq_vipseg.py │ ├── eval_dvpq_step.py │ ├── eval_dvpq_vipseg.py │ ├── flops_counter.py │ ├── get_flops.py │ ├── inference_kitti_step.sh │ ├── slurm_test.sh │ ├── slurm_test_dvps.sh │ ├── slurm_test_step.sh │ ├── slurm_test_vis.sh │ ├── slurm_test_vps.sh │ ├── slurm_train.sh │ ├── test.py │ ├── test_dvps.py │ ├── test_step.py │ ├── test_vps.py │ ├── train.py │ ├── utils/ │ │ ├── DSTQ.py │ │ ├── STQ.py │ │ └── cityscapesvps_eval.py │ └── visualization.py ├── tools_vis/ │ ├── apis/ │ │ ├── __init__.py │ │ └── test.py │ ├── dist_test_whole_video.sh │ ├── docker.sh │ ├── slurm_test_vis.sh │ ├── test.py │ └── test_whole_video.py └── unitrack/ ├── __init__.py ├── basetrack.py ├── box.py ├── core/ │ ├── __init__.py │ ├── association/ │ │ ├── __init__.py │ │ └── matching.py │ ├── motion/ │ │ └── kalman_filter.py │ └── propagation/ │ ├── __init__.py │ ├── propagate_box.py │ ├── propagate_mask.py │ └── propagate_pose.py ├── mask.py ├── mask_with_train_embs.py ├── model/ │ ├── __init__.py │ ├── functional.py │ ├── hrnet.py │ ├── model.py │ ├── random_feat_generator.py │ └── resnet.py ├── multitracker.py └── utils/ ├── __init__.py ├── box.py ├── io.py ├── log.py ├── mask.py ├── meter.py ├── palette.py └── visualize.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ 
lib/ lib64/ parts/ sdist/ var/ wheels/ work_dir/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ data/ data .vscode .idea .DS_Store # custom *.pkl *.pkl.json *.log.json # Pytorch *.pth *.py~ *.sh~ debug/* vis/ analysis/* pretrain/* ================================================ FILE: DATASET.md ================================================ Please prepare the data structure as the following instruction: The final dataset folder should be like this. ``` root ├── data │ ├── kitti-step │ ├── coco │ ├── VIPSeg │ ├── youtube_vis_2019 │ ├── cityscapes ``` ### [VPS] KITTI-STEP Download the KITTI-STEP from the official website. Then run the scripts in scripts/kitti_step_prepare.py. You will get such format. 
You can get our pre-processed format at https://huggingface.co/LXT/VideoK-Net/tree/main ``` ├── kitti-step │ ├── video_sequence │ │ ├── train ├──00018_000331_leftImg8bit.png ├──000018_000331_panoptic.png ├──**** │ │ ├── val │ │ ├── test ``` ### [VPS] VIPSeg Download the original dataset from the official repo.\ Following the official repo, we use resized videos for training and evaluation (the short side of the input is set to 720 while the aspect ratio is kept). ``` ├── VIPSeg │ ├── images │ │ ├── 1241_qYvEuwrSiXc │ ├──*.jpg │ ├── panomasks │ │ ├── 1241_qYvEuwrSiXc │ ├──*.png │ ├── panomasksRGB ``` ### [VIS] Youtube-VIS-2019 We use pre-processed json files following the mmtracking codebase. See "tools/dataset/youtubevis2coco.py". ``` ├── youtube_vis_2019 │ ├── annotations │ │ ├── train.json │ │ ├── valid.json │ │ ├── youtube_vis_2019_train.json │ │ ├── youtube_vis_2019_valid.json │ ├── train │ │ ├──JPEGImages │ │ │ ├──video folders │ ├── valid │ │ ├──JPEGImages │ │ │ ├──video folders ``` ### [VSS] VSPW To do ### [VPS] Cityscapes For Cityscape-VPS and Cityscape-DVPS, we suggest that followers see our related work listed below. The model of Video K-Net will not be released due to the Patent ISSUE and INTERNAL USAGE. You can find our related works: ECCV-2022, PolyphonicFormer: A Unified Framework For Panoptic Segmentation + Depth Estimation (winner of ICCV-2021 BMTT workshop) (https://github.com/HarborYuan/PolyphonicFormer) ## Image DataSet For Pretraining K-Net ### COCO dataset COCO is one of the most common datasets. It contains 80 thing classes and 53 stuff classes. The dataset format is the same as the original [Detectron2](https://github.com/facebookresearch/detectron2) format, including the panoptic segmentation preparation [scripts](https://github.com/facebookresearch/detectron2/blob/master/datasets/prepare_panoptic_fpn.py). 
Then the final folder is like this: ``` ├── coco │ ├── annotations │ │ ├── panoptic_{train,val}2017.json │ │ ├── instances_{train,val}2017.json │ ├── train2017 │ ├── val2017 │ ├── panoptic_{train,val}2017/ # png annotations ``` ### Cityscapes dataset Cityscapes dataset is a high-resolution road-scene dataset which contains 19 classes (8 thing classes and 11 stuff classes). It has 2975 images for training, 500 images for validation and 1525 images for testing. Preparing the Cityscapes dataset has three steps: 1, Convert segmentation id map(origin label id maps) to trainId maps (id ranges: 0-18 for training) using the official scripts [repo](https://github.com/mcordts/cityscapesScripts) 2, Then run python dataset/prepare_cityscapes.py to generate the COCO-like annotations. These annotations can be used for Instance Segmentation training. using csCreateTrainIdLabelImgs.py and put the instancesonly_filtered_gtFine_train.json into annotations folder 3, For Panoptic Segmentation dataset, generate the json file using csCreatePanopticImgs.py or you can download our transformed .json and .png files via link: () and put the json file into annotations folder. 
Then the final folder is like this: ``` ├── cityscapes │ ├── annotations │ │ ├── instancesonly_filtered_gtFine_train.json # coco instance annotation file(COCO format) │ │ ├── instancesonly_filtered_gtFine_val.json │ │ ├── cityscapes_panoptic_train.json # panoptic json file │ │ ├── cityscapes_panoptic_val.json │ ├── leftImg8bit │ ├── gtFine │ │ ├──cityscapes_panoptic_{train,val}/ # png annotations │ │ ``` ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2022 Xiangtai Lee Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ # Video K-Net: A Simple, Strong, and Unified Baseline for Video Segmentation (CVPR-2022, oral) ## [Paper](https://arxiv.org/abs/2204.04656), [Slides](./slides/Video-KNet-cvpr-slides-10-25-version.pptx), [Poster](./slides/cvpr22_poster_lxt_zww_pjm.pdf), [Video](https://www.youtube.com/watch?v=LIEyp_czu20&t=3s) [Xiangtai Li](https://lxtgh.github.io/), [Wenwei Zhang](https://zhangwenwei.cn/), [Jiangmiao Pang](https://oceanpang.github.io/), [Kai Chen](https://chenkai.site/), [Guangliang Cheng](https://scholar.google.com/citations?user=FToOC-wAAAAJ), [Yunhai Tong](https://scholar.google.com/citations?user=T4gqdPkAAAAJ&hl=zh-CN), [Chen Change Loy](https://www.mmlab-ntu.com/person/ccloy/). We introduce Video K-Net, a simple, strong, and unified framework for fully end-to-end dense video segmentation. The method is built upon K-Net, a method of unifying image segmentation via a group of learnable kernels. This project contains the training and testing code of Video K-Net for VPS (Video Panoptic Segmentation), VSS (Video Semantic Segmentation), and VIS (Video Instance Segmentation). To the best of our knowledge, our Video K-Net is the first open-sourced method that supports three different video segmentation tasks (VIS, VPS, VSS) for Video Scene Understanding. ## News! Video K-Net is acknowledged as a strong baseline for CVPR-2023 workshop ["The 2nd Pixel-level Video Understanding in the Wild"](https://www.vspwdataset.com/Workshop%202023.html). ## News! Video K-Net also supports [VIP-Seg](https://github.com/VIPSeg-Dataset/VIPSeg-Dataset) dataset(CVPR-2022). It also achieves the new state-of-the-art result. ### Environment and DataSet Preparation Our codebase is based on MMDetection and MMSegmentation. Parts of the code are borrowed from MMtracking and UniTrack. 
- MIM >= 0.1.1 - MMCV-full >= v1.3.8 - MMDetection == v2.18.0 - timm - scipy - panopticapi See the [DATASET.md](https://github.com/lxtGH/Video-K-Net/blob/main/DATASET.md) knet folder contains the Video K-Net for VPS. knet_vis folder contains the Video K-Net for VIS. ### Pretrained CKPTs and Trained Models We provide the pretrained models for VPS and VIS. Baidu Yun Link: [here](https://pan.baidu.com/s/12dIinkAF3o60fcAoggVhjQ) Code:i034 One Drive Link: [here](https://1drv.ms/u/s!Ai4mxaXd6lVBgSCTUS0QWNim2zGx?e=uceSee) The pretrained models are provided to train the Video K-Net. The trained models are also provided for play and test. ### [VPS] KITTI-STEP 1. First pretrain K-Net on Cityscapes-STEP dataset. As shown in original STEP paper(Appendix Part) and our own EXP results, this step is very important to improve the segmentation performance. You can also use our trained model for verification. Cityscape-STEP follows the format of STEP: 17 stuff classes and 2 thing classes. ```bash # train cityscapes step panoptic segmentation models sh ./tools/slurm_train.sh $PARTITION knet_step configs/det/knet_cityscapes_step/knet_s3_r50_fpn.py $WORK_DIR --no-validate ``` 2. Then train the Video K-Net on KITTI-STEP. We have provided the pretrained models from Cityscapes of Video K-Net. For slurm users: ```bash # train Video K-Net on KITTI-step using R-50 GPUS=8 sh ./tools/slurm_train.sh $PARTITION video_knet_step configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py $WORK_DIR --no-validate --load-from /path_to_knet_step_city_r50 ``` ```bash # train Video K-Net on KITTI-step using Swin-base GPUS=16 GPUS_PER_NODE=8 sh ./tools/slurm_train.sh $PARTITION video_knet_step configs/det/video_knet_kitti_step/video_knet_s3_swinb_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py $WORK_DIR --no-validate --load-from /path_to_knet_step_city_r50 ``` Our models are trained with two V100 machines. 
For Local machine: ```bash # train Video K-Net on KITTI-step with 8 GPUs sh ./tools/dist_train.sh video_knet_step configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py 8 $WORK_DIR --no-validate ``` 3. Testing and Demo. We provide both VPQ and STQ metrics to evaluate VPS models. ```bash # test locally sh ./tools/dist_step_test.sh configs/det/knet_cityscapes_step/knet_s3_r50_fpn.py $MODEL_DIR ``` We also dump the colored images for debug. ```bash # eval STEP STQ python tools/eval_dstq_step.py result_path gt_path ``` ```bash # eval STEP VPQ python tools/eval_dvpq_step.py result_path gt_path ``` #### Toy Video K-Net As shown in the paper, we also provide toy video K-Net in knet/video/knet_quansi_dense_embed_fc_toy_exp.py. You can use the K-Net pre-trained on image-level KITTI-STEP without tracking. ### [VIS] YouTube-VIS-2019 1. First Download the pre-trained Image K-Net instance segmentation models. All the models are pretrained on COCO which is a common practice. You can also pretrain it by yourself. We also provide the config for pretraining. For slurm users: ```bash # train K-Net instance segmentation models on COCO using R-50 GPUS=8 sh ./tools/slurm_train.sh $PARTITION knet_instance configs/det/coco/knet_s3_r50_fpn_ms-3x_coco.py $WORK_DIR ``` 2. Then train the video K-Net in a clip-wise manner. ```bash # train Video K-Net VIS models using R-50 GPUS=8 sh ./tools/slurm_train.sh $PARTITION video_knet_vis configs/video_knet_vis/video_knet_vis/knet_track_r50_1x_youtubevis.py $WORK_DIR --load-from /path_to_knet_instance_coco ``` 3. To evaluate the results of Video K-Net on VIS, dump the prediction results for submission to the CodaLab server. ```bash # test Video K-Net VIS models using R-50 GPUS=8 sh tools_vis/dist_test_whole_video.sh $PARTITION video_knet_vis configs/video_knet_vis/video_knet_vis/knet_track_r50_1x_youtubevis.py $WORK_DIR --format-only ``` The result json is dumped into the root of this codebase. 
### [VPS] VIP-Seg 1. First Download the pre-trained Image K-Net panoptic segmentation models. All the models are pretrained on COCO which is a common step following VIP-Seg. You can also pretrain it by yourself. We also provide the config for pretraining. ```bash # train K-Net on COCO Panoptic Segmentation GPUS=8 sh ./tools/slurm_train.sh $PARTITION knet_coco configs/det/coco/knet_s3_r50_fpn_ms-3x_coco-panoptic.py $WORK_DIR ``` 2. Train the Video K-Net on the VIP-Seg dataset. ```bash # train Video K-Net on VIP-Seg GPUS=8 sh ./tools/slurm_train.sh $PARTITION video_knet_vis configs/det/video_knet_vipseg/video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py $WORK_DIR --load-from /path/knet_coco_pretrained_r50 ``` 3. Test the Video K-Net on VIP-Seg val dataset. ```bash # test locally on VIP-Seg sh ./tools/dist_step_test.sh configs/det/video_knet_vipseg/video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py $MODEL_DIR ``` We also dump the colored images for debug. ```bash # eval STEP STQ python tools/eval_dstq_vipseg.py result_path gt_path ``` ```bash # eval STEP VPQ python tools/eval_dvpq_vipseg.py result_path gt_path ``` ## Visualization Results ### Results on KITTI-STEP DataSet ### Results on VIP-Seg DataSet ### Results on YouTube-VIS DataSet ### Short term segmentation and tracking results on Cityscapes VPS dataset. images(left), Video K-Net(middle), Ground Truth ![Alt Text](./figs/cityscapes_vps_video_1_20220318131729.gif) ![Alt Text](./figs/cityscapes_vps_video_2_20220318132943.gif) ### Long term segmentation and tracking results on STEP dataset. ![Alt Text](./figs/step_video_1_20220318133227.gif) ![Alt Text](./figs/step_video_2_20220318133423.gif) ## Related Project and Acknowledgement ## Citing Video K-Net :pray: If you use our codebase in your research or use it for the CVPR-2023 pixel-level video workshop, please use the following BibTeX entry. 
NIPS-2021, K-Net: Unified Segmentation: Our Image baseline (https://github.com/ZwwWayne/K-Net) ECCV-2022, PolyphonicFormer: A Unified Framework For Panoptic Segmentation + Depth Estimation (winner of ICCV-2021 BMTT workshop) (https://github.com/HarborYuan/PolyphonicFormer) ```bibtex @inproceedings{li2022videoknet, title={Video k-net: A simple, strong, and unified baseline for video segmentation}, author={Li, Xiangtai and Zhang, Wenwei and Pang, Jiangmiao and Chen, Kai and Cheng, Guangliang and Tong, Yunhai and Loy, Chen Change}, booktitle={CVPR}, year={2022} } @article{zhang2021k, title={K-net: Towards unified image segmentation}, author={Zhang, Wenwei and Pang, Jiangmiao and Chen, Kai and Loy, Chen Change}, journal={NeurIPS}, year={2021} } ``` ================================================ FILE: configs/det/_base_/datasets/cityscapes_panoptic.py ================================================ # dataset settings dataset_type = 'CityscapesPanopticDataset' data_root = 'data/cityscapes/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), dict( type='Resize', img_scale=[(2048, 800), (2048, 1024)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2048, 1024), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( 
type='RepeatDataset', times=8, dataset=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/instancesonly_filtered_gtFine_train.json', panoptic_ann=data_root + 'annotations/cityscapes_panoptic_train.json' ), img_prefix=data_root + 'leftImg8bit/train/', seg_prefix=data_root + 'gtFine/train', pipeline=train_pipeline)), val=dict( type=dataset_type, ann_file=dict( ins_ann=data_root +'annotations/instancesonly_filtered_gtFine_val.json', panoptic_ann=data_root + "annotations/cityscapes_panoptic_val.json" ), img_prefix=data_root + 'leftImg8bit/val/', seg_prefix=data_root + 'gtFine/cityscapes_panoptic_val', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/instancesonly_filtered_gtFine_val.json', panoptic_ann=data_root + "annotations/cityscapes_panoptic_val.json" ), img_prefix=data_root + 'leftImg8bit/val/', seg_prefix=data_root + 'gtFine/cityscapes_panoptic_val', pipeline=test_pipeline)) evaluation = dict(metric=['panoptic']) ================================================ FILE: configs/det/_base_/datasets/cityscapes_step.py ================================================ dataset_type = 'CityscapesSTEP' data_root = 'data/cityscapes' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True ) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotationsInstanceMasks', cherry=[11, 13]), dict(type='KNetInsAdapterCherryPick', stuff_nums=11, cherry=[11, 13]), dict(type='Resize', img_scale=(1024, 2048), ratio_range=[0.5, 2.0], keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='RandomCrop', crop_size=(1024, 2048)), dict(type='Normalize', **img_norm_cfg), dict(type='PadFutureMMDet', size_divisor=32, pad_val=dict(img=0, masks=0, seg=255)), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_masks', 'gt_labels', 'gt_semantic_seg'], meta_keys=('ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 
'flip_direction', 'img_norm_cfg') ), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', scale_factor=[1.0], flip=False, transforms=[ dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img'], meta_keys=[ 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg' ]), ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=2, train=dict( type='RepeatDataset', times=8, dataset=dict( type=dataset_type, data_root=data_root, split='train', test_mode=False, pipeline=train_pipeline )), val=dict( type=dataset_type, data_root=data_root, split='val', test_mode=True, pipeline=test_pipeline ), test=dict( type=dataset_type, data_root=data_root, split='val', test_mode=True, pipeline=test_pipeline ) ) evaluation = dict() ================================================ FILE: configs/det/_base_/datasets/cityscapes_vps_clips.py ================================================ dataset_type = 'CityscapesVPSDataset' data_root = 'data/cityscapes_vps/' dataset_type_test = "CityscapesPanopticDataset" img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadMultiImagesFromFile'), dict(type='SeqLoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), dict(type='SeqResize', img_scale=[(512, 1024), (2048, 4096)], multiscale_mode='range', keep_ratio=True), dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), dict(type='SeqRandomCrop', crop_size=(1024, 1024), share_params=True), dict(type='SeqNormalize', **img_norm_cfg), dict(type='SeqPad', size_divisor=32), dict( type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', "gt_instance_ids"]), dict(type='ConcatVideoReferences'), dict(type='SeqDefaultFormatBundle', ref_prefix='ref'), ] test_pipeline = [ dict(type='LoadRefImageFromFile'), dict( 
type='MultiScaleFlipAug', img_scale=[(2048, 1024)], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img', 'ref_img']), dict(type='Collect', keys=['img', 'ref_img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type='RepeatDataset', times=8, dataset=dict( type=dataset_type, ann_file=dict(ins_ann=data_root +'instances_train_city_vps_rle.json', panoptic_ann=data_root + 'panoptic_im_train_city_vps.json' ), img_prefix=data_root + 'train/img/', seg_prefix=data_root + 'train/labelmap/', pipeline=train_pipeline, offsets=[-1,+1])), val=dict( type=dataset_type_test, ann_file=dict(ins_ann=data_root + 'instances_val_city_vps_rle.json', panoptic_ann=data_root + 'panoptic_gt_val_city_vps.json', vps=True ), img_prefix=data_root + 'val/img/', seg_prefix=data_root + 'val/panoptic_video/', pipeline=test_pipeline), test=dict( type=dataset_type_test, ann_file=dict(ins_ann=data_root + 'instances_val_city_vps_rle.json', panoptic_ann=data_root + 'panoptic_gt_val_city_vps.json', vps=True ), img_prefix=data_root + 'val/img_all/', # img for validation ref_prefix=data_root + 'val/img_all/', # ref_images nframes_span_test=30, pipeline=test_pipeline)) evaluation = dict(metric=['panoptic']) ================================================ FILE: configs/det/_base_/datasets/cityscapes_vps_clips_trainval.py ================================================ dataset_type = 'CityscapesVPSDataset' data_root = 'data/cityscapes_vps/' dataset_type_test = "CityscapesPanopticDataset" img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadMultiImagesFromFile'), dict(type='SeqLoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), dict(type='SeqResize', img_scale=[(512, 1024), (2048, 4096)], multiscale_mode='range', keep_ratio=True), 
dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), dict(type='SeqRandomCrop', crop_size=(1024, 2048), share_params=True), dict(type='SeqNormalize', **img_norm_cfg), dict(type='SeqPad', size_divisor=32), dict( type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', "gt_instance_ids"]), dict(type='ConcatVideoReferences'), dict(type='SeqDefaultFormatBundle', ref_prefix='ref'), ] test_pipeline = [ dict(type='LoadRefImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=[(2048, 1024)], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img', 'ref_img']), dict(type='Collect', keys=['img', 'ref_img']), ]) ] data = dict( samples_per_gpu=1, workers_per_gpu=2, train=dict( type='RepeatDataset', times=8, dataset=dict( type='ConcatDataset', separate_eval=False, datasets=[ dict( type=dataset_type, ann_file=dict(ins_ann=data_root +'instances_train_city_vps_rle.json', panoptic_ann=data_root + 'panoptic_im_train_city_vps.json' ), img_prefix=data_root + 'train/img/', seg_prefix=data_root + 'train/labelmap/', pipeline=train_pipeline, offsets=[-1,+1] ), dict( type=dataset_type, ann_file=dict(ins_ann=data_root +'instances_val_city_vps_rle.json', panoptic_ann=data_root + 'panoptic_gt_val_city_vps.json' ), img_prefix=data_root + 'val/img/', seg_prefix=data_root + 'val/labelmap/', pipeline=train_pipeline, offsets=[-1,+1]), ], ) ), val=dict( type=dataset_type, ann_file=dict(ins_ann=data_root + 'instances_val_city_vps_rle.json', panoptic_ann=data_root + 'panoptic_gt_val_city_vps.json', vps=True ), img_prefix=data_root + 'val/img_all/', # img for validation ref_prefix=data_root + 'val/img_all/', # ref_images nframes_span_test=30, pipeline=test_pipeline) ) evaluation = dict(metric=['panoptic']) ================================================ FILE: configs/det/_base_/datasets/coco_instance.py 
================================================ dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline)) # we do not evaluate bbox because K-Net does not predict bounding boxes evaluation = dict(metric=['segm']) ================================================ FILE: configs/det/_base_/datasets/coco_panoptic.py ================================================ dataset_type = 'CocoPanopticDatasetCustom' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict( type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), 
# (continued) remaining `train_pipeline` steps of coco_panoptic.py: resize,
# flip, normalize, pad, bundle, and collect (including `gt_semantic_seg`).
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
]
# Test pipeline: a single (1333, 800) scale with `flip=False`.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip', flip_ratio=0.5),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
# Each split pairs an instance annotation file (`ins_ann`) with a panoptic
# one (`panoptic_ann`); `val` and `test` use the same val2017 files.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_train2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_train2017.json'),
        img_prefix=data_root + 'train2017/',
        seg_prefix=data_root + 'panoptic_stuff_train2017/',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_val2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_val2017.json'),
        seg_prefix=data_root + 'panoptic_val2017/',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_val2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_val2017.json'),
        seg_prefix=data_root + 'panoptic_val2017/',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline))
# Both mask ('segm') and panoptic metrics are requested.
evaluation = dict(metric=['segm', 'panoptic'])

================================================
FILE: configs/det/_base_/datasets/coco_panoptic_instance_annotations.py
================================================
# NOTE(review): the settings in this file appear identical to
# coco_panoptic.py — confirm whether both files are needed.
dataset_type = 'CocoPanopticDatasetCustom'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
# (continued) arguments of the `LoadAnnotations` step and the remaining
# `train_pipeline` steps of coco_panoptic_instance_annotations.py.
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
]
# Test pipeline: a single (1333, 800) scale with `flip=False`.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip', flip_ratio=0.5),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
# Each split pairs an instance annotation file with a panoptic one;
# `val` and `test` use the same val2017 files.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_train2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_train2017.json'),
        img_prefix=data_root + 'train2017/',
        seg_prefix=data_root + 'panoptic_stuff_train2017/',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_val2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_val2017.json'),
        seg_prefix=data_root + 'panoptic_val2017/',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_val2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_val2017.json'),
        seg_prefix=data_root + 'panoptic_val2017/',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline))
evaluation = dict(metric=['segm', 'panoptic'])

================================================
FILE: configs/det/_base_/datasets/kitti_step_dvps.py
================================================
# KITTI-STEP settings with depth enabled (`with_depth=True` throughout).
dataset_type = 'KITTISTEPDVPSDataset'
data_root = 'data/kitti-step'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False)
# The kitti dataset contains 1226 x 370 and 1241 x 376
# Training pipeline over multi-image samples; the resize and random-crop
# steps are commented out, so only flip, normalize and pad are applied.
train_pipeline = [
    dict(type='LoadMultiImagesDirect'),
    dict(type='LoadMultiAnnotationsDirect', with_depth=True, divisor=-1, cherry_pick=True, cherry=[11, 13]),
    # dict(type='SeqResizeWithDepth', img_scale=(370, 1226), ratio_range=[1.0, 2.0], keep_ratio=True),
    dict(type='SeqFlipWithDepth', flip_ratio=0.5),
    # dict(type='SeqRandomCropWithDepth', crop_size=(352, 1024), share_params=True),
    dict(type='SeqNormalizeWithDepth', **img_norm_cfg),
    dict(type='SeqPadWithDepth', size_divisor=32),
    dict(
        type='VideoCollect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_depth', 'gt_instance_ids', ]),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
]
# Test pipeline: original scale (`scale_factor=[1.0]`) with `flip=False`.
test_pipeline = [
    dict(type='LoadImgDirect'),
    dict(
        type='MultiScaleFlipAug',
        scale_factor=[1.0],
        flip=False,
        transforms=[
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'], meta_keys=[
                'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip',
                'flip_direction', 'img_norm_cfg', 'ori_filename'
            ]),
        ])
]
# The training split is wrapped in `RepeatDataset` with `times=4`;
# `val` and `test` both read the 'val' split in test mode.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=4,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            split='train',
            ref_seq_index=None,
            test_mode=False,
            pipeline=train_pipeline,
            with_depth=True,
        )),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=True,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=True,
    )
)

# No evaluation options are set in this base file.
evaluation = dict()

================================================
FILE: configs/det/_base_/datasets/kitti_step_vps.py
================================================
# KITTI-STEP settings without depth (`with_depth=False` throughout).
dataset_type = 'KITTISTEPDVPSDataset'
data_root = 'data/kitti-step'
img_norm_cfg = dict(
    mean=[123.675,
# (continued) remaining `img_norm_cfg` values of kitti_step_vps.py.
          116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False)
# The kitti dataset contains 1226 x 370 and 1241 x 376
# 384 x 1248 is the minimum size that is 32-divisible
# Training pipeline: resize with a ratio range of [0.5, 2.0], flip, crop to
# (384, 1248) with parameters shared across the sequence, normalize and pad.
train_pipeline = [
    dict(type='LoadMultiImagesDirect'),
    dict(type='LoadMultiAnnotationsDirect', with_depth=False, divisor=-1, cherry_pick=True, cherry=[11, 13]),
    dict(type='SeqResizeWithDepth', img_scale=(384, 1248), ratio_range=[0.5, 2.0], keep_ratio=True),
    dict(type='SeqFlipWithDepth', flip_ratio=0.5),
    dict(type='SeqRandomCropWithDepth', crop_size=(384, 1248), share_params=True),
    dict(type='SeqNormalizeWithDepth', **img_norm_cfg),
    dict(type='SeqPadWithDepth', size_divisor=32),
    dict(
        type='VideoCollect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_instance_ids']),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
]
# Test pipeline: original scale with `flip=False`; `filename` is collected
# in the meta keys in addition to `ori_filename`.
test_pipeline = [
    dict(type='LoadImgDirect'),
    dict(
        type='MultiScaleFlipAug',
        scale_factor=[1.0],
        flip=False,
        transforms=[
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'], meta_keys=[
                'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip',
                'flip_direction', 'img_norm_cfg', 'ori_filename', "filename"
            ]),
        ])
]
# The training split is wrapped in `RepeatDataset` with `times=4`;
# `val` and `test` both read the 'val' split in test mode.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=4,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            split='train',
            ref_seq_index=None,
            test_mode=False,
            pipeline=train_pipeline,
            with_depth=False,
        )),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=False,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=False,
    )
)

# No evaluation options are set in this base file.
evaluation = dict()

================================================
FILE: configs/det/_base_/datasets/kitti_step_vps_trainval.py
================================================
# KITTI-STEP settings without depth whose training set concatenates the
# 'train' and 'val' splits.
dataset_type = 'KITTISTEPDVPSDataset'
data_root = 'data/kitti-step'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False)
# The kitti dataset contains 1226 x 370 and 1241 x 376
# 384 x 1248 is the minimum size that is 32-divisible
train_pipeline = [
    dict(type='LoadMultiImagesDirect'),
    dict(type='LoadMultiAnnotationsDirect', with_depth=False, divisor=-1, cherry_pick=True, cherry=[11, 13]),
    dict(type='SeqResizeWithDepth', img_scale=(384, 1248), ratio_range=[0.5, 2.0], keep_ratio=True),
    dict(type='SeqFlipWithDepth', flip_ratio=0.5),
    dict(type='SeqRandomCropWithDepth', crop_size=(384, 1248), share_params=True),
    dict(type='SeqNormalizeWithDepth', **img_norm_cfg),
    dict(type='SeqPadWithDepth', size_divisor=32),
    dict(
        type='VideoCollect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_instance_ids']),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
]
# Test pipeline: original scale with `flip=False`.
test_pipeline = [
    dict(type='LoadImgDirect'),
    dict(
        type='MultiScaleFlipAug',
        scale_factor=[1.0],
        flip=False,
        transforms=[
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'], meta_keys=[
                'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip',
                'flip_direction', 'img_norm_cfg', 'ori_filename', "filename"
            ]),
        ])
]
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    # Training data: `RepeatDataset` (times=4) around a `ConcatDataset` of
    # the 'train' and 'val' splits, both built with the training pipeline.
    train=dict(
        type='RepeatDataset',
        times=4,
        dataset=dict(
            type='ConcatDataset',
            separate_eval=False,
            datasets=[
                dict(
                    type=dataset_type,
                    data_root=data_root,
                    split='train',
                    ref_seq_index=None,
                    test_mode=False,
                    pipeline=train_pipeline,
                    with_depth=False,
                ),
                dict(
                    type=dataset_type,
                    data_root=data_root,
                    split='val',
                    ref_seq_index=None,
                    test_mode=False,
                    pipeline=train_pipeline,
                    with_depth=False,
                )
            ]
        ),
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
# (continued) remaining `val` arguments and the `test` split of
# kitti_step_vps_trainval.py; both evaluate the 'val' split.
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=False,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=False,
    )
)

# No evaluation options are set in this base file.
evaluation = dict()

================================================
FILE: configs/det/_base_/datasets/mapillary_panoptic.py
================================================
# Mapillary panoptic dataset settings.
dataset_type = 'MapillaryPanopticDataset'
data_root = 'data/mapillary/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# Training pipeline: resize with `multiscale_mode='range'` between the two
# listed scales, crop to (1024, 1024), flip, normalize, pad, bundle, collect.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True),
    dict(type='Resize',
         img_scale=[(1024, 4096), (2048, 4096)],
         multiscale_mode='range',
         keep_ratio=True),
    dict(type='RandomCrop', crop_size=(1024, 1024)),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
]
# Test pipeline: a single (2048, 4096) scale with `flip=False`.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 4096),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip', flip_ratio=0.5),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
# Each split pairs an instance annotation file (`ins_ann`) with a panoptic
# one (`panoptic_ann`).
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/coco/training.json',
            panoptic_ann=data_root + 'annotations/panoptic_train.json'
        ),
        img_prefix=data_root + 'training/images',
        seg_prefix=data_root + 'training/panoptic_stuff_train',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/coco/validation.json',
            panoptic_ann=data_root + 'annotations/panoptic_val.json'),
        seg_prefix=data_root + 'validation/panoptic',
img_prefix=data_root + 'validation/images', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/coco/validation.json', panoptic_ann=data_root + 'annotations/panoptic_val.json'), seg_prefix=data_root + 'validation/panoptic', img_prefix=data_root + 'validation/images', pipeline=test_pipeline)) evaluation = dict(metric=['segm', 'panoptic']) ================================================ FILE: configs/det/_base_/datasets/vipseg_dvps.py ================================================ dataset_type = 'VIPSegDVPSDataset' data_root = 'data/VIPSeg' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False) crop_size = (736, 736) train_pipeline = [ dict(type='LoadMultiImagesDirect'), dict(type='LoadMultiAnnotationsDirect', with_depth=False, vipseg=True), dict(type='SeqResizeWithDepth', img_scale=(720, 100000), ratio_range=[1., 2.], keep_ratio=True), dict(type='SeqFlipWithDepth', flip_ratio=0.5), dict(type='SeqRandomCropWithDepth', crop_size=(736, 736), share_params=True), dict(type='SeqPadWithDepth', size_divisor=32), dict(type='SeqNormalize', **img_norm_cfg), dict( type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_instance_ids']), dict(type='ConcatVideoReferences'), dict(type='SeqDefaultFormatBundle', ref_prefix='ref'), ] test_pipeline = [ dict(type='LoadImgDirect'), dict( type='MultiScaleFlipAug', scale_factor=[1.0], flip=False, transforms=[ dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img', 'img_id', 'seq_id'], meta_keys=[ 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg', 'ori_filename', "filename" ]), ]) ] data = dict( samples_per_gpu=1, workers_per_gpu=2, train=dict( type='RepeatDataset', times=1, dataset=dict( type=dataset_type, data_root=data_root, test_mode=False, 
# (continued) remaining arguments of the vipseg_dvps.py training dataset,
# followed by its `val` and `test` splits (both read the 'val' split).
            split='train',
            ref_seq_index=[-2, -1, 1, 2],
            is_instance_only=True,
            pipeline=train_pipeline,
        )),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
    )
)

# No evaluation options are set in this base file.
evaluation = dict()

================================================
FILE: configs/det/_base_/default_runtime.py
================================================
# Runtime defaults: checkpoint interval, text logging every 50 iterations,
# NCCL distributed backend, no checkpoint to load or resume, train-only
# workflow.
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
    ])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]

================================================
FILE: configs/det/_base_/models/knet_citystep_s3_r50_fpn.py
================================================
# `num_stages` sizes `stage_loss_weights`, the `mask_head` list and
# `train_cfg.rcnn`; `num_proposals` is reused as `test_cfg.rcnn.max_per_img`;
# `conv_kernel_size` is shared by the kernel head and every update head.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1
# NOTE(review): this Cityscapes-STEP model sets `cityscapes=False` and
# `kitti_step=True`, the same flags as knet_kitti_step_s3_r50_fpn.py —
# confirm that is intended.
model = dict(
    type='KNet',
    cityscapes=False,
    kitti_step=True,
    num_thing_classes=2,
    num_stuff_classes=17,
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4
    ),
    rpn_head=dict(
        type='ConvKernelHead',
        num_classes=19,
        num_thing_classes=2,
        num_stuff_classes=17,
        cat_stuff_mask=True,
        conv_kernel_size=conv_kernel_size,
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding',
                num_feats=128,
# (continued) knet_citystep_s3_r50_fpn.py: rest of the kernel head
# (`rpn_head`), the iterative `roi_head`, and the start of `train_cfg`.
                normalize=True),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        feat_transform_cfg=None,
        loss_rank=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1),
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0)),
    roi_head=dict(
        type='KernelIterHead',
        num_thing_classes=2,
        num_stuff_classes=17,
        do_panoptic=True,
        num_stages=num_stages,
        stage_loss_weights=[1] * num_stages,
        proposal_feature_channel=256,
        # One identical `KernelUpdateHead` config per stage.
        mask_head=[
            dict(
                type='KernelUpdateHead',
                num_classes=19,
                num_thing_classes=2,
                num_stuff_classes=17,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None
                ),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_rank=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1
                ),
                loss_mask=dict(
                    type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0
                ),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0
                ),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0)) for _ in range(num_stages)
        ]
    ),
    # training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaskHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
# (continued) knet_citystep_s3_r50_fpn.py: rest of `train_cfg`, then
# `test_cfg` and the modules the config asks to be imported.
                mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1),
        # One assigner/sampler config per stage.
        rcnn=[
            dict(
                assigner=dict(
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='FocalLossCost', weight=2.0),
                    dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                    mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
                sampler=dict(type='MaskPseudoSampler'),
                pos_weight=1) for _ in range(num_stages)
        ]),
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=num_proposals,
            mask_thr=0.5,
            stuff_score_thr=0.05,
            merge_stuff_thing=dict(
                iou_thr=0.5,
                stuff_max_area=4096,
                instance_score_thr=0.25
            )
        )
    )
)

# Modules to import when the config is loaded; `allow_failed_imports=False`
# is passed so failures are not tolerated.
custom_imports = dict(
    imports=[
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'knet.kernel_updator',
        'knet.cross_entropy_loss',
        'swin.swin_transformer',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        'external.cityscapes_step',
        'external.dataset.pipelines.transforms',
        'external.dataset.pipelines.loading',
    ],
    allow_failed_imports=False
)

================================================
FILE: configs/det/_base_/models/knet_kitti_step_s3_r50_fpn.py
================================================
# `num_stages` sizes `stage_loss_weights`, the `mask_head` list and
# `train_cfg.rcnn`; `num_proposals` is reused as `test_cfg.rcnn.max_per_img`;
# `conv_kernel_size` is shared by the kernel head and every update head.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1
model = dict(
    type='KNet',
    cityscapes=False,
    kitti_step=True,
    num_thing_classes=2,
    num_stuff_classes=17,
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4
    ),
    rpn_head=dict(
        type='ConvKernelHead',
        num_classes=19,
        num_thing_classes=2,
        num_stuff_classes=17,
        cat_stuff_mask=True,
        conv_kernel_size=conv_kernel_size,
# (continued) knet_kitti_step_s3_r50_fpn.py: rest of the kernel head
# (`rpn_head`) and most of the iterative `roi_head`.
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding',
                num_feats=128,
                normalize=True),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        feat_transform_cfg=None,
        loss_rank=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1),
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0)),
    roi_head=dict(
        type='KernelIterHead',
        num_thing_classes=2,
        num_stuff_classes=17,
        do_panoptic=True,
        num_stages=num_stages,
        stage_loss_weights=[1] * num_stages,
        proposal_feature_channel=256,
        # One identical `KernelUpdateHead` config per stage.
        mask_head=[
            dict(
                type='KernelUpdateHead',
                num_classes=19,
                num_thing_classes=2,
                num_stuff_classes=17,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None
                ),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_rank=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1
                ),
                loss_mask=dict(
                    type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0
                ),
                loss_dice=dict(
                    type='DiceLoss',
# (continued) knet_kitti_step_s3_r50_fpn.py: end of the per-stage head
# config, then `train_cfg`, `test_cfg` and the start of `custom_imports`.
                    loss_weight=4.0
                ),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0)) for _ in range(num_stages)
        ]
    ),
    # training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaskHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1),
        # One assigner/sampler config per stage.
        rcnn=[
            dict(
                assigner=dict(
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='FocalLossCost', weight=2.0),
                    dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                    mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
                sampler=dict(type='MaskPseudoSampler'),
                pos_weight=1) for _ in range(num_stages)
        ]),
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=num_proposals,
            mask_thr=0.5,
            stuff_score_thr=0.05,
            merge_stuff_thing=dict(
                overlap_thr=0.6,
                iou_thr=0.5,
                stuff_max_area=4096,
                instance_score_thr=0.25
            )
        )
    )
)

# Modules to import when the config is loaded; entries prefixed with `#`
# are disabled.
custom_imports = dict(
    imports=[
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'knet.kernel_updator',
        'knet.cross_entropy_loss',
        'swin.swin_transformer',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        'external.cityscapes_step',
        'external.kitti_step_dvps',
        'external.dataset.dvps_pipelines.transforms',
        'external.dataset.dvps_pipelines.loading',
        'external.dataset.dvps_pipelines.tricks',
        'external.dataset.pipelines.formatting',
        # 'knet.video.knet_track',
        # 'knet.video.knet_track_head',
        'knet.video.track_heads',
        'knet.video.kernel_head',
        'knet.video.kernel_iter_head',
        'knet.video.kernel_update_head',
        'knet.video.knet_uni_track',
        'knet.video.knet_quansi_dense',
        # 'knet.video.knet_quansi_dense_roi',
        'knet.video.knet_quansi_dense_roi_gt_box',
        'knet.video.knet_quansi_dense_embed_fc',
'knet.video.knet_quansi_dense_embed_fc_joint_train', # 'knet.video.knet_quansi_dense_embed_fc_with_appearance', 'knet.video.knet_quansi_dense_roi_gt_box_joint_train', # 'knet.video.knet_quansi_dense_embed_fc_toy_exp', 'knet.video.qdtrack.losses.l2_loss', 'knet.video.qdtrack.losses.multipos_cross_entropy_loss', 'knet.video.qdtrack.trackers.quasi_dense_embed_tracker', ], allow_failed_imports=False ) ================================================ FILE: configs/det/_base_/models/knet_s3_r50_deformable_fpn.py ================================================ num_stages = 3 num_proposals = 100 conv_kernel_size = 1 model = dict( type='KNet', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch', init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), neck=dict( type='MSDeformAttnPixelDecoder', num_outs=3, norm_cfg=dict(type='GN', num_groups=32), act_cfg=dict(type='ReLU'), return_one_list=True, encoder=dict( type='DetrTransformerEncoder', num_layers=6, transformerlayers=dict( type='BaseTransformerLayer', attn_cfgs=dict( type='MultiScaleDeformableAttention', embed_dims=256, num_heads=8, num_levels=3, num_points=4, im2col_step=64, dropout=0.0, batch_first=False, norm_cfg=None, init_cfg=None), ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0.0, act_cfg=dict(type='ReLU', inplace=True)), operation_order=('self_attn', 'norm', 'ffn', 'norm')), init_cfg=None), positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), init_cfg=None), rpn_head=dict( type='ConvKernelHead', conv_kernel_size=conv_kernel_size, feat_downsample_stride=2, feat_refine_stride=1, feat_refine=False, use_binary=True, num_loc_convs=1, num_seg_convs=1, conv_normal_init=True, localization_fpn=dict( type='SemanticFPNWrapper', in_channels=256, feat_channels=256, out_channels=256, start_level=0, end_level=3, 
# (continued) knet_s3_r50_deformable_fpn.py: rest of the kernel head
# (`rpn_head`), the iterative `roi_head`, and the start of `train_cfg`.
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=128, normalize=True),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        num_classes=80,
        feat_transform_cfg=None,
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0)),
    roi_head=dict(
        type='KernelIterHead',
        num_stages=num_stages,
        stage_loss_weights=[1] * num_stages,
        proposal_feature_channel=256,
        # One identical `KernelUpdateHead` config per stage.
        mask_head=[
            dict(
                type='KernelUpdateHead',
                num_classes=80,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_mask=dict(
                    type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0)) for _ in range(num_stages)
        ]),
    # training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaskHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1),
        # One assigner/sampler config per stage.
        rcnn=[
            dict(
                assigner=dict(
# (continued) knet_s3_r50_deformable_fpn.py: per-stage assigner settings,
# then `test_cfg` and `custom_imports`.
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='FocalLossCost', weight=2.0),
                    dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                    mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
                sampler=dict(type='MaskPseudoSampler'),
                pos_weight=1) for _ in range(num_stages)
        ]),
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=num_proposals,
            mask_thr=0.5,
            merge_stuff_thing=dict(
                iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3))))

# Modules to import when the config is loaded; `allow_failed_imports=False`
# is passed so failures are not tolerated.
custom_imports = dict(
    imports=[
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.kernel_updator',
        'knet.det.msdeformattn_decoder',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'external.coco_panoptic',
        'swin.swin_transformer'
    ],
    allow_failed_imports=False)

================================================
FILE: configs/det/_base_/models/knet_s3_r50_fpn.py
================================================
# K-Net with a ResNet-50 backbone and an FPN neck, 80 classes.
# `num_stages` sizes `stage_loss_weights`, the `mask_head` list and
# `train_cfg.rcnn`; `num_proposals` is reused as `test_cfg.rcnn.max_per_img`.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1
model = dict(
    type='KNet',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4),
    rpn_head=dict(
        type='ConvKernelHead',
        conv_kernel_size=conv_kernel_size,
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=128, normalize=True),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
# (continued) knet_s3_r50_fpn.py: rest of the kernel head (`rpn_head`), the
# iterative `roi_head`, and most of `train_cfg`.
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        num_classes=80,
        feat_transform_cfg=None,
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0)),
    roi_head=dict(
        type='KernelIterHead',
        num_stages=num_stages,
        stage_loss_weights=[1] * num_stages,
        proposal_feature_channel=256,
        # One identical `KernelUpdateHead` config per stage.
        mask_head=[
            dict(
                type='KernelUpdateHead',
                num_classes=80,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_mask=dict(
                    type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0)) for _ in range(num_stages)
        ]),
    # training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaskHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1),
        # One assigner/sampler config per stage.
        rcnn=[
            dict(
                assigner=dict(
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='FocalLossCost', weight=2.0),
                    dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                    mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
# (continued) knet_s3_r50_fpn.py: end of the per-stage assigner settings,
# then `test_cfg` and `custom_imports`.
                sampler=dict(type='MaskPseudoSampler'),
                pos_weight=1) for _ in range(num_stages)
        ]),
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=num_proposals,
            mask_thr=0.5,
            merge_stuff_thing=dict(
                iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3))))

# Modules to import when the config is loaded; `allow_failed_imports=False`
# is passed so failures are not tolerated.
custom_imports = dict(
    imports=[
        'knet.det.knet',
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.kernel_updator',
        'knet.det.msdeformattn_decoder',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        # NOTE(review): knet_s3_r50_deformable_fpn.py imports
        # 'external.coco_panoptic' for the same dataset module — verify that
        # a `panoptic_fpn` package exists, otherwise this entry cannot be
        # imported while `allow_failed_imports=False`.
        'panoptic_fpn.coco_panoptic',
    ],
    allow_failed_imports=False)

================================================
FILE: configs/det/_base_/models/knet_s3_r50_fpn_panoptic.py
================================================
# Panoptic variant of the ResNet-50 + FPN K-Net model (133 classes).
num_stages = 3
num_proposals = 100
conv_kernel_size = 1
model = dict(
    type='KNet',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4),
    rpn_head=dict(
        type='ConvKernelHead',
        num_classes=133,  # modified for panoptic
        cat_stuff_mask=True,  # modified for panoptic
        conv_kernel_size=conv_kernel_size,
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=128, normalize=True),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
xavier_init_kernel=False, kernel_init_std=1, num_cls_fcs=1, in_channels=256, feat_transform_cfg=None, loss_rank=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1), loss_seg=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict(type='DiceLoss', loss_weight=4.0)), roi_head=dict( type='KernelIterHead', do_panoptic=True, num_stages=num_stages, stage_loss_weights=[1] * num_stages, proposal_feature_channel=256, mask_head=[ dict( type='KernelUpdateHead', num_classes=133, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=2, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_rank=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0)) for _ in range(num_stages) ]), # training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), sampler=dict(type='MaskPseudoSampler'), pos_weight=1), rcnn=[ dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, 
pred_act=True)), sampler=dict(type='MaskPseudoSampler'), pos_weight=1) for _ in range(num_stages) ]), test_cfg=dict( rpn=None, rcnn=dict( max_per_img=num_proposals, mask_thr=0.5, stuff_score_thr=0.05, merge_stuff_thing=dict( overlap_thr=0.6, iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3)))) custom_imports = dict( imports=[ 'knet.det.kernel_head', 'knet.det.kernel_iter_head', 'knet.det.kernel_update_head', 'knet.det.semantic_fpn_wrapper', 'knet.det.dice_loss', 'knet.kernel_updator', 'knet.cross_entropy_loss', 'knet.det.mask_hungarian_assigner', 'knet.det.mask_pseudo_sampler', 'swin.swin_transformer', 'external.mot_step', 'swin.mix_transformer', 'swin.DetectRS', 'swin.swin_transformer_rfp', 'external.coco_panoptic', 'external.mapillary_panoptic', 'external.cityscape_panoptic', 'external.kitti_step_dvps', 'external.mot_step', 'external.dataset.dvps_pipelines.transforms', 'external.dataset.dvps_pipelines.loading', 'external.dataset.dvps_pipelines.tricks', 'external.dataset.pipelines.formatting', ], allow_failed_imports=False) ================================================ FILE: configs/det/_base_/models/knet_vipseg_s3_r50_fpn.py ================================================ num_stages = 3 num_proposals = 100 conv_kernel_size = 1 num_thing_classes = 58 num_stuff_classes = 66 num_classes = num_stuff_classes + num_thing_classes model = dict( type='KNet', cityscapes=False, kitti_step=True, num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch', init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=0, add_extra_convs='on_input', num_outs=4 ), rpn_head=dict( type='ConvKernelHead', num_classes=num_classes, num_thing_classes=num_thing_classes, 
num_stuff_classes=num_stuff_classes, cat_stuff_mask=True, conv_kernel_size=conv_kernel_size, feat_downsample_stride=2, feat_refine_stride=1, feat_refine=False, use_binary=True, num_loc_convs=1, num_seg_convs=1, conv_normal_init=True, localization_fpn=dict( type='SemanticFPNWrapper', in_channels=256, feat_channels=256, out_channels=256, start_level=0, end_level=3, upsample_times=2, positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), cat_coors=False, cat_coors_level=3, fuse_by_cat=False, return_list=False, num_aux_convs=1, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)), num_proposals=num_proposals, proposal_feats_with_obj=True, xavier_init_kernel=False, kernel_init_std=1, num_cls_fcs=1, in_channels=256, feat_transform_cfg=None, loss_rank=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1), loss_seg=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict(type='DiceLoss', loss_weight=4.0)), roi_head=dict( type='KernelIterHead', num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, do_panoptic=True, num_stages=num_stages, stage_loss_weights=[1] * num_stages, proposal_feature_channel=256, mask_head=[ dict( type='KernelUpdateHead', num_classes=num_classes, num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=2, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None ), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_rank=dict( 
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1 ), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0 ), loss_dice=dict( type='DiceLoss', loss_weight=4.0 ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0)) for _ in range(num_stages) ] ), # training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), sampler=dict(type='MaskPseudoSampler'), pos_weight=1), rcnn=[ dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), sampler=dict(type='MaskPseudoSampler'), pos_weight=1) for _ in range(num_stages) ]), test_cfg=dict( rpn=None, rcnn=dict( max_per_img=num_proposals, mask_thr=0.5, stuff_score_thr=0.05, merge_stuff_thing=dict( overlap_thr=0.6, iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.25 ) ) ) ) custom_imports = dict( imports=[ 'knet.det.knet', 'knet.det.kernel_head', 'knet.det.kernel_iter_head', 'knet.det.kernel_update_head', 'knet.det.semantic_fpn_wrapper', 'knet.det.dice_loss', 'knet.det.mask_hungarian_assigner', 'knet.det.mask_pseudo_sampler', 'knet.kernel_updator', 'knet.cross_entropy_loss', 'swin.swin_transformer', 'swin.mix_transformer', 'swin.DetectRS', 'swin.swin_transformer_rfp', 'external.cityscapes_step', 'external.kitti_step_dvps', 'external.vipseg_dvps', 'external.dataset.dvps_pipelines.transforms', 'external.dataset.dvps_pipelines.loading', 'external.dataset.dvps_pipelines.tricks', 'external.dataset.pipelines.formatting', 'external.dataset.pipelines.transforms', 'knet.video.knet', 'knet.video.knet_quansi_dense', 'knet.video.knet_quansi_dense_roi_gt_box', # 'knet.video.knet_track', # 
'knet.video.knet_track_head', 'knet.video.track_heads', 'knet.video.kernel_head', 'knet.video.kernel_iter_head', 'knet.video.kernel_update_head', 'knet.video.knet_uni_track', 'knet.video.knet_quansi_dense', 'knet.video.knet_quansi_dense_roi_gt_box', 'knet.video.knet_quansi_dense_embed_fc', 'knet.video.knet_quansi_dense_embed_fc_joint_train', 'knet.video.qdtrack.losses.l2_loss', 'knet.video.qdtrack.losses.multipos_cross_entropy_loss', 'knet.video.qdtrack.trackers.quasi_dense_embed_tracker', ], allow_failed_imports=False ) ================================================ FILE: configs/det/_base_/models/video_knet_s3_r50_fpn_panoptic.py ================================================ num_stages = 3 num_proposals = 100 conv_kernel_size = 1 model = dict( type='VideoKNet', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch', init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=0, add_extra_convs='on_input', num_outs=4), rpn_head=dict( type='VideoConvKernelHead', num_classes=133, # modified for panoptic cat_stuff_mask=True, # modified for panoptic conv_kernel_size=conv_kernel_size, feat_downsample_stride=2, feat_refine_stride=1, feat_refine=False, use_binary=True, num_loc_convs=1, num_seg_convs=1, conv_normal_init=True, localization_fpn=dict( type='SemanticFPNWrapper', in_channels=256, feat_channels=256, out_channels=256, start_level=0, end_level=3, upsample_times=2, positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), cat_coors=False, cat_coors_level=3, fuse_by_cat=False, return_list=False, num_aux_convs=1, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)), num_proposals=num_proposals, proposal_feats_with_obj=True, xavier_init_kernel=False, kernel_init_std=1, num_cls_fcs=1, in_channels=256, 
feat_transform_cfg=None, loss_rank=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1), loss_seg=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict(type='DiceLoss', loss_weight=4.0)), roi_head=dict( type='VideoKernelIterHead', do_panoptic=True, num_stages=num_stages, stage_loss_weights=[1] * num_stages, proposal_feature_channel=256, mask_head=[ dict( type='VideoKernelUpdateHead', num_classes=133, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=2, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_rank=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0)) for _ in range(num_stages) ]), # training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), sampler=dict(type='MaskPseudoSampler'), pos_weight=1), rcnn=[ dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), sampler=dict(type='MaskPseudoSampler'), 
pos_weight=1) for _ in range(num_stages) ]), test_cfg=dict( rpn=None, rcnn=dict( max_per_img=num_proposals, mask_thr=0.5, stuff_score_thr=0.05, merge_stuff_thing=dict( overlap_thr=0.6, iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3)))) custom_imports = dict( imports=[ 'knet.det.kernel_head', 'knet.det.kernel_iter_head', 'knet.det.kernel_update_head', 'knet.det.semantic_fpn_wrapper', 'knet.det.dice_loss', 'knet.cross_entropy_loss', 'knet.kernel_updator', 'knet.det.mask_hungarian_assigner', 'knet.det.mask_pseudo_sampler', 'external.coco_panoptic', 'external.youtubevis_clips', 'external.cityscapes_vps', 'external.cityscape_panoptic', 'external.cityscapes_dvps', 'swin.swin_transformer', 'swin.mix_transformer', 'swin.DetectRS', 'swin.swin_transformer_rfp', # 'knet.video.knet_track', # 'knet.video.knet_track_head', 'knet.video.track_heads', 'knet.video.kernel_head', 'knet.video.kernel_iter_head', 'knet.video.kernel_update_head', 'knet.video.knet_uni_track', 'knet.video.knet_quansi_dense', 'knet.video.knet_quansi_dense_conv_mask', 'knet.video.knet_quansi_dense_roi_gt_box', 'knet.video.knet_quansi_dense_embed_fc', # 'knet.video.knet_quansi_dense_embed_fc_joint_train', 'knet.video.knet_quansi_dense_roi_gt_box_joint_train', 'knet.video.qdtrack.losses.l2_loss', 'knet.video.qdtrack.losses.multipos_cross_entropy_loss', 'knet.video.qdtrack.trackers.quasi_dense_embed_tracker', 'knet.video.knet_quansi_dense_embed_fc_toy_exp', 'external.ext.ytvos', 'external.ext.mask', 'external.dataset.pipelines.transforms', 'external.dataset.pipelines.loading', 'external.dataset.pipelines.formatting', 'external.dataset.dvps_pipelines.transforms', 'external.dataset.dvps_pipelines.loading', 'external.dataset.dvps_pipelines.tricks', 'external.dataset.pipelines.formatting', ], allow_failed_imports=False) ================================================ FILE: configs/det/_base_/schedules/schedule_10e.py ================================================ # optimizer # this is different from the 
original 1x schedule that use SGD optimizer = dict( type='AdamW', lr=0.0001, weight_decay=0.05, paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.25)})) optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[8,]) runner = dict(type='EpochBasedRunner', max_epochs=10) ================================================ FILE: configs/det/_base_/schedules/schedule_1x.py ================================================ # optimizer # this is different from the original 1x schedule that use SGD optimizer = dict( type='AdamW', lr=0.0001, weight_decay=0.05, paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.25)})) optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[8, 11]) runner = dict(type='EpochBasedRunner', max_epochs=12) ================================================ FILE: configs/det/coco/knet_s3_r50_deformable_fpn_ms-3x_coco.py ================================================ _base_ = [ '../_base_/models/knet_s3_r50_deformable_fpn.py', '../common/mstrain_3x_coco_instance.py' ] model = dict( backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=True,), ) ================================================ FILE: configs/det/coco/knet_s3_r50_fpn_ms-3x_coco-panoptic.py ================================================ _base_ = [ '../_base_/models/knet_s3_r50_fpn_panoptic.py', '../common/mstrain_3x_coco_panoptic.py' ] num_stages = 3 num_proposals = 100 conv_kernel_size = 1 model = dict( type='KNet', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch', init_cfg=dict(type='Pretrained', 
checkpoint='torchvision://resnet50')), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=0, add_extra_convs='on_input', num_outs=4), rpn_head=dict( type='ConvKernelHead', num_classes=133, # modified for panoptic cat_stuff_mask=True, # modified for panoptic conv_kernel_size=conv_kernel_size, feat_downsample_stride=2, feat_refine_stride=1, feat_refine=False, use_binary=True, num_loc_convs=1, num_seg_convs=1, conv_normal_init=True, localization_fpn=dict( type='SemanticFPNWrapper', in_channels=256, feat_channels=256, out_channels=256, start_level=0, end_level=3, upsample_times=2, positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), cat_coors=False, cat_coors_level=3, fuse_by_cat=False, return_list=False, num_aux_convs=1, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)), num_proposals=num_proposals, proposal_feats_with_obj=True, xavier_init_kernel=False, kernel_init_std=1, num_cls_fcs=1, in_channels=256, feat_transform_cfg=None, loss_rank=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1), loss_seg=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict(type='DiceLoss', loss_weight=4.0)), roi_head=dict( type='KernelIterHead', do_panoptic=True, merge_joint=True, num_stages=num_stages, stage_loss_weights=[1] * num_stages, proposal_feature_channel=256, mask_head=[ dict( type='KernelUpdateHead', num_classes=133, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=2, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, 
input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_rank=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0)) for _ in range(num_stages) ]), test_cfg=dict( rpn=None, rcnn=dict( max_per_img=num_proposals, mask_thr=0.5, stuff_score_thr=0.05, merge_stuff_thing=dict( overlap_thr=0.6, iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3))) ) ================================================ FILE: configs/det/coco/knet_s3_r50_fpn_ms-3x_coco.py ================================================ _base_ = [ '../_base_/models/knet_s3_r50_fpn.py', '../common/mstrain_3x_coco_instance.py' ] ================================================ FILE: configs/det/coco/knet_s3_swin-b_deformable_fpn_ms-3x_coco.py ================================================ _base_ = [ '../_base_/models/knet_s3_r50_deformable_fpn.py', '../common/mstrain_3x_coco_instance.py' ] model = dict( pretrained='/mnt/lustre/lixiangtai/pretrained/swin/swin_base_patch4_window7_224_22k.pth', backbone=dict( _delete_=True, type='SwinTransformerDIY', embed_dims=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.3, use_abs_pos_embed=False, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=False), neck=dict(in_channels=[128, 256, 512, 1024]) ) ================================================ FILE: configs/det/common/lsj_coco_panoptic_50e.py ================================================ _base_ = '../_base_/default_runtime.py' # dataset settings dataset_type = 'CocoPanopticDatasetCustom' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) image_size = (1024, 1024) # In 
mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], # multiscale_mode='range' train_pipeline = [ dict(type='LoadImageFromFile'), dict( type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), dict( type='Resize', img_scale=image_size, ratio_range=(0.1, 2.0), multiscale_mode='range', keep_ratio=True), dict( type='RandomCrop', crop_type='absolute_range', crop_size=image_size, recompute_bbox=True, allow_negative_crop=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] # Use RepeatDataset to speed up training data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type='RepeatDataset', times=1, dataset=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/panoptic_train2017_thing_only_coco.json', panoptic_ann=data_root + 'annotations/panoptic_train2017.json'), img_prefix=data_root + 'train2017/', seg_prefix=data_root + 'panoptic_stuff_train2017/', pipeline=train_pipeline)), val=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/instances_val2017.json', panoptic_ann=data_root + 'annotations/panoptic_val2017.json'), seg_prefix=data_root + 'panoptic_val2017/', img_prefix=data_root + 'val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/instances_val2017.json', panoptic_ann=data_root + 'annotations/panoptic_val2017.json'), seg_prefix=data_root + 'panoptic_val2017/', img_prefix=data_root + 'val2017/', 
pipeline=test_pipeline)) evaluation = dict(metric=['segm', 'panoptic'], interval=5) checkpoint_config = dict(interval=5) # optimizer # this is different from the original 1x schedule that uses SGD optimizer = dict( type='AdamW', lr=0.0001, weight_decay=0.05, paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.25)})) optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2)) # learning policy # 50-epoch schedule: LR steps at epochs 42 and 48 (the step=[9, 11] note from the 12-epoch configs does not apply here) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[42, 48]) runner = dict(type='EpochBasedRunner', max_epochs=50) ================================================ FILE: configs/det/common/mstrain_3x_coco_instance.py ================================================ _base_ = '../_base_/default_runtime.py' # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) # In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], # multiscale_mode='range' train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict( type='Resize', img_scale=[(1333, 640), (1333, 800)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] # Use RepeatDataset to speed up training data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type='RepeatDataset',
times=3, dataset=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', pipeline=train_pipeline)), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline)) evaluation = dict(interval=1, metric=['segm']) # optimizer # this is different from the original 1x schedule that use SGD optimizer = dict( type='AdamW', lr=0.0001, weight_decay=0.05, paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.25)})) optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2)) # learning policy # Experiments show that using step=[9, 11] has higher performance lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[9, 11]) runner = dict(type='EpochBasedRunner', max_epochs=12) ================================================ FILE: configs/det/common/mstrain_3x_coco_panoptic_inst_anno.py ================================================ _base_ = '../_base_/default_runtime.py' # dataset settings dataset_type = 'CocoPanopticDatasetCustom' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) # In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], # multiscale_mode='range' train_pipeline = [ dict(type='LoadImageFromFile'), dict( type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), dict( type='Resize', img_scale=[(1333, 640), (1333, 800)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), ] test_pipeline = [ 
dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] # Use RepeatDataset to speed up training data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type='RepeatDataset', times=3, dataset=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/panoptic_train2017_thing_only_coco.json', panoptic_ann=data_root + 'annotations/panoptic_train2017.json'), img_prefix=data_root + 'train2017/', seg_prefix=data_root + 'panoptic_stuff_train2017/', pipeline=train_pipeline)), val=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/instances_val2017.json', panoptic_ann=data_root + 'annotations/panoptic_val2017.json'), seg_prefix=data_root + 'panoptic_val2017/', img_prefix=data_root + 'val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/instances_val2017.json', panoptic_ann=data_root + 'annotations/panoptic_val2017.json'), seg_prefix=data_root + 'panoptic_val2017/', img_prefix=data_root + 'val2017/', pipeline=test_pipeline)) evaluation = dict(metric=['segm', 'panoptic']) # optimizer # this is different from the original 1x schedule that use SGD optimizer = dict( type='AdamW', lr=0.0001, weight_decay=0.05, paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.25)})) optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2)) # learning policy # Experiments show that using step=[9, 11] has higher performance lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[9, 11]) runner = dict(type='EpochBasedRunner', max_epochs=12) ================================================ FILE: configs/det/common/mstrain_3x_coco_panoptic_inst_anno_detr_aug.py 
================================================ _base_ = '../_base_/default_runtime.py' # dataset settings dataset_type = 'CocoPanopticDatasetCustom' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) # In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], # multiscale_mode='range' train_pipeline = [ dict(type='LoadImageFromFile'), dict( type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), dict(type='RandomFlip', flip_ratio=0.5), dict( type='AutoAugment', policies=[[ dict( type='Resize', img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), (608, 1333), (640, 1333), (672, 1333), (704, 1333), (736, 1333), (768, 1333), (800, 1333)], multiscale_mode='value', keep_ratio=True) ], [ dict( type='Resize', img_scale=[(400, 1333), (500, 1333), (600, 1333)], multiscale_mode='value', keep_ratio=True), dict( type='RandomCrop', crop_type='relative', crop_size=(0.7, 0.7), allow_negative_crop=True), dict( type='Resize', img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), (608, 1333), (640, 1333), (672, 1333), (704, 1333), (736, 1333), (768, 1333), (800, 1333)], multiscale_mode='value', override=True, keep_ratio=True) ]]), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] # Use RepeatDataset to speed up training data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type='RepeatDataset', times=3, dataset=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 
'annotations/panoptic_train2017_thing_only_coco.json', panoptic_ann=data_root + 'annotations/panoptic_train2017.json'), img_prefix=data_root + 'train2017/', seg_prefix=data_root + 'panoptic_stuff_train2017/', pipeline=train_pipeline)), val=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/instances_val2017.json', panoptic_ann=data_root + 'annotations/panoptic_val2017.json'), seg_prefix=data_root + 'panoptic_val2017/', img_prefix=data_root + 'val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/instances_val2017.json', panoptic_ann=data_root + 'annotations/panoptic_val2017.json'), seg_prefix=data_root + 'panoptic_val2017/', img_prefix=data_root + 'val2017/', pipeline=test_pipeline)) evaluation = dict(metric=['segm', 'panoptic']) # optimizer # this is different from the original 1x schedule that use SGD optimizer = dict( type='AdamW', lr=0.0001, weight_decay=0.05, paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.25)})) optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2)) # learning policy # Experiments show that using step=[9, 11] has higher performance lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[9, 11]) runner = dict(type='EpochBasedRunner', max_epochs=12) ================================================ FILE: configs/det/common/mstrain_64e_city_panoptic.py ================================================ _base_ = '../_base_/default_runtime.py' # dataset settings dataset_type = 'CityscapesPanopticDataset' data_root = 'data/cityscapes/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), dict( type='Resize', img_scale=[(512, 1024), (2048, 4096)], multiscale_mode='range', keep_ratio=True), dict(type='RandomCrop', crop_size=(1024, 2048)), 
dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2048, 1024), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type='RepeatDataset', times=8, dataset=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/instancesonly_filtered_gtFine_train.json', panoptic_ann=data_root + 'annotations/cityscapes_panoptic_train.json' ), img_prefix=data_root + 'leftImg8bit/train/', seg_prefix=data_root + 'gtFine/train', pipeline=train_pipeline)), val=dict( type=dataset_type, ann_file=dict( ins_ann=data_root +'annotations/instancesonly_filtered_gtFine_val.json', panoptic_ann=data_root + "annotations/cityscapes_panoptic_val.json" ), img_prefix=data_root + 'leftImg8bit/val/', seg_prefix=data_root + 'gtFine/cityscapes_panoptic_val', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=dict( ins_ann=data_root + 'annotations/instancesonly_filtered_gtFine_val.json', panoptic_ann=data_root + "annotations/cityscapes_panoptic_val.json" ), img_prefix=data_root + 'leftImg8bit/val/', seg_prefix=data_root + 'gtFine/cityscapes_panoptic_val', pipeline=test_pipeline)) evaluation = dict(metric=['panoptic']) # optimizer # this is different from the original 1x schedule that use SGD optimizer = dict( type='AdamW', lr=0.0001, weight_decay=0.05, paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.25)})) optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2)) # learning policy # Experiments show that using step=[9, 11] has 
higher performance lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=0.001, # [7] yields higher performance than [6] step=[7]) runner = dict( type='EpochBasedRunner', max_epochs=8) # actual epoch = 8 * 8 = 64 ================================================ FILE: configs/det/knet_cityscapes_step/knet_s3_r50_fpn.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', '../_base_/models/knet_citystep_s3_r50_fpn.py', '../_base_/datasets/cityscapes_step.py', ] num_proposals = 100 # load_from = "/mnt/lustre/lixiangtai/pretrained/video_knet_vis/knet_r50_city.pth" load_from = None work_dir = 'logger/blackhole' runner = dict(type='EpochBasedRunner', max_epochs=8) model = dict( type='KNet', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=True, style='pytorch', init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), roi_head=dict( type='KernelIterHead', merge_joint=True,), test_cfg=dict( rpn=None, rcnn=dict( max_per_img=num_proposals, mask_thr=0.5, stuff_score_thr=0.05, merge_stuff_thing=dict( overlap_thr=0.6, iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3))) ) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[7, ], ) data = dict( samples_per_gpu=2, workers_per_gpu=2, ) ================================================ FILE: configs/det/knet_cityscapes_step/knet_s3_swin_b_fpn.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', '../_base_/models/knet_citystep_s3_r50_fpn.py', '../_base_/datasets/cityscapes_step.py', ] num_proposals = 100 # load_from = "/mnt/lustre/lixiangtai/pretrained/video_knet_vis/knet_swin_b_city.pth" load_from = None work_dir = 'logger/blackhole' runner = dict(type='EpochBasedRunner', 
max_epochs=8) model = dict( type='KNet', backbone=dict( _delete_=True, type='SwinTransformerDIY', embed_dims=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.3, use_abs_pos_embed=False, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=False), neck=dict(in_channels=[128, 256, 512, 1024]), roi_head=dict( type='KernelIterHead', merge_joint=True, ), test_cfg=dict( rpn=None, rcnn=dict( max_per_img=num_proposals, mask_thr=0.5, stuff_score_thr=0.05, merge_stuff_thing=dict( overlap_thr=0.6, iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3))) ) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[7, ], ) data = dict( samples_per_gpu=1, workers_per_gpu=2, ) ================================================ FILE: configs/det/knet_cityscapes_step/knet_s3_swin_l_fpn.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', '../_base_/models/knet_citystep_s3_r50_fpn.py', '../_base_/datasets/cityscapes_step.py', ] num_proposals = 100 # load_from = "/mnt/lustre/lixiangtai/pretrained/video_knet_vis/knet_swin_l_city.pth" load_from = None work_dir = 'logger/blackhole' runner = dict(type='EpochBasedRunner', max_epochs=8) model = dict( type='KNet', backbone=dict( _delete_=True, type='SwinTransformerDIY', embed_dims=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, use_abs_pos_embed=False, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=False), neck=dict(in_channels=[192, 384, 768, 1536]), roi_head=dict( type='KernelIterHead', merge_joint=True, ), test_cfg=dict( rpn=None, rcnn=dict( max_per_img=num_proposals, mask_thr=0.5, stuff_score_thr=0.05, merge_stuff_thing=dict( overlap_thr=0.6, iou_thr=0.5, stuff_max_area=4096, 
instance_score_thr=0.3))) ) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[7, ], ) data = dict( samples_per_gpu=1, workers_per_gpu=2, ) ================================================ FILE: configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', '../_base_/models/knet_kitti_step_s3_r50_fpn.py', '../_base_/datasets/kitti_step_vps.py', ] load_from = None num_stages = 3 conv_kernel_size = 1 num_thing_classes = 2 num_stuff_classes = 17 num_classes = num_thing_classes + num_stuff_classes model = dict( type="VideoKNetQuansiEmbedFCJointTrain", cityscapes=False, kitti_step=True, link_previous=True, mask_assign_stride=2, num_thing_classes=2, num_stuff_classes=17, ignore_label=255, backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='SyncBN', requires_grad=True), ), rpn_head=dict( loss_seg=dict( _delete_=True, type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), feat_downsample_stride=4, ), # add video_knet_vis roi head track_head=dict( type='QuasiDenseMaskEmbedHeadGTMask', num_convs=0, num_fcs=2, roi_feat_size=1, in_channels=256, fc_out_channels=256, embed_channels=256, norm_cfg=dict(type='GN', num_groups=32), loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), loss_track_aux=dict( type='L2Loss', neg_pos_ub=3, pos_margin=0, neg_margin=0.1, hard_mining=True, loss_weight=1.0), ), # add tracker config tracker=dict( type='QuasiDenseEmbedTracker', init_score_thr=0.35, obj_score_thr=0.3, match_score_thr=0.5, memo_tracklet_frames=5, memo_backdrop_frames=1, memo_momentum=0.8, nms_conf_thr=0.5, nms_backdrop_iou_thr=0.3, nms_class_iou_thr=0.7, with_cats=True, match_metric='bisoftmax' ), # roi head roi_head=dict( type='VideoKernelIterHead', 
num_stages=num_stages, num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, with_track=True, merge_joint=True, mask_head=[ dict( type='VideoKernelUpdateHead', num_classes=19, previous='placeholder', previous_type="ffn", num_thing_classes=2, num_stuff_classes=17, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=4, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), ) for _ in range(num_stages) ] ), track_train_cfg=dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), sampler=dict(type='MaskPseudoSampler'),), bbox_roi_extractor=None ) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False) train_pipeline = [ dict(type='LoadMultiImagesDirect'), dict(type='LoadMultiAnnotationsDirect', with_depth=False, divisor=-1, cherry_pick=True, cherry=[11, 13]), dict(type='SeqResizeWithDepth', img_scale=(384, 1248), ratio_range=[0.5, 2.0], keep_ratio=True), dict(type='SeqFlipWithDepth', flip_ratio=0.5), dict(type='SeqRandomCropWithDepth', crop_size=(384, 1248), share_params=True), dict(type='SeqNormalizeWithDepth', **img_norm_cfg), dict(type='SeqPadWithDepth', size_divisor=32), dict( type='VideoCollect', keys=['img', 
'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_instance_ids',]), dict(type='ConcatVideoReferences'), dict(type='SeqDefaultFormatBundle', ref_prefix='ref'), ] test_pipeline = [ dict(type='LoadImgDirect'), dict( type='MultiScaleFlipAug', scale_factor=[1.0], flip=False, transforms=[ dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img', 'img_id', 'seq_id'], meta_keys=[ 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg', 'ori_filename' ]), ]) ] runner = dict(type='EpochBasedRunner', max_epochs=12) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[9, 11]) data = dict( samples_per_gpu=1, workers_per_gpu=2, train=dict( type='RepeatDataset', times=2, dataset=dict( split='train', ref_seq_index=[-2, -1, 1, 2], test_mode=False, pipeline=train_pipeline )), test=dict( ref_seq_index=None, test_mode=True, pipeline=test_pipeline, split='val', ) ) find_unused_parameters=True ================================================ FILE: configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train_8e.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', '../_base_/models/knet_kitti_step_s3_r50_fpn.py', '../_base_/datasets/kitti_step_vps.py', ] load_from = None num_stages = 3 conv_kernel_size = 1 num_thing_classes = 2 num_stuff_classes = 17 num_classes = num_thing_classes + num_stuff_classes model = dict( type="VideoKNetQuansiEmbedFCJointTrain", cityscapes=False, kitti_step=True, link_previous=True, mask_assign_stride=2, num_thing_classes=2, num_stuff_classes=17, ignore_label=255, backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='SyncBN', requires_grad=True), ), 
rpn_head=dict( loss_seg=dict( _delete_=True, type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), feat_downsample_stride=4, ), # add video_knet_vis roi head track_head=dict( type='QuasiDenseMaskEmbedHeadGTMask', num_convs=0, num_fcs=2, roi_feat_size=1, in_channels=256, fc_out_channels=256, embed_channels=256, norm_cfg=dict(type='GN', num_groups=32), loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), loss_track_aux=dict( type='L2Loss', neg_pos_ub=3, pos_margin=0, neg_margin=0.1, hard_mining=True, loss_weight=1.0), ), # add tracker config tracker=dict( type='QuasiDenseEmbedTracker', init_score_thr=0.35, obj_score_thr=0.3, match_score_thr=0.5, memo_tracklet_frames=5, memo_backdrop_frames=1, memo_momentum=0.8, nms_conf_thr=0.5, nms_backdrop_iou_thr=0.3, nms_class_iou_thr=0.7, with_cats=True, match_metric='bisoftmax' ), # roi head roi_head=dict( type='VideoKernelIterHead', num_stages=num_stages, num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, with_track=True, merge_joint=True, mask_head=[ dict( type='VideoKernelUpdateHead', num_classes=19, previous='placeholder', previous_type="ffn", num_thing_classes=2, num_stuff_classes=17, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=4, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), ) for _ in range(num_stages) ] ), track_train_cfg=dict( 
assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), sampler=dict(type='MaskPseudoSampler'),), bbox_roi_extractor=None ) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False) train_pipeline = [ dict(type='LoadMultiImagesDirect'), dict(type='LoadMultiAnnotationsDirect', with_depth=False, divisor=-1, cherry_pick=True, cherry=[11, 13]), dict(type='SeqResizeWithDepth', img_scale=(384, 1248), ratio_range=[0.5, 2.0], keep_ratio=True), dict(type='SeqFlipWithDepth', flip_ratio=0.5), dict(type='SeqRandomCropWithDepth', crop_size=(384, 1248), share_params=True), dict(type='SeqNormalizeWithDepth', **img_norm_cfg), dict(type='SeqPadWithDepth', size_divisor=32), dict( type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_instance_ids',]), dict(type='ConcatVideoReferences'), dict(type='SeqDefaultFormatBundle', ref_prefix='ref'), ] test_pipeline = [ dict(type='LoadImgDirect'), dict( type='MultiScaleFlipAug', scale_factor=[1.0], flip=False, transforms=[ dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img', 'img_id', 'seq_id'], meta_keys=[ 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg', 'ori_filename' ]), ]) ] runner = dict(type='EpochBasedRunner', max_epochs=8) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[7]) data = dict( samples_per_gpu=1, workers_per_gpu=2, train=dict( type='RepeatDataset', times=2, dataset=dict( split='train', ref_seq_index=[-2, -1, 1, 2], test_mode=False, pipeline=train_pipeline )), test=dict( ref_seq_index=None, test_mode=True, pipeline=test_pipeline, split='val', ) ) find_unused_parameters=True 
================================================ FILE: configs/det/video_knet_kitti_step/video_knet_s3_swinb_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', '../_base_/models/knet_kitti_step_s3_r50_fpn.py', '../_base_/datasets/kitti_step_vps.py', ] load_from = None num_stages = 3 conv_kernel_size = 1 num_thing_classes = 2 num_stuff_classes = 17 num_classes = num_thing_classes + num_stuff_classes model = dict( type="VideoKNetQuansiEmbedFCJointTrain", cityscapes=False, kitti_step=True, link_previous=True, mask_assign_stride=2, num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, ignore_label=255, backbone=dict( _delete_=True, type='SwinTransformerDIY', embed_dims=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.3, use_abs_pos_embed=False, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=False ), neck=dict(in_channels=[128, 256, 512, 1024]), rpn_head=dict( loss_seg=dict( _delete_=True, type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), feat_downsample_stride=4, ), # add video_knet_vis roi head track_head=dict( type='QuasiDenseMaskEmbedHeadGTMask', num_convs=0, num_fcs=2, roi_feat_size=1, in_channels=256, fc_out_channels=256, embed_channels=256, norm_cfg=dict(type='GN', num_groups=32), loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), loss_track_aux=dict( type='L2Loss', neg_pos_ub=3, pos_margin=0, neg_margin=0.1, hard_mining=True, loss_weight=1.0), ), # add tracker config tracker=dict( type='QuasiDenseEmbedTracker', init_score_thr=0.35, obj_score_thr=0.3, match_score_thr=0.5, memo_tracklet_frames=5, memo_backdrop_frames=1, memo_momentum=0.8, nms_conf_thr=0.5, nms_backdrop_iou_thr=0.3, nms_class_iou_thr=0.7, with_cats=True, match_metric='bisoftmax' ), # roi 
head roi_head=dict( type='VideoKernelIterHead', num_stages=num_stages, num_thing_classes=2, num_stuff_classes=17, with_track=True, merge_joint=True, mask_head=[ dict( type='VideoKernelUpdateHead', num_classes=num_classes, previous='placeholder', previous_link="update_dynamic_cov", previous_type="update", num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=4, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), ) for _ in range(num_stages) ] ), track_train_cfg=dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), sampler=dict(type='MaskPseudoSampler'),), bbox_roi_extractor=None ) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False) train_pipeline = [ dict(type='LoadMultiImagesDirect'), dict(type='LoadMultiAnnotationsDirect', with_depth=False, divisor=-1, cherry_pick=True, cherry=[11, 13]), dict(type='SeqResizeWithDepth', img_scale=(384, 1248), ratio_range=[0.5, 2.0], keep_ratio=True), dict(type='SeqFlipWithDepth', flip_ratio=0.5), dict(type='SeqRandomCropWithDepth', crop_size=(384, 1248), share_params=True), dict(type='SeqNormalizeWithDepth', **img_norm_cfg), 
dict(type='SeqPadWithDepth', size_divisor=32), dict( type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_instance_ids',]), dict(type='ConcatVideoReferences'), dict(type='SeqDefaultFormatBundle', ref_prefix='ref'), ] test_pipeline = [ dict(type='LoadImgDirect'), dict( type='MultiScaleFlipAug', scale_factor=[1.0], flip=False, transforms=[ dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img', 'img_id', 'seq_id'], meta_keys=[ 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg', 'ori_filename' ]), ]) ] runner = dict(type='EpochBasedRunner', max_epochs=12) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[9, 11]) data = dict( samples_per_gpu=1, workers_per_gpu=2, train=dict( type='RepeatDataset', times=2, dataset=dict( split='train', ref_seq_index=[-2, -1, 1, 2], test_mode=False, pipeline=train_pipeline )), test=dict( ref_seq_index=None, test_mode=True, pipeline=test_pipeline, split='val', ) ) find_unused_parameters=True ================================================ FILE: configs/det/video_knet_kitti_step/video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py ================================================ _base_ = [ '../../_base_/schedules/schedule_1x.py', '../../_base_/default_runtime.py', '../../_base_/models/knet_kitti_step_s3_r50_fpn.py', '../../_base_/datasets/kitti_step_vps.py', ] load_from = None num_stages = 3 conv_kernel_size = 1 num_thing_classes = 2 num_stuff_classes = 17 num_classes = num_thing_classes + num_stuff_classes model = dict( type="VideoKNetQuansiEmbedFCJointTrain", cityscapes=False, kitti_step=True, link_previous=True, mask_assign_stride=2, num_thing_classes=2, num_stuff_classes=17, ignore_label=255, backbone=dict( _delete_=True, type='SwinTransformerDIY', 
embed_dims=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, use_abs_pos_embed=False, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=False), neck=dict(in_channels=[192, 384, 768, 1536]), rpn_head=dict( loss_seg=dict( _delete_=True, type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), feat_downsample_stride=4, ), # add video_knet_vis roi head track_head=dict( type='QuasiDenseMaskEmbedHeadGTMask', num_convs=0, num_fcs=2, roi_feat_size=1, in_channels=256, fc_out_channels=256, embed_channels=256, norm_cfg=dict(type='GN', num_groups=32), loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), loss_track_aux=dict( type='L2Loss', neg_pos_ub=3, pos_margin=0, neg_margin=0.1, hard_mining=True, loss_weight=1.0), ), # add tracker config tracker=dict( type='QuasiDenseEmbedTracker', init_score_thr=0.35, obj_score_thr=0.3, match_score_thr=0.5, memo_tracklet_frames=5, memo_backdrop_frames=1, memo_momentum=0.8, nms_conf_thr=0.5, nms_backdrop_iou_thr=0.3, nms_class_iou_thr=0.7, with_cats=True, match_metric='bisoftmax' ), # roi head roi_head=dict( type='VideoKernelIterHead', num_stages=num_stages, num_thing_classes=2, num_stuff_classes=17, with_track=True, merge_joint=True, mask_head=[ dict( type='VideoKernelUpdateHead', num_classes=19, previous='placeholder', previous_link="update_dynamic_cov", previous_type="update", num_thing_classes=2, num_stuff_classes=17, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=4, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', 
inplace=True), norm_cfg=dict(type='LN')), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), ) for _ in range(num_stages) ] ), track_train_cfg=dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), sampler=dict(type='MaskPseudoSampler'),), bbox_roi_extractor=None ) work_dir = 'logger/ks_wodepth_4x8_step_stride2_nocrop_2_17' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False) train_pipeline = [ dict(type='LoadMultiImagesDirect'), dict(type='LoadMultiAnnotationsDirect', with_depth=False, divisor=-1, cherry_pick=True, cherry=[11, 13]), dict(type='SeqResizeWithDepth', img_scale=(384, 1248), ratio_range=[0.5, 2.0], keep_ratio=True), dict(type='SeqFlipWithDepth', flip_ratio=0.5), dict(type='SeqRandomCropWithDepth', crop_size=(384, 1248), share_params=True), dict(type='SeqNormalizeWithDepth', **img_norm_cfg), dict(type='SeqPadWithDepth', size_divisor=32), dict( type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_instance_ids',]), dict(type='ConcatVideoReferences'), dict(type='SeqDefaultFormatBundle', ref_prefix='ref'), ] test_pipeline = [ dict(type='LoadImgDirect'), dict( type='MultiScaleFlipAug', scale_factor=[1.0], flip=False, transforms=[ dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img', 'img_id', 'seq_id'], meta_keys=[ 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg', 'ori_filename' ]), ]) ] runner = dict(type='EpochBasedRunner', max_epochs=12) lr_config = dict( policy='step', 
warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[9, 11]) data = dict( samples_per_gpu=1, workers_per_gpu=2, train=dict( type='RepeatDataset', times=2, dataset=dict( split='train', ref_seq_index=[-2, -1, 1, 2], test_mode=False, pipeline=train_pipeline )), test=dict( ref_seq_index=None, test_mode=True, pipeline=test_pipeline, split='val', ) ) find_unused_parameters=True ================================================ FILE: configs/det/video_knet_kitti_step/video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_update_conv_short_track_fc.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', '../_base_/models/knet_kitti_step_s3_r50_fpn.py', '../_base_/datasets/kitti_step_vps.py', ] # load_from = "/mnt/lustre/lixiangtai/project/Knet/work_dirs/city_step/swin_l_joint_8e/latest.pth" load_from = None num_stages = 3 conv_kernel_size = 1 model = dict( type="VideoKNetQuansiEmbedFCJointTrain", cityscapes=False, kitti_step=True, link_previous=True, mask_assign_stride=2, num_thing_classes=2, num_stuff_classes=17, ignore_label=255, backbone=dict( _delete_=True, type='SwinTransformerDIY', embed_dims=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, use_abs_pos_embed=False, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=False), neck=dict(in_channels=[192, 384, 768, 1536]), rpn_head=dict( loss_seg=dict( _delete_=True, type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), feat_downsample_stride=4, ), # add track roi head track_head=dict( type='QuasiDenseMaskEmbedHeadGTMask', num_convs=0, num_fcs=1, roi_feat_size=1, in_channels=256, fc_out_channels=256, embed_channels=256, norm_cfg=dict(type='GN', num_groups=32), loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), loss_track_aux=dict( type='L2Loss', neg_pos_ub=3, pos_margin=0, 
neg_margin=0.1, hard_mining=True, loss_weight=1.0), ), # add tracker config tracker=dict( type='QuasiDenseEmbedTracker', init_score_thr=0.35, obj_score_thr=0.3, match_score_thr=0.5, memo_tracklet_frames=5, memo_backdrop_frames=1, memo_momentum=0.8, nms_conf_thr=0.5, nms_backdrop_iou_thr=0.3, nms_class_iou_thr=0.7, with_cats=True, match_metric='bisoftmax' ), # roi head roi_head=dict( type='VideoKernelIterHead', num_stages=num_stages, num_thing_classes=2, num_stuff_classes=17, with_track=True, merge_joint=True, mask_head=[ dict( type='VideoKernelUpdateHead', num_classes=19, previous='placeholder', previous_link="update_dynamic_cov", previous_type="ffn", num_thing_classes=2, num_stuff_classes=17, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=4, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), ) for _ in range(num_stages) ] ), track_train_cfg=dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), sampler=dict(type='MaskPseudoSampler'),), bbox_roi_extractor=None ) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False) train_pipeline = [ dict(type='LoadMultiImagesDirect'), dict(type='LoadMultiAnnotationsDirect', with_depth=False, 
divisor=-1, cherry_pick=True, cherry=[11, 13]), dict(type='SeqResizeWithDepth', img_scale=(384, 1248), ratio_range=[0.5, 2.0], keep_ratio=True), dict(type='SeqFlipWithDepth', flip_ratio=0.5), dict(type='SeqRandomCropWithDepth', crop_size=(384, 1248), share_params=True), dict(type='SeqNormalizeWithDepth', **img_norm_cfg), dict(type='SeqPadWithDepth', size_divisor=32), dict( type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_instance_ids',]), dict(type='ConcatVideoReferences'), dict(type='SeqDefaultFormatBundle', ref_prefix='ref'), ] test_pipeline = [ dict(type='LoadImgDirect'), dict( type='MultiScaleFlipAug', scale_factor=[1.0], flip=False, transforms=[ dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img', 'img_id', 'seq_id'], meta_keys=[ 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg', 'ori_filename' ]), ]) ] runner = dict(type='EpochBasedRunner', max_epochs=12) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[9, 11]) data = dict( samples_per_gpu=1, workers_per_gpu=2, train=dict( type='RepeatDataset', times=2, dataset=dict( split='train', ref_seq_index=[-2, -1, 1, 2], test_mode=False, pipeline=train_pipeline )), test=dict( ref_seq_index=None, test_mode=True, pipeline=test_pipeline, split='val', ) ) find_unused_parameters=True ================================================ FILE: configs/det/video_knet_vipseg/video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', '../_base_/models/knet_vipseg_s3_r50_fpn.py', '../_base_/datasets/vipseg_dvps.py', ] num_stages = 3 conv_kernel_size = 1 num_thing_classes = 58 num_stuff_classes = 66 num_classes = num_stuff_classes + 
num_thing_classes model = dict( type="VideoKNetQuansiEmbedFCJointTrain", # use cityscape style label distribution. # thing first , stuff second cityscapes=False, vipseg=True, kitti_step=False, link_previous=True, mask_assign_stride=2, num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, ignore_label=255, backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True ), rpn_head=dict( num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, loss_seg=dict( _delete_=True, type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), feat_downsample_stride=4, ), # add track roi head track_head=dict( type='QuasiDenseMaskEmbedHeadGTMask', num_convs=0, num_fcs=2, roi_feat_size=1, in_channels=256, fc_out_channels=256, embed_channels=256, norm_cfg=dict(type='GN', num_groups=32), loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), loss_track_aux=dict( type='L2Loss', neg_pos_ub=3, pos_margin=0, neg_margin=0.1, hard_mining=True, loss_weight=1.0), ), # add tracker config tracker=dict( type='QuasiDenseEmbedTracker', init_score_thr=0.35, obj_score_thr=0.3, match_score_thr=0.5, memo_tracklet_frames=5, memo_backdrop_frames=1, memo_momentum=0.8, nms_conf_thr=0.5, nms_backdrop_iou_thr=0.3, nms_class_iou_thr=0.7, with_cats=True, match_metric='bisoftmax' ), # roi head roi_head=dict( type='VideoKernelIterHead', num_stages=num_stages, num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, with_track=True, merge_joint=True, mask_head=[ dict( type='VideoKernelUpdateHead', num_classes=num_classes, previous='placeholder', previous_type="ffn", num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=4, 
ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), ) for _ in range(num_stages) ] ), track_train_cfg=dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), sampler=dict(type='MaskPseudoSampler'),), bbox_roi_extractor=None ) runner = dict(type='EpochBasedRunner', max_epochs=12) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[9, 11]) find_unused_parameters=True ================================================ FILE: configs/det/video_knet_vipseg/video_knet_s3_swin_b_rpn_vipseg_mask_embed_link_ffn_joint_train_8e.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', '../_base_/models/knet_vipseg_s3_r50_fpn.py', '../_base_/datasets/vipseg_dvps.py', ] num_stages = 3 conv_kernel_size = 1 num_thing_classes = 58 num_stuff_classes = 66 num_classes = num_stuff_classes + num_thing_classes model = dict( type="VideoKNetQuansiEmbedFCJointTrain", # use cityscape style label distribution. 
# thing first , stuff second cityscapes=False, vipseg=True, kitti_step=False, link_previous=True, mask_assign_stride=2, num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, ignore_label=255, backbone=dict( _delete_=True, type='SwinTransformerDIY', embed_dims=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.3, use_abs_pos_embed=False, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=False), neck=dict( in_channels=[128, 256, 512, 1024], ), rpn_head=dict( num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, loss_seg=dict( _delete_=True, type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), feat_downsample_stride=4, ), # add track roi head track_head=dict( type='QuasiDenseMaskEmbedHeadGTMask', num_convs=0, num_fcs=2, roi_feat_size=1, in_channels=256, fc_out_channels=256, embed_channels=256, norm_cfg=dict(type='GN', num_groups=32), loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), loss_track_aux=dict( type='L2Loss', neg_pos_ub=3, pos_margin=0, neg_margin=0.1, hard_mining=True, loss_weight=1.0), ), # add tracker config tracker=dict( type='QuasiDenseEmbedTracker', init_score_thr=0.35, obj_score_thr=0.3, match_score_thr=0.5, memo_tracklet_frames=5, memo_backdrop_frames=1, memo_momentum=0.8, nms_conf_thr=0.5, nms_backdrop_iou_thr=0.3, nms_class_iou_thr=0.7, with_cats=True, match_metric='bisoftmax' ), # roi head roi_head=dict( type='VideoKernelIterHead', num_stages=num_stages, num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, with_track=True, merge_joint=True, mask_head=[ dict( type='VideoKernelUpdateHead', num_classes=num_classes, previous='placeholder', previous_type="ffn", num_thing_classes=num_thing_classes, num_stuff_classes=num_stuff_classes, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, 
# COCO instance-segmentation dataset settings: one fixed (1333, 800) scale for
# both training and testing.
dataset_type = 'CocoDataset'
data_root = 'data/coco/'

# Normalisation statistics unpacked into the 'Normalize' step of both pipelines.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training pipeline: load image + box/mask annotations, resize, random flip,
# normalise, pad to a multiple of 32, then bundle and collect the GT keys.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True, 'with_mask': True},
    {'type': 'Resize', 'img_scale': (1333, 800), 'keep_ratio': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {
        'type': 'Collect',
        'keys': ['img', 'gt_bboxes', 'gt_labels', 'gt_masks'],
    },
]

# Test pipeline: a single scale with flipping disabled, wrapped in
# MultiScaleFlipAug; only the image itself is collected.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (1333, 800),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# 'val' and 'test' both read the val2017 split and share the test pipeline.
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': dataset_type,
        'ann_file': data_root + 'annotations/instances_train2017.json',
        'img_prefix': data_root + 'train2017/',
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': data_root + 'annotations/instances_val2017.json',
        'img_prefix': data_root + 'val2017/',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': data_root + 'annotations/instances_val2017.json',
        'img_prefix': data_root + 'val2017/',
        'pipeline': test_pipeline,
    },
}

# we do not evaluate bbox because K-Net does not predict bounding boxes
evaluation = {'metric': ['segm']}
# Runtime defaults shared by the video_knet_vis configs.

# Checkpoint interval of 1 and text logging every 50 intervals.
checkpoint_config = {'interval': 1}
log_config = {
    'interval': 50,
    'hooks': [
        {'type': 'TextLoggerHook'},
    ],
}
# custom_hooks = [dict(type='NumClassCheckHook')]

# Distributed backend and logging verbosity.
dist_params = {'backend': 'nccl'}
log_level = 'INFO'

# No checkpoint is loaded or resumed by default.
load_from = None
resume_from = None

# Training-only workflow; outputs go to the placeholder work directory.
workflow = [('train', 1)]
work_dir = 'logger/blackhole'
out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch', init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=0, add_extra_convs='on_input', num_outs=4), rpn_head=dict( type='ConvKernelHeadVideo', conv_kernel_size=conv_kernel_size, feat_downsample_stride=2, feat_refine_stride=1, feat_refine=False, use_binary=True, num_loc_convs=1, num_seg_convs=1, conv_normal_init=True, localization_fpn=dict( type='SemanticFPNWrapper', in_channels=256, feat_channels=256, out_channels=256, start_level=0, end_level=3, upsample_times=2, positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), cat_coors=False, cat_coors_level=3, fuse_by_cat=False, return_list=False, num_aux_convs=1, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True) ), num_proposals=num_proposals, proposal_feats_with_obj=True, xavier_init_kernel=False, kernel_init_std=1, num_cls_fcs=1, in_channels=256, num_classes=40, feat_transform_cfg=None, loss_seg=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict(type='DiceLoss', loss_weight=4.0)), roi_head=dict( type='KernelIterHeadVideo', num_stages=num_stages, stage_loss_weights=[1] * num_stages, proposal_feature_channel=256, num_thing_classes=40, num_stuff_classes=0, mask_head=[ dict( type='KernelUpdateHead', num_classes=40, num_thing_classes=40, num_stuff_classes=0, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=2, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', 
in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0) ) for _ in range(num_stages) ]), tracker=dict( type="KernelFrameIterHeadVideo", num_proposals=num_proposals, num_stages=3, assign_stages=2, proposal_feature_channel=256, stage_loss_weights=(1., 1., 1.), num_thing_classes=40, num_stuff_classes=0, mask_head=dict( type='KernelUpdateHeadVideo', num_proposals=num_proposals, num_classes=40, num_thing_classes=40, num_stuff_classes=0, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=2, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0) ), ), # training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True) ), sampler=dict(type='MaskPseudoSampler'), pos_weight=1), rcnn=[ dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), 
mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True) ), sampler=dict(type='MaskPseudoSampler'), pos_weight=1) for _ in range(num_stages) ], tracker=dict( assigner=dict( type='MaskHungarianAssignerVideo', cls_cost=dict(type='FocalLossCost', weight=2.0), dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True) ), sampler=dict(type='MaskPseudoSampler'), pos_weight=1) ), test_cfg=dict( rpn=None, rcnn=dict( max_per_img=10, mask_thr=0.5, merge_stuff_thing=dict( iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3 ) ), tracker=dict( max_per_img=10, mask_thr=0.5, merge_stuff_thing=dict( iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3 ), ), ) ) custom_imports = dict( imports=[ 'knet_vis.det.knet', 'knet_vis.det.kernel_head', 'knet_vis.det.kernel_iter_head', 'knet_vis.det.kernel_update_head', 'knet_vis.det.semantic_fpn_wrapper', 'knet_vis.kernel_updator', 'knet_vis.det.mask_hungarian_assigner', 'knet_vis.det.mask_pseudo_sampler', 'knet_vis.tracker.track', 'knet_vis.tracker.kernel_head', 'knet_vis.tracker.kernel_iter_head', 'knet_vis.tracker.kernel_frame_iter_head', 'knet_vis.tracker.mask_hungarian_assigner', 'knet_vis.tracker.kernel_update_head', 'swin.swin_transformer', 'mmtrack.datasets.youtube_vis_dataset', 'mmtrack.pipelines', ], allow_failed_imports=False ) ================================================ FILE: configs/video_knet_vis/_base_/models/knet_track_r50_deformablefpn.py ================================================ num_stages = 3 num_proposals = 100 conv_kernel_size = 1 model = dict( type='KNetTrack', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch', init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), neck=dict( type='MSDeformAttnPixelDecoder', num_outs=3, norm_cfg=dict(type='GN', num_groups=32), act_cfg=dict(type='ReLU'), 
return_one_list=True, encoder=dict( type='DetrTransformerEncoder', num_layers=6, transformerlayers=dict( type='BaseTransformerLayer', attn_cfgs=dict( type='MultiScaleDeformableAttention', embed_dims=256, num_heads=8, num_levels=3, num_points=4, im2col_step=64, dropout=0.0, batch_first=False, norm_cfg=None, init_cfg=None), ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0.0, act_cfg=dict(type='ReLU', inplace=True)), operation_order=('self_attn', 'norm', 'ffn', 'norm')), init_cfg=None), positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), init_cfg=None), rpn_head=dict( type='ConvKernelHeadVideo', conv_kernel_size=conv_kernel_size, feat_downsample_stride=2, feat_refine_stride=1, feat_refine=False, use_binary=True, num_loc_convs=1, num_seg_convs=1, conv_normal_init=True, localization_fpn=dict( type='SemanticFPNWrapper', in_channels=256, feat_channels=256, out_channels=256, start_level=0, end_level=3, upsample_times=2, positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), cat_coors=False, cat_coors_level=3, fuse_by_cat=False, return_list=False, num_aux_convs=1, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True) ), num_proposals=num_proposals, proposal_feats_with_obj=True, xavier_init_kernel=False, kernel_init_std=1, num_cls_fcs=1, in_channels=256, num_classes=40, feat_transform_cfg=None, loss_seg=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict(type='DiceLoss', loss_weight=4.0)), roi_head=dict( type='KernelIterHeadVideo', num_stages=num_stages, stage_loss_weights=[1] * num_stages, proposal_feature_channel=256, num_thing_classes=40, num_stuff_classes=0, mask_head=[ dict( type='KernelUpdateHead', num_classes=40, num_thing_classes=40, num_stuff_classes=0, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, 
feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=2, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0) ) for _ in range(num_stages) ]), tracker=dict( type="KernelFrameIterHeadVideo", num_proposals=num_proposals, num_stages=3, assign_stages=2, proposal_feature_channel=256, stage_loss_weights=(1., 1., 1.), num_thing_classes=40, num_stuff_classes=0, mask_head=dict( type='KernelUpdateHeadVideo', num_proposals=num_proposals, num_classes=40, num_thing_classes=40, num_stuff_classes=0, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=1, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, conv_kernel_size=conv_kernel_size, mask_upsample_stride=2, ffn_act_cfg=dict(type='ReLU', inplace=True), with_ffn=True, feat_transform_cfg=dict( conv_cfg=dict(type='Conv2d'), act_cfg=None), kernel_updator_cfg=dict( type='KernelUpdator', in_channels=256, feat_channels=256, out_channels=256, input_feat_shape=3, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_dice=dict( type='DiceLoss', loss_weight=4.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0) ), ), # training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='FocalLossCost', weight=2.0), 
# optimizer
# AdamW; parameters matched by the 'backbone' key get a 0.25 lr multiplier.
optimizer = {
    'type': 'AdamW',
    'lr': 0.0001,
    'weight_decay': 0.05,
    'paramwise_cfg': {
        'custom_keys': {
            'backbone': {'lr_mult': 0.25},
        },
    },
}
# Gradient clipping with max_norm=1 and norm_type=2.
optimizer_config = {'grad_clip': {'max_norm': 1, 'norm_type': 2}}

# learning policy
# Step policy with a 1000-iteration linear warmup; milestones [5, 7] sit
# inside the 8-epoch run configured below.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [5, 7],
}
runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}
800)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] # Use RepeatDataset to speed up training data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type='RepeatDataset', times=3, dataset=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', pipeline=train_pipeline)), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline)) evaluation = dict(interval=1, metric=['segm']) # optimizer # this is different from the original 1x schedule that use SGD optimizer = dict( type='AdamW', lr=0.0001, weight_decay=0.05, paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.25)})) optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2)) # learning policy # Experiments show that using step=[9, 11] has higher performance lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[9, 11]) runner = dict(type='EpochBasedRunner', max_epochs=12) ================================================ FILE: configs/video_knet_vis/video_knet_vis/knet_track_r50_1x_youtubevis.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', 
'../_base_/default_runtime.py', '../_base_/models/knet_track_r50.py', '../_base_/datasets/youtubevis_2019.py', ] ================================================ FILE: configs/video_knet_vis/video_knet_vis/knet_track_r50_deformable_fpn_1x_youtubevis.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', '../_base_/models/knet_track_r50_deformablefpn.py', '../_base_/datasets/youtubevis_2019.py', ] data = dict( samples_per_gpu=1, workers_per_gpu=2,) ================================================ FILE: configs/video_knet_vis/video_knet_vis/knet_track_swinb_1x_youtubevis_8e.py ================================================ _base_ = [ '../_base_/schedules/schedule_8e.py', '../_base_/default_runtime.py', '../_base_/models/knet_track_r50.py', '../_base_/datasets/youtubevis_2019.py', ] model = dict( backbone=dict( _delete_=True, type='SwinTransformerDIY', embed_dims=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.3, use_abs_pos_embed=False, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=False ), neck=dict(in_channels=[128, 256, 512, 1024]), ) data = dict( samples_per_gpu=1, workers_per_gpu=2, ) ================================================ FILE: configs/video_knet_vis/video_knet_vis/knet_track_swinb_deformable_1x_youtubevis.py ================================================ _base_ = [ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', '../_base_/models/knet_track_r50_deformablefpn.py', '../_base_/datasets/youtubevis_2019.py', ] model = dict( backbone=dict( _delete_=True, type='SwinTransformerDIY', embed_dims=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.3, use_abs_pos_embed=False, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=True ), 
neck=dict(in_channels=[128, 256, 512, 1024]), ) dataset_type = 'YouTubeVISDataset' data_root = 'data/youtube_vis_2019/' dataset_version = '2019' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True ) train_pipeline = [ dict(type='LoadMultiImagesFromFile', to_float32=True), dict( type='SeqLoadAnnotations', with_bbox=True, with_mask=True, with_track=True), dict( type='SeqResize', multiscale_mode='value', share_params=True, img_scale=[(288,1e6), (320,1e6), (352,1e6), (392,1e6), (416,1e6), (448,1e6), (480,1e6), (512,1e6)], keep_ratio=True ), dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), dict(type='SeqNormalize', **img_norm_cfg), dict(type='SeqPad', size_divisor=32), dict( type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_instance_ids'], reject_empty=True, num_ref_imgs=5, ), dict(type='ConcatVideoReferences'), dict(type='SeqDefaultFormatBundle', ref_prefix='ref'), ] data = dict( samples_per_gpu=1, workers_per_gpu=2, train=dict( type='RepeatDataset', times=1, dataset=dict( type=dataset_type, dataset_version=dataset_version, ann_file=data_root + 'annotations/youtube_vis_2019_train.json', img_prefix=data_root + 'train/JPEGImages', ref_img_sampler=dict( num_ref_imgs=5, frame_range=[-2, 2], filter_key_img=False, method='uniform'), pipeline=train_pipeline )), ) ================================================ FILE: external/cityscape_panoptic.py ================================================ import contextlib import io import itertools import os import glob import tempfile import logging import os.path as osp from collections import OrderedDict import pycocotools.mask as maskUtils import mmcv import numpy as np from mmcv.utils import print_log from mmdet.datasets.builder import DATASETS from mmdet.datasets.coco import CocoDataset from mmdet.datasets.api_wrappers import COCO, COCOeval from terminaltables import AsciiTable from external.coco_panoptic import parse_pq_results, 
_print_panoptic_results @DATASETS.register_module() class CityscapesPanopticDataset(CocoDataset): CLASSES = ('person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle') def load_annotations(self, ann_file): """Load annotation from COCO style annotation file. Args: ann_file (str): Path of annotation file. Returns: list[dict]: Annotation info from COCO api. """ self.coco = COCO(ann_file['ins_ann']) self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES) self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} self.img_ids = sorted(self.coco.get_img_ids()) self.panoptic_anns = mmcv.load(ann_file['panoptic_ann']) self.stuff_ids = [ k['id'] for k in self.panoptic_anns['categories'] if k['isthing'] == 0 ] self.thing_ids = [ k['id'] for k in self.panoptic_anns['categories'] if k['isthing'] == 1 ] assert self.thing_ids == self.cat_ids self.seg2stuff_ids = { i + 1: stuff_id for i, stuff_id in enumerate(self.stuff_ids) } self.seg2stuff_ids.update({0: 0}) self.ins2thing_ids = { i: thing_id for i, thing_id in enumerate(self.thing_ids) } data_infos = [] total_ann_ids = [] for i in self.img_ids: info = self.coco.load_imgs([i])[0] info['filename'] = info['file_name'] data_infos.append(info) ann_ids = self.coco.get_ann_ids(img_ids=[i]) total_ann_ids.extend(ann_ids) assert len(set(total_ann_ids)) == len( total_ann_ids), f"Annotation ids in '{ann_file}' are not unique!" 
def _filter_imgs(self, min_size=32):
    """Select the images that are usable for training.

    An image is dropped when ``self.filter_empty_gt`` is set and it either
    has no annotation of a category in ``self.cat_ids`` or all of its
    annotations are flagged ``iscrowd``. Of the remaining images, only
    those whose shorter side is at least ``min_size`` are kept.

    As a side effect, ``self.img_ids`` is overwritten with the ids of the
    kept images.

    Args:
        min_size (int): Minimum allowed value of min(width, height).

    Returns:
        list[int]: Indices into ``self.data_infos`` of the kept images.
    """
    # Image ids referenced by at least one annotation.
    annotated_ids = {ann['image_id'] for ann in self.coco.anns.values()}

    # Image ids that contain at least one of the dataset's categories.
    category_ids = set()
    for class_id in self.cat_ids:
        category_ids.update(self.coco.cat_img_map[class_id])

    # Only images satisfying both conditions count as having ground truth.
    usable_ids = category_ids & annotated_ids

    kept_indices = []
    kept_img_ids = []
    for idx, info in enumerate(self.data_infos):
        img_id = info['id']
        annotations = self.coco.loadAnns(self.coco.getAnnIds(imgIds=[img_id]))
        # Collect every flag first so each annotation's 'iscrowd' is read.
        crowd_flags = [ann['iscrowd'] for ann in annotations]
        only_crowd = all(crowd_flags)

        if self.filter_empty_gt and (self.img_ids[idx] not in usable_ids
                                     or only_crowd):
            continue
        if min(info['width'], info['height']) < min_size:
            continue

        kept_indices.append(idx)
        kept_img_ids.append(img_id)

    self.img_ids = kept_img_ids
    return kept_indices
""" gt_bboxes = [] gt_labels = [] gt_bboxes_ignore = [] gt_masks_ann = [] for i, ann in enumerate(ann_info): if ann.get('ignore', False): continue x1, y1, w, h = ann['bbox'] if ann['area'] <= 0 or w < 1 or h < 1: continue if ann['category_id'] not in self.cat_ids: continue bbox = [x1, y1, x1 + w, y1 + h] if ann.get('iscrowd', False): gt_bboxes_ignore.append(bbox) else: gt_bboxes.append(bbox) gt_labels.append(self.cat2label[ann['category_id']]) gt_masks_ann.append(ann['segmentation']) if gt_bboxes: gt_bboxes = np.array(gt_bboxes, dtype=np.float32) gt_labels = np.array(gt_labels, dtype=np.int64) else: gt_bboxes = np.zeros((0, 4), dtype=np.float32) gt_labels = np.array([], dtype=np.int64) if gt_bboxes_ignore: gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) else: gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) ann = dict( bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann, seg_map=img_info['segm_file']) return ann def _panoptic2json(self, results, outfile_prefix): panoptic_json_results = [] mmcv.mkdir_or_exist(outfile_prefix) for idx in range(len(self)): img_id = self.img_ids[idx] panoptic = results[idx] png_string, segments_info = panoptic data = dict() # hack # To match the corresponding ids for panoptic segmentation prediction # for both cityscape vps and cityscapes if self.vps is not None: data['image_id'] = "_".join(self.data_infos[idx]['file_name'].split(".")[0].split("_")[:5]) else: data['image_id'] = self.data_infos[idx]['file_name'].split("/")[-1].split(".")[0][:-12] for segment_info in segments_info: isthing = segment_info.pop('isthing') cat_id = segment_info['category_id'] if isthing is True: segment_info['category_id'] = self.ins2thing_ids[cat_id] else: segment_info['category_id'] = self.seg2stuff_ids[cat_id] png_path = self.data_infos[idx]['file_name'].replace( '.jpg', '.png') # hack: to save all the images into one folder png_path = png_path.split("/")[-1] png_save_path = osp.join(outfile_prefix, 
def results2json(self, results, outfile_prefix):
    """Serialise test results to COCO-style json files.

    The layout of ``results[0]`` decides what is written:

    * ``list``: box detections, written to ``<prefix>.bbox.json``.
    * ``tuple`` of length 3: ``(det, seg, panoptic)`` per image; the
      panoptic part goes to ``<prefix>.panoptic.json`` and the remaining
      ``[det, seg]`` pairs are written like any other tuple.
    * other ``tuple``: written to ``<prefix>.bbox.json`` and
      ``<prefix>.segm.json``.
    * ``np.ndarray``: proposals, written to ``<prefix>.proposal.json``.

    Args:
        results (list[list | tuple | ndarray]): Testing results of the
            dataset.
        outfile_prefix (str): Path prefix shared by all output files.

    Returns:
        dict[str, str]: Output file paths keyed by ``bbox``, ``proposal``,
        ``segm`` and/or ``panoptic``.

    Raises:
        TypeError: If ``results[0]`` is none of the types above.
    """
    head = results[0]
    bbox_path = f'{outfile_prefix}.bbox.json'

    if isinstance(head, list):
        mmcv.dump(self._det2json(results), bbox_path)
        # Box detections double as proposals, hence the shared path.
        return dict(bbox=bbox_path, proposal=bbox_path)

    if isinstance(head, np.ndarray):
        proposal_path = f'{outfile_prefix}.proposal.json'
        mmcv.dump(self._proposal2json(results), proposal_path)
        return dict(proposal=proposal_path)

    if not isinstance(head, tuple):
        raise TypeError('invalid type of results')

    result_files = dict()
    if len(head) == 3:
        # Split every (det, seg, panoptic) triple and dump the panoptic part.
        triples = [results[idx] for idx in range(len(self))]
        instance_segm_results = [[det, seg] for det, seg, _ in triples]
        panoptic_results = [panoptic for _, _, panoptic in triples]
        panoptic_json = dict(
            annotations=self._panoptic2json(panoptic_results,
                                            outfile_prefix))
        result_files['panoptic'] = f'{outfile_prefix}.panoptic.json'
        mmcv.dump(panoptic_json, result_files['panoptic'])
    else:
        instance_segm_results = results

    json_results = self._segm2json(instance_segm_results)
    result_files['bbox'] = bbox_path
    result_files['proposal'] = bbox_path
    result_files['segm'] = f'{outfile_prefix}.segm.json'
    mmcv.dump(json_results[0], result_files['bbox'])
    mmcv.dump(json_results[1], result_files['segm'])
    return result_files
segms = mmcv.concat_list(segm_result[0]) mask_score = segm_result[1] else: # use bbox score for mask score segms = mmcv.concat_list(segm_result) mask_score = [bbox[-1] for bbox in bboxes] labels = [ np.full(bbox.shape[0], i, dtype=np.int32) for i, bbox in enumerate(bbox_result) ] labels = np.concatenate(labels) assert len(bboxes) == len(segms) == len(labels) num_instances = len(bboxes) prog_bar.update() with open(pred_txt, 'w') as fout: for i in range(num_instances): pred_class = labels[i] classes = self.CLASSES[pred_class] class_id = CSLabels.name2label[classes].id score = mask_score[i] mask = maskUtils.decode(segms[i]).astype(np.uint8) png_filename = osp.join(outfile_prefix, basename + f'_{i}_{classes}.png') mmcv.imwrite(mask, png_filename) fout.write(f'{osp.basename(png_filename)} {class_id} ' f'{score}\n') result_files.append(pred_txt) return result_files def format_results(self, results, jsonfile_prefix="./test", **kwargs): """Format the results to json (standard format for COCO evaluation). Args: results (list[tuple | numpy.ndarray]): Testing results of the dataset. jsonfile_prefix (str | None): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: tuple: (result_files, tmp_dir), result_files is a dict containing \ the json filepaths, tmp_dir is the temporal directory created \ for saving json files when jsonfile_prefix is not specified. """ assert isinstance(results, list), 'results must be a list' assert len(results) == len(self), ( 'The length of results is not equal to the dataset len: {} != {}'. 
def evaluate(self,
             results,
             metric='bbox',
             logger=None,
             outfile_prefix=None,
             classwise=False,
             proposal_nums=(100, 300, 1000),
             iou_thrs=np.arange(0.5, 0.96, 0.05),
             metric_items=None):
    """Evaluate results with the COCO, panoptic (PQ) or Cityscapes protocol.

    Args:
        results (list[list | tuple]): Testing results of the dataset.
        metric (str | list[str]): Metrics to evaluate. Accepted values are
            'bbox', 'segm', 'cityscapes' and 'panoptic'.
        logger (logging.Logger | str | None): Logger used for printing
            related information during evaluation. Default: None.
        outfile_prefix (str | None): Prefix of the files written while
            formatting the results, e.g. "a/b/prefix". It is forwarded
            unchanged to ``format_results`` and ``_evaluate_cityscapes``.
        classwise (bool): Whether to log a per-category AP table.
        proposal_nums (Sequence[int]): Values used as COCOeval ``maxDets``.
            Default: (100, 300, 1000).
        iou_thrs (Sequence[float] | None): IoU thresholds for COCOeval.
            ``None`` selects 0.50:0.05:0.95.
        metric_items (list[str] | str | None): Names of the COCO summary
            entries to report. ``None`` selects the default set of the
            metric being evaluated.

    Returns:
        OrderedDict: Evaluation results. Cityscapes entries, when
        requested, are reported together with the other metrics.

    Raises:
        KeyError: If a metric or metric item is not supported, or a
            requested metric has no result file.
    """
    metrics = metric.copy() if isinstance(metric, list) else [metric]
    # NOTE: the loop below also handles 'proposal' and 'proposal_fast', but
    # they are not accepted here, so those branches are currently unreachable.
    allowed_metrics = ['bbox', 'segm', 'cityscapes', 'panoptic']
    for name in metrics:
        if name not in allowed_metrics:
            raise KeyError(f'metric {name} is not supported')

    # A single container is used from start to finish so the Cityscapes
    # numbers are not lost when the remaining metrics are evaluated.
    eval_results = OrderedDict()
    if 'cityscapes' in metrics:
        eval_results.update(
            self._evaluate_cityscapes(results, outfile_prefix, logger))
        metrics.remove('cityscapes')

    if iou_thrs is None:
        iou_thrs = np.linspace(
            .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
    if metric_items is not None and not isinstance(metric_items, list):
        metric_items = [metric_items]

    result_files, tmp_dir = self.format_results(results, outfile_prefix)
    try:
        cocoGt = self.coco
        for metric in metrics:
            msg = f'Evaluating {metric}...'
            if logger is None:
                msg = '\n' + msg
            print_log(msg, logger=logger)

            if metric == 'proposal_fast':
                ar = self.fast_eval_recall(
                    results, proposal_nums, iou_thrs, logger='silent')
                log_msg = []
                for i, num in enumerate(proposal_nums):
                    eval_results[f'AR@{num}'] = ar[i]
                    log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}')
                log_msg = ''.join(log_msg)
                print_log(log_msg, logger=logger)
                continue

            if metric == 'panoptic':
                from panopticapi.evaluation import pq_compute
                panoptic_json = result_files['panoptic']
                # results2json names the file '<prefix>.panoptic.json' and
                # _panoptic2json stores the PNGs in the directory '<prefix>'.
                # Strip the exact suffix: cutting at the first '.' breaks
                # prefixes such as './work_dirs/run' or 'exp.v2/results'.
                json_suffix = '.panoptic.json'
                if panoptic_json.endswith(json_suffix):
                    pred_folder = panoptic_json[:-len(json_suffix)]
                else:
                    pred_folder = panoptic_json.split('.')[0]
                with contextlib.redirect_stdout(io.StringIO()):
                    pq_res = pq_compute(
                        self.ann_file['panoptic_ann'],
                        panoptic_json,
                        gt_folder=self.seg_prefix,
                        pred_folder=pred_folder)
                # Kept in its own name so the ``results`` argument stays
                # intact for any metric evaluated afterwards.
                pq_stats = parse_pq_results(pq_res)
                for k, v in pq_stats.items():
                    eval_results[f'{metric}_{k}'] = f'{float(v):0.3f}'
                print_log(
                    'Panoptic Evaluation Results:\n' +
                    _print_panoptic_results(pq_res),
                    logger=logger)
                continue

            iou_type = 'bbox' if metric == 'proposal' else metric
            if metric not in result_files:
                raise KeyError(f'{metric} is not in results')
            try:
                predictions = mmcv.load(result_files[metric])
                if iou_type == 'segm':
                    # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331  # noqa
                    # When evaluating mask AP, if the results contain bbox,
                    # cocoapi will use the box area instead of the mask area
                    # for calculating the instance area. Though the overall
                    # AP is not affected, this leads to different small,
                    # medium, and large mask AP results.
                    for x in predictions:
                        x.pop('bbox')
                cocoDt = cocoGt.loadRes(predictions)
            except IndexError:
                print_log(
                    'The testing results of the whole dataset is empty.',
                    logger=logger,
                    level=logging.ERROR)
                break

            cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
            cocoEval.params.catIds = self.cat_ids
            cocoEval.params.imgIds = self.img_ids
            cocoEval.params.maxDets = list(proposal_nums)
            cocoEval.params.iouThrs = iou_thrs
            # mapping of cocoEval.stats
            coco_metric_names = {
                'mAP': 0,
                'mAP_50': 1,
                'mAP_75': 2,
                'mAP_s': 3,
                'mAP_m': 4,
                'mAP_l': 5,
                'AR@100': 6,
                'AR@300': 7,
                'AR@1000': 8,
                'AR_s@1000': 9,
                'AR_m@1000': 10,
                'AR_l@1000': 11
            }
            if metric_items is not None:
                for metric_item in metric_items:
                    if metric_item not in coco_metric_names:
                        raise KeyError(
                            f'metric item {metric_item} is not supported')

            if metric == 'proposal':
                cocoEval.params.useCats = 0
                cocoEval.evaluate()
                cocoEval.accumulate()
                cocoEval.summarize()
                # Defaults are resolved per metric so that one metric's
                # default list never leaks into the next one.
                items = metric_items
                if items is None:
                    items = [
                        'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000',
                        'AR_m@1000', 'AR_l@1000'
                    ]
                for item in items:
                    val = float(
                        f'{cocoEval.stats[coco_metric_names[item]]:.3f}')
                    eval_results[item] = val
            else:
                cocoEval.evaluate()
                cocoEval.accumulate()
                cocoEval.summarize()
                if classwise:  # Compute per-category AP
                    # from https://github.com/facebookresearch/detectron2/
                    precisions = cocoEval.eval['precision']
                    # precision: (iou, recall, cls, area range, max dets)
                    assert len(self.cat_ids) == precisions.shape[2]

                    results_per_category = []
                    for idx, catId in enumerate(self.cat_ids):
                        # area range index 0: all area ranges
                        # max dets index -1: typically 100 per image
                        nm = self.coco.loadCats(catId)[0]
                        precision = precisions[:, :, idx, 0, -1]
                        precision = precision[precision > -1]
                        if precision.size:
                            ap = np.mean(precision)
                        else:
                            ap = float('nan')
                        results_per_category.append(
                            (f'{nm["name"]}', f'{float(ap):0.3f}'))

                    num_columns = min(6, len(results_per_category) * 2)
                    results_flatten = list(
                        itertools.chain(*results_per_category))
                    headers = ['category', 'AP'] * (num_columns // 2)
                    results_2d = itertools.zip_longest(*[
                        results_flatten[i::num_columns]
                        for i in range(num_columns)
                    ])
                    table_data = [headers]
                    table_data += [result for result in results_2d]
                    table = AsciiTable(table_data)
                    print_log('\n' + table.table, logger=logger)

                items = metric_items
                if items is None:
                    items = [
                        'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
                    ]
                for metric_item in items:
                    key = f'{metric}_{metric_item}'
                    val = float(
                        f'{cocoEval.stats[coco_metric_names[metric_item]]:.3f}'
                    )
                    eval_results[key] = val
                ap = cocoEval.stats[:6]
                eval_results[f'{metric}_mAP_copypaste'] = (
                    f'{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
                    f'{ap[4]:.3f} {ap[5]:.3f}')
    finally:
        # Remove the temporary directory even when evaluation fails.
        if tmp_dir is not None:
            tmp_dir.cleanup()
    return eval_results
""" try: import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval # noqa except ImportError: raise ImportError('Please run "pip install citscapesscripts" to ' 'install cityscapesscripts first.') msg = 'Evaluating in Cityscapes style' if logger is None: msg = '\n' + msg print_log(msg, logger=logger) result_files, tmp_dir = self.format_results(results, txtfile_prefix) if tmp_dir is None: result_dir = osp.join(txtfile_prefix, 'results') else: result_dir = osp.join(tmp_dir.name, 'results') eval_results = OrderedDict() print_log(f'Evaluating results under {result_dir} ...', logger=logger) # set global states in cityscapes evaluation API CSEval.args.cityscapesPath = os.path.join(self.img_prefix, '../..') CSEval.args.predictionPath = os.path.abspath(result_dir) CSEval.args.predictionWalk = None CSEval.args.JSONOutput = False CSEval.args.colorized = False CSEval.args.gtInstancesFile = os.path.join(result_dir, 'gtInstances.json') CSEval.args.groundTruthSearch = os.path.join( self.img_prefix.replace('leftImg8bit', 'gtFine'), '*/*_gtFine_instanceIds.png') groundTruthImgList = glob.glob(CSEval.args.groundTruthSearch) assert len(groundTruthImgList), 'Cannot find ground truth images' \ f' in {CSEval.args.groundTruthSearch}.' 
def __init__(
        self,
        pipeline=None,
        data_root=None,
        test_mode=False,
        split='train',
):
    """Index every frame of one split below ``data_root``.

    Args:
        pipeline (list | None): Transform configs handed to ``Compose``;
            ``None`` is treated as an empty list.
        data_root (str): Directory that must contain ``license.txt``,
            ``leftImg8bit`` and ``gtFine``.
        test_mode (bool): Stored as ``self.test_mode``; ``__getitem__``
            uses it to choose between the test and train loaders.
        split (str): Sub-folder of ``leftImg8bit`` / ``gtFine`` to read.
    """
    # Make sure we are really looking at a cityscapes root folder.
    layout_checks = (
        ('license.txt',
         "It seems that '{}' is not the root folder of cityscapes".format(data_root)),
        ('leftImg8bit', "leftImg8bit cannot be found."),
        ('gtFine', "gtFine cannot be found."),
    )
    for entry, complaint in layout_checks:
        assert os.path.exists(os.path.join(data_root, entry)), complaint

    transforms = [] if pipeline is None else pipeline

    self.img_dir = os.path.join(data_root, 'leftImg8bit', split)
    self.gt_dir = os.path.join(data_root, 'gtFine', split)

    # File names are split on '_' into exactly four fields; the first must
    # equal the folder name, the next two are parsed as integers.
    frames = []
    for city in os.listdir(self.img_dir):
        for file_name in os.listdir(os.path.join(self.img_dir, city)):
            location, seq_id, img_id, _ = file_name.split('_')
            assert location == city
            frames.append((location, int(seq_id), int(img_id)))
    frames.sort()
    self.samples = frames

    self.pipeline = Compose(transforms)
    # Separate loader used by evaluate() to read ground truth maps.
    self.load_ann_pipeline = Compose([
        dict(
            type='LoadAnnotationsInstanceMasks',
            with_mask=False,
            with_seg=True,
            with_inst=True,
        ),
    ])

    self.test_mode = test_mode
    self.flag = self._set_groups()

    # Constants used during evaluation.
    self.max_ins = 1000
    self.no_obj_id = 255
def __getitem__(self, idx):
    """Return the pipeline output for sample ``idx``.

    Args:
        idx (int): Index of data.

    Returns:
        The result of ``prepare_test_img`` when ``test_mode`` is set.
        Otherwise the result of ``prepare_train_img``; whenever that is
        ``None`` a replacement index is drawn with ``_rand_another`` and
        loading is retried until a sample is produced.
    """
    if self.test_mode:
        return self.prepare_test_img(idx)

    sample = self.prepare_train_img(idx)
    while sample is None:
        idx = self._rand_another(idx)
        sample = self.prepare_train_img(idx)
    return sample
def vpq_eval(element):
    """Accumulate per-class panoptic-quality statistics for one prediction.

    Every pixel of both maps is decoded as ``category * 1000 + instance``.
    A ground-truth segment and a predicted segment of the same category
    match when their IoU exceeds 0.5; prediction pixels that fall on the
    ground-truth id ``255 * 1000`` are left out of the union.

    Args:
        element (sequence): ``(pred_ids, gt_ids)``, two integer arrays of
            identical shape. Predicted ids must stay below ``256 * 256``
            because a (gt, pred) pair is packed as ``gt * 65536 + pred``.

    Returns:
        tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        ``(iou_per_class, tp_per_class, fn_per_class, fp_per_class)``,
        each a float64 vector with 20 entries indexed by category.
    """
    pred_ids, gt_ids = element
    max_ins = 1000
    ign_id = 255
    offset = 256 * 256
    num_cat = 19 + 1

    iou_per_class = np.zeros(num_cat, dtype=np.float64)
    tp_per_class = np.zeros(num_cat, dtype=np.float64)
    fn_per_class = np.zeros(num_cat, dtype=np.float64)
    fp_per_class = np.zeros(num_cat, dtype=np.float64)

    def _ids_to_counts(id_array):
        # Plain Python ints as keys and values; no `six` shim required.
        ids, counts = np.unique(id_array, return_counts=True)
        return dict(zip(ids.tolist(), counts.tolist()))

    pred_areas = _ids_to_counts(pred_ids)
    gt_areas = _ids_to_counts(gt_ids)

    void_id = ign_id * max_ins
    # Every ground-truth id whose category is the ignore label.
    ign_ids = {gt_id for gt_id in gt_areas if gt_id // max_ins == ign_id}

    # One integer per pixel identifying its (gt segment, pred segment) pair.
    int_ids = gt_ids.astype(np.int64) * offset + pred_ids.astype(np.int64)
    int_areas = _ids_to_counts(int_ids)

    def prediction_void_overlap(pred_id):
        return int_areas.get(void_id * offset + pred_id, 0)

    def prediction_ignored_overlap(pred_id):
        return sum(
            int_areas.get(ignored * offset + pred_id, 0)
            for ignored in ign_ids)

    gt_matched = set()
    pred_matched = set()

    for int_id, int_area in int_areas.items():
        gt_id = int_id // offset
        gt_cat = gt_id // max_ins
        pred_id = int_id % offset
        pred_cat = pred_id // max_ins
        if gt_cat != pred_cat:
            continue
        union = (
            gt_areas[gt_id] + pred_areas[pred_id] - int_area -
            prediction_void_overlap(pred_id)
        )
        iou = int_area / union
        if iou > 0.5:
            tp_per_class[gt_cat] += 1
            iou_per_class[gt_cat] += iou
            gt_matched.add(gt_id)
            pred_matched.add(pred_id)

    # Unmatched ground-truth segments are false negatives, except ignored ones.
    for gt_id in gt_areas:
        if gt_id in gt_matched:
            continue
        cat_id = gt_id // max_ins
        if cat_id == ign_id:
            continue
        fn_per_class[cat_id] += 1

    # Unmatched predictions are false positives unless more than half of
    # their pixels lie on ignored ground truth.
    for pred_id, pred_area in pred_areas.items():
        if pred_id in pred_matched:
            continue
        if prediction_ignored_overlap(pred_id) / pred_area > 0.5:
            continue
        fp_per_class[pred_id // max_ins] += 1

    return iou_per_class, tp_per_class, fn_per_class, fp_per_class
dict(type='LoadImageFromFile'), dict(type='LoadAnnotationsInstanceMasks', cherry=[11, 13]), dict(type='KNetInsAdapterCherryPick', stuff_nums=11, cherry=[11, 13]), dict(type='Resize', img_scale=(1024, 2048), ratio_range=[0.5, 2.0], keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='RandomCrop', crop_size=(1024, 2048)), dict(type='Normalize', **img_norm_cfg), dict(type='PadFutureMMDet', size_divisor=32, pad_val=dict(img=0, masks=0, seg=255)), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_masks', 'gt_labels', 'gt_semantic_seg'], meta_keys=('ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg') ), ] data = CityscapesSTEP( pipeline=train_pipelines, data_root='data/cityscapes', split='train', test_mode=False ) for item in data: print(item) ================================================ FILE: external/cityscapes_vps.py ================================================ import contextlib import io import itertools import os import glob import tempfile import logging import os.path as osp from collections import OrderedDict import pycocotools.mask as maskUtils import mmcv import numpy as np from mmcv.utils import print_log from mmdet.datasets.builder import DATASETS from mmdet.datasets.coco import CocoDataset from mmdet.datasets.api_wrappers import COCO, COCOeval from terminaltables import AsciiTable from external.coco_panoptic import parse_pq_results, _print_panoptic_results @DATASETS.register_module() class CityscapesVPSDataset(CocoDataset): def __init__(self, ann_file, pipeline, data_root=None, img_prefix=None, seg_prefix=None, proposal_file=None, test_mode=False, offsets=None, ref_prefix=None, nframes_span_test=6): super(CityscapesVPSDataset, self).__init__( ann_file=ann_file, pipeline=pipeline, data_root=data_root, img_prefix=img_prefix, seg_prefix=seg_prefix, proposal_file=proposal_file, test_mode=test_mode) # Hack: we use ref_img_infos to load reference images. 
def load_ref_annotations(self, ann_file):
    """Load the image records used to look up reference frames.

    Besides returning the records, this stores the COCO handle, its
    category ids, a category-to-label mapping that starts at 1, and its
    image ids on ``self`` (``ref_coco``, ``ref_cat_ids``,
    ``ref_cat2label``, ``ref_img_ids``).

    Args:
        ann_file (dict): Mapping whose ``'ins_ann'`` entry is passed to
            ``COCO``.

    Returns:
        list[dict]: One image record per reference image id, each given
        an extra ``filename`` key that mirrors ``file_name``.
    """
    ref_api = COCO(ann_file['ins_ann'])
    self.ref_coco = ref_api
    self.ref_cat_ids = ref_api.getCatIds()
    self.ref_cat2label = {
        cat_id: label
        for label, cat_id in enumerate(self.ref_cat_ids, start=1)
    }
    self.ref_img_ids = ref_api.getImgIds()

    def _with_filename(image_id):
        record = ref_api.loadImgs([image_id])[0]
        record['filename'] = record['file_name']
        return record

    return [_with_filename(image_id) for image_id in self.ref_img_ids]
def prepare_train_img(self, idx):
    """Build a (key frame, reference frame) pair and run the pipeline.

    A reference frame is looked for at ``img_id + offset`` for offsets
    drawn at random from ``self.offsets``. An offset is accepted when the
    resulting id is in ``self.img_ids`` and
    ``check_whether_has_correspondence`` approves the pair.

    Args:
        idx (int): Index of data.

    Returns:
        The pipeline output for the two-element list of frame dicts, or
        ``None`` when no offset yields a usable reference frame.
    """
    img_info = self.data_infos[idx]
    key_frame = dict(img_info=img_info, ann_info=self.get_ann_info(idx))
    iid = img_info['id']

    # Work on a copy so rejected offsets can be discarded one by one.
    candidates = list(self.offsets)
    while True:
        m = np.random.choice(candidates)
        ref_iid = iid + m
        if (ref_iid in self.img_ids
                and self.check_whether_has_correspondence(ref_iid, iid)):
            break
        candidates.remove(m)
        # If all offset values fail, return None.
        if not candidates:
            return None

    # Reference image: information, annotations
    ref_img_info = self.iid2_img_infos[ref_iid]
    ref_frame = dict(
        img_info=ref_img_info,
        ann_info=self.get_ref_ann_info_by_iid(ref_iid, ref_img_info))

    if self.proposals is not None:
        # ``results`` is a list of per-frame dicts, so it cannot be indexed
        # with 'proposals'. Proposals are stored per dataset index, which
        # identifies the key frame, so they are attached to that frame.
        key_frame['proposals'] = self.proposals[idx]

    results = [key_frame, ref_frame]
    self.pre_pipeline(results)
    return self.pipeline(results)
""" img_info = self.data_infos[idx] prev_img_info = self.data_infos[idx - 1] if idx % (self.nframes_span_test) > 0 else img_info img_info['ref_id'] = prev_img_info['id'] - 1 img_info['ref_filename'] = prev_img_info['file_name'] results = dict(img_info=img_info) if self.proposals is not None: results['proposals'] = self.proposals[idx] self.pre_test_pipeline(results) return self.pipeline(results) def pre_pipeline(self, results): """Prepare results dict for pipeline.""" for result in results: result['img_prefix'] = self.img_prefix result['seg_prefix'] = self.seg_prefix result['proposal_file'] = self.proposal_file result['bbox_fields'] = [] result['mask_fields'] = [] result['seg_fields'] = [] seg_filename = result['ann_info']['seg_map'].replace('leftImg8bit', 'gtFine_color').\ replace('newImg8bit', 'final_mask') result['ann_info']['seg_map'] = seg_filename def pre_test_pipeline(self, results): results['img_prefix'] = self.img_prefix results['seg_prefix'] = self.seg_prefix results['ref_prefix'] = self.ref_prefix results['proposal_file'] = self.proposal_file results['bbox_fields'] = [] results['mask_fields'] = [] results['ref_bbox_fields'] = [] results['ref_mask_fields'] = [] def _parse_ann_info(self, img_info, ann_info): """Parse bbox and mask annotation. Args: img_info (dict): Image info of an image. ann_info (list[dict]): Annotation info of an image. Returns: dict: A dict containing the following keys: bboxes, \ bboxes_ignore, labels, masks, seg_map. \ "masks" are already decoded into binary masks. 
""" gt_bboxes = [] gt_labels = [] gt_bboxes_ignore = [] gt_masks_ann = [] gt_obj_ids = [] for i, ann in enumerate(ann_info): if ann.get('ignore', False): continue x1, y1, w, h = ann['bbox'] if ann['area'] <= 0 or w < 1 or h < 1: continue if ann['category_id'] not in self.cat_ids: continue bbox = [x1, y1, x1 + w, y1 + h] if ann.get('iscrowd', False): gt_bboxes_ignore.append(bbox) else: gt_bboxes.append(bbox) gt_labels.append(self.cat2label[ann['category_id']]) gt_masks_ann.append(ann['segmentation']) gt_obj_ids.append(ann['inst_id']) if gt_bboxes: gt_bboxes = np.array(gt_bboxes, dtype=np.float32) gt_labels = np.array(gt_labels, dtype=np.int64) gt_obj_ids = np.array(gt_obj_ids, dtype=np.int64) else: gt_bboxes = np.zeros((0, 4), dtype=np.float32) gt_labels = np.array([], dtype=np.int64) gt_obj_ids = np.array([], dtype=np.int64) if gt_bboxes_ignore: gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) else: gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) seg_map = img_info['filename'].replace('jpg', 'png') ann = dict( bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann, instance_ids=gt_obj_ids, seg_map=seg_map) return ann def get_ref_ann_info_by_iid(self, img_id, ref_img_info): ann_ids = self.ref_coco.getAnnIds(imgIds=[img_id]) ann_info = self.ref_coco.loadAnns(ann_ids) return self._parse_ann_info(ref_img_info, ann_info) def _panoptic2json(self, results, outfile_prefix): panoptic_json_results = [] mmcv.mkdir_or_exist(outfile_prefix) for idx in range(len(self)): img_id = self.img_ids[idx] panoptic = results[idx] png_string, segments_info = panoptic data = dict() # hack # To match the corresponding ids for panoptic segmentation prediction data['image_id'] = self.data_infos[idx]['file_name'].split("/")[-1].split(".")[0][:-12] for segment_info in segments_info: isthing = segment_info.pop('isthing') cat_id = segment_info['category_id'] if isthing is True: segment_info['category_id'] = self.ins2thing_ids[cat_id] else: 
def results2json(self, results, outfile_prefix):
    """Serialize test results to COCO-style json files.

    The kind of result is recognised from the type of ``results[0]``:

    * ``list``: bbox detections, written to ``<prefix>.bbox.json``.
    * ``tuple``: ``(det, segm)`` pairs, written to ``<prefix>.bbox.json``
      and ``<prefix>.segm.json``; when the tuples have three elements the
      third one is treated as the panoptic prediction and additionally
      written to ``<prefix>.panoptic.json``.
    * ``numpy.ndarray``: proposals, written to ``<prefix>.proposal.json``.

    Args:
        results (list[list | tuple | ndarray]): Testing results of the
            dataset.
        outfile_prefix (str): Filename prefix of the json files, e.g.
            "somepath/xxx".

    Returns:
        dict[str, str]: Maps "bbox", "proposal", "segm" and/or "panoptic"
        to the path of the file that was written.

    Raises:
        TypeError: If ``results[0]`` is none of the supported types.
    """
    first = results[0]
    bbox_path = f'{outfile_prefix}.bbox.json'

    if isinstance(first, list):
        det_json = self._det2json(results)
        mmcv.dump(det_json, bbox_path)
        # Detections double as proposals, hence the shared file.
        return {'bbox': bbox_path, 'proposal': bbox_path}

    if isinstance(first, np.ndarray):
        proposal_json = self._proposal2json(results)
        proposal_path = f'{outfile_prefix}.proposal.json'
        mmcv.dump(proposal_json, proposal_path)
        return {'proposal': proposal_path}

    if not isinstance(first, tuple):
        raise TypeError('invalid type of results')

    written = {}
    if len(first) == 3:
        # Split (det, segm, panoptic) triples into the instance part and
        # the panoptic part, then dump the panoptic annotations.
        det_segm_pairs = []
        panoptic_preds = []
        for sample_idx in range(len(self)):
            det, seg, panoptic = results[sample_idx]
            det_segm_pairs.append([det, seg])
            panoptic_preds.append(panoptic)
        annotations = self._panoptic2json(panoptic_preds, outfile_prefix)
        written['panoptic'] = f'{outfile_prefix}.panoptic.json'
        mmcv.dump({'annotations': annotations}, written['panoptic'])
    else:
        det_segm_pairs = results

    segm_json = self._segm2json(det_segm_pairs)
    written['bbox'] = bbox_path
    written['proposal'] = bbox_path
    written['segm'] = f'{outfile_prefix}.segm.json'
    mmcv.dump(segm_json[0], written['bbox'])
    mmcv.dump(segm_json[1], written['segm'])
    return written
""" try: import cityscapesscripts.helpers.labels as CSLabels except ImportError: raise ImportError('Please run "pip install citscapesscripts" to ' 'install cityscapesscripts first.') result_files = [] os.makedirs(outfile_prefix, exist_ok=True) prog_bar = mmcv.ProgressBar(len(self)) for idx in range(len(self)): result = results[idx] filename = self.data_infos[idx]['filename'] basename = osp.splitext(osp.basename(filename))[0] pred_txt = osp.join(outfile_prefix, basename + '_pred.txt') bbox_result, segm_result = result bboxes = np.vstack(bbox_result) # segm results if isinstance(segm_result, tuple): # Some detectors use different scores for bbox and mask, # like Mask Scoring R-CNN. Score of segm will be used instead # of bbox score. segms = mmcv.concat_list(segm_result[0]) mask_score = segm_result[1] else: # use bbox score for mask score segms = mmcv.concat_list(segm_result) mask_score = [bbox[-1] for bbox in bboxes] labels = [ np.full(bbox.shape[0], i, dtype=np.int32) for i, bbox in enumerate(bbox_result) ] labels = np.concatenate(labels) assert len(bboxes) == len(segms) == len(labels) num_instances = len(bboxes) prog_bar.update() with open(pred_txt, 'w') as fout: for i in range(num_instances): pred_class = labels[i] classes = self.CLASSES[pred_class] class_id = CSLabels.name2label[classes].id score = mask_score[i] mask = maskUtils.decode(segms[i]).astype(np.uint8) png_filename = osp.join(outfile_prefix, basename + f'_{i}_{classes}.png') mmcv.imwrite(mask, png_filename) fout.write(f'{osp.basename(png_filename)} {class_id} ' f'{score}\n') result_files.append(pred_txt) return result_files def format_results(self, results, jsonfile_prefix=None, **kwargs): """Format the results to json (standard format for COCO evaluation). Args: results (list[tuple | numpy.ndarray]): Testing results of the dataset. jsonfile_prefix (str | None): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". 
def evaluate(self,
             results,
             metric='bbox',
             logger=None,
             outfile_prefix=None,
             classwise=False,
             proposal_nums=(100, 300, 1000),
             iou_thrs=np.arange(0.5, 0.96, 0.05),
             metric_items=None):
    """Evaluate results in the Cityscapes, panoptic and/or COCO protocol.

    Args:
        results (list[list | tuple]): Testing results of the dataset.
        metric (str | list[str]): Metrics to be evaluated. Options are
            'bbox', 'segm', 'cityscapes' and 'panoptic'.
        logger (logging.Logger | str | None): Logger used for printing
            related information during evaluation. Default: None.
        outfile_prefix (str | None): The prefix of output files, e.g.
            "a/b/prefix". For the COCO protocol the json files are named
            "a/b/prefix.bbox.json", "a/b/prefix.segm.json" and
            "a/b/prefix.panoptic.json"; it is also forwarded unchanged to
            the Cityscapes evaluation. If not specified, a temporary
            directory is created. Default: None.
        classwise (bool): Whether to also print the AP of each class.
        proposal_nums (Sequence[int]): Used as ``maxDets`` of the COCO
            evaluator. Default: (100, 300, 1000).
        iou_thrs (Sequence[float] | None): IoU thresholds of the COCO
            evaluator. ``None`` selects 0.50:0.05:0.95.
        metric_items (list[str] | str | None): Entries of the COCO
            summary to report for 'bbox'/'segm'. ``None`` selects
            ``['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l']``.

    Returns:
        OrderedDict: Cityscapes results ('mAP', 'AP@50') when requested,
        followed by '<metric>_<item>' entries for every other metric.

    Raises:
        KeyError: If a metric or metric item is not supported, or the
            results do not contain the data a metric needs.
    """
    metrics = list(metric) if isinstance(metric, list) else [metric]
    allowed_metrics = ['bbox', 'segm', 'cityscapes', 'panoptic']
    for name in metrics:
        if name not in allowed_metrics:
            raise KeyError(f'metric {name} is not supported')

    # A single dict collects everything; the Cityscapes numbers used to be
    # lost because the dict was re-created after they had been stored.
    eval_results = OrderedDict()
    if 'cityscapes' in metrics:
        eval_results.update(
            self._evaluate_cityscapes(results, outfile_prefix, logger))
        metrics = [name for name in metrics if name != 'cityscapes']

    if iou_thrs is None:
        iou_thrs = np.linspace(
            .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
    if metric_items is not None and not isinstance(metric_items, list):
        metric_items = [metric_items]

    # mapping of cocoEval.stats
    coco_metric_names = {
        'mAP': 0,
        'mAP_50': 1,
        'mAP_75': 2,
        'mAP_s': 3,
        'mAP_m': 4,
        'mAP_l': 5,
        'AR@100': 6,
        'AR@300': 7,
        'AR@1000': 8,
        'AR_s@1000': 9,
        'AR_m@1000': 10,
        'AR_l@1000': 11
    }
    panoptic_suffix = '.panoptic.json'

    result_files, tmp_dir = self.format_results(results, outfile_prefix)
    try:
        cocoGt = self.coco
        for metric in metrics:
            msg = f'Evaluating {metric}...'
            if logger is None:
                msg = '\n' + msg
            print_log(msg, logger=logger)

            if metric not in result_files:
                raise KeyError(f'{metric} is not in results')

            if metric == 'panoptic':
                from panopticapi.evaluation import pq_compute
                panoptic_json = result_files['panoptic']
                # The PNGs live in the folder named like the prefix. Strip
                # the known suffix instead of cutting at the first dot,
                # which breaks for prefixes such as "./work_dirs/run".
                if panoptic_json.endswith(panoptic_suffix):
                    pred_folder = panoptic_json[:-len(panoptic_suffix)]
                else:
                    pred_folder = panoptic_json.split('.')[0]
                with contextlib.redirect_stdout(io.StringIO()):
                    pq_res = pq_compute(
                        self.ann_file['panoptic_ann'],
                        panoptic_json,
                        gt_folder=self.seg_prefix,
                        pred_folder=pred_folder)
                # Separate name so the ``results`` argument is not shadowed.
                pq_results = parse_pq_results(pq_res)
                for k, v in pq_results.items():
                    eval_results[f'{metric}_{k}'] = f'{float(v):0.3f}'
                print_log(
                    'Panoptic Evaluation Results:\n' +
                    _print_panoptic_results(pq_res),
                    logger=logger)
                continue

            try:
                predictions = mmcv.load(result_files[metric])
                if metric == 'segm':
                    # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331  # noqa
                    # When evaluating mask AP, if the results contain bbox,
                    # cocoapi will use the box area instead of the mask
                    # area for calculating the instance area. Though the
                    # overall AP is not affected, this leads to different
                    # small, medium, and large mask AP results.
                    for x in predictions:
                        x.pop('bbox')
                cocoDt = cocoGt.loadRes(predictions)
            except IndexError:
                print_log(
                    'The testing results of the whole dataset is empty.',
                    logger=logger,
                    level=logging.ERROR)
                break

            cocoEval = COCOeval(cocoGt, cocoDt, metric)
            cocoEval.params.catIds = self.cat_ids
            cocoEval.params.imgIds = self.img_ids
            cocoEval.params.maxDets = list(proposal_nums)
            cocoEval.params.iouThrs = iou_thrs

            if metric_items is not None:
                for metric_item in metric_items:
                    if metric_item not in coco_metric_names:
                        raise KeyError(
                            f'metric item {metric_item} is not supported')

            cocoEval.evaluate()
            cocoEval.accumulate()
            cocoEval.summarize()

            if classwise:
                # Compute per-category AP
                # from https://github.com/facebookresearch/detectron2/
                precisions = cocoEval.eval['precision']
                # precision: (iou, recall, cls, area range, max dets)
                assert len(self.cat_ids) == precisions.shape[2]

                results_per_category = []
                for idx, catId in enumerate(self.cat_ids):
                    # area range index 0: all area ranges
                    # max dets index -1: typically 100 per image
                    nm = self.coco.loadCats(catId)[0]
                    precision = precisions[:, :, idx, 0, -1]
                    precision = precision[precision > -1]
                    if precision.size:
                        ap = np.mean(precision)
                    else:
                        ap = float('nan')
                    results_per_category.append(
                        (f'{nm["name"]}', f'{float(ap):0.3f}'))

                num_columns = min(6, len(results_per_category) * 2)
                results_flatten = list(
                    itertools.chain(*results_per_category))
                headers = ['category', 'AP'] * (num_columns // 2)
                results_2d = itertools.zip_longest(*[
                    results_flatten[i::num_columns]
                    for i in range(num_columns)
                ])
                table_data = [headers]
                table_data += [result for result in results_2d]
                table = AsciiTable(table_data)
                print_log('\n' + table.table, logger=logger)

            if metric_items is None:
                report_items = [
                    'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
                ]
            else:
                report_items = metric_items

            for metric_item in report_items:
                key = f'{metric}_{metric_item}'
                val = float(
                    f'{cocoEval.stats[coco_metric_names[metric_item]]:.3f}')
                eval_results[key] = val
            ap = cocoEval.stats[:6]
            eval_results[f'{metric}_mAP_copypaste'] = (
                f'{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
                f'{ap[4]:.3f} {ap[5]:.3f}')
    finally:
        # Remove the temporary json files even if an evaluation step failed.
        if tmp_dir is not None:
            tmp_dir.cleanup()
    return eval_results
""" try: import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval # noqa except ImportError: raise ImportError('Please run "pip install citscapesscripts" to ' 'install cityscapesscripts first.') msg = 'Evaluating in Cityscapes style' if logger is None: msg = '\n' + msg print_log(msg, logger=logger) result_files, tmp_dir = self.format_results(results, txtfile_prefix) if tmp_dir is None: result_dir = osp.join(txtfile_prefix, 'results') else: result_dir = osp.join(tmp_dir.name, 'results') eval_results = OrderedDict() print_log(f'Evaluating results under {result_dir} ...', logger=logger) # set global states in cityscapes evaluation API CSEval.args.cityscapesPath = os.path.join(self.img_prefix, '../..') CSEval.args.predictionPath = os.path.abspath(result_dir) CSEval.args.predictionWalk = None CSEval.args.JSONOutput = False CSEval.args.colorized = False CSEval.args.gtInstancesFile = os.path.join(result_dir, 'gtInstances.json') CSEval.args.groundTruthSearch = os.path.join( self.img_prefix.replace('leftImg8bit', 'gtFine'), '*/*_gtFine_instanceIds.png') groundTruthImgList = glob.glob(CSEval.args.groundTruthSearch) assert len(groundTruthImgList), 'Cannot find ground truth images' \ f' in {CSEval.args.groundTruthSearch}.' 
predictionImgList = [] for gt in groundTruthImgList: predictionImgList.append(CSEval.getPrediction(gt, CSEval.args)) CSEval_results = CSEval.evaluateImgLists(predictionImgList, groundTruthImgList, CSEval.args)['averages'] eval_results['mAP'] = CSEval_results['allAp'] eval_results['AP@50'] = CSEval_results['allAp50%'] if tmp_dir is not None: tmp_dir.cleanup() return eval_results ================================================ FILE: external/coco_panoptic.py ================================================ import contextlib import io import itertools import logging import tempfile import os.path as osp from collections import OrderedDict import mmcv import numpy as np from mmcv.utils import print_log from mmdet.datasets.builder import DATASETS from mmdet.datasets.coco import CocoDataset from mmdet.datasets.api_wrappers import COCO, COCOeval from terminaltables import AsciiTable @DATASETS.register_module() class CocoPanopticDatasetCustom(CocoDataset): def load_annotations(self, ann_file): """Load annotation from COCO style annotation file. Args: ann_file (str): Path of annotation file. Returns: list[dict]: Annotation info from COCO api. 
""" self.coco = COCO(ann_file['ins_ann']) self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES) self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} self.img_ids = sorted(self.coco.get_img_ids()) self.panoptic_anns = mmcv.load(ann_file['panoptic_ann']) self.stuff_ids = [ k['id'] for k in self.panoptic_anns['categories'] if k['isthing'] == 0 ] self.thing_ids = [ k['id'] for k in self.panoptic_anns['categories'] if k['isthing'] == 1 ] assert self.thing_ids == self.cat_ids self.seg2stuff_ids = { i + 1: stuff_id for i, stuff_id in enumerate(self.stuff_ids) } self.seg2stuff_ids.update({0: 0}) self.ins2thing_ids = { i: thing_id for i, thing_id in enumerate(self.thing_ids) } data_infos = [] total_ann_ids = [] for i in self.img_ids: info = self.coco.load_imgs([i])[0] info['filename'] = info['file_name'] data_infos.append(info) ann_ids = self.coco.get_ann_ids(img_ids=[i]) total_ann_ids.extend(ann_ids) assert len(set(total_ann_ids)) == len( total_ann_ids), f"Annotation ids in '{ann_file}' are not unique!" return data_infos def get_ann_info(self, idx): """Get COCO annotation by index. Args: idx (int): Index of data. Returns: dict: Annotation info of specified index. """ img_id = self.data_infos[idx]['id'] ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) ann_info = self.coco.load_anns(ann_ids) return self._parse_ann_info(self.data_infos[idx], ann_info) def get_cat_ids(self, idx): """Get COCO category ids by index. Args: idx (int): Index of data. Returns: list[int]: All categories in the image of specified index. """ img_id = self.data_infos[idx]['id'] ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) ann_info = self.coco.load_anns(ann_ids) return [ann['category_id'] for ann in ann_info] def _parse_ann_info(self, img_info, ann_info): """Parse bbox and mask annotation. Args: ann_info (list[dict]): Annotation info of an image. with_mask (bool): Whether to parse mask annotations. 
def _panoptic2json(self, results, outfile_prefix):
    """Write panoptic PNGs and build the matching annotation records.

    For every sample the predicted PNG is stored under ``outfile_prefix``
    (created if missing) and the contiguous category indices in its
    segment infos are mapped back to dataset category ids.

    Args:
        results (list[tuple]): One ``(png_string, segments_info)`` pair per
            sample. The dicts in ``segments_info`` are modified in place:
            their 'isthing' key is removed and 'category_id' is remapped.
        outfile_prefix (str): Folder that receives the PNG files.

    Returns:
        list[dict]: One record per sample with the keys 'image_id',
        'file_name' and 'segments_info'.
    """
    mmcv.mkdir_or_exist(outfile_prefix)

    records = []
    for sample_idx in range(len(self)):
        image_id = self.img_ids[sample_idx]
        png_bytes, segments = results[sample_idx]

        for segment in segments:
            # Only a literal ``True`` selects the thing mapping.
            if segment.pop('isthing') is True:
                id_lookup = self.ins2thing_ids
            else:
                id_lookup = self.seg2stuff_ids
            segment['category_id'] = id_lookup[segment['category_id']]

        source_name = self.data_infos[sample_idx]['file_name']
        png_name = source_name.replace('.jpg', '.png')
        with open(osp.join(outfile_prefix, png_name), 'wb') as png_file:
            png_file.write(png_bytes)

        records.append({
            'image_id': image_id,
            'file_name': png_name,
            'segments_info': segments,
        })
    return records
def format_results(self, results, jsonfile_prefix=None, **kwargs):
    """Dump results as json files in the standard COCO evaluation format.

    Args:
        results (list[tuple | numpy.ndarray]): Testing results of the
            dataset; must contain one entry per dataset sample.
        jsonfile_prefix (str | None): Path plus filename prefix of the
            json files, e.g. "a/b/prefix". When omitted, the files go into
            a freshly created temporary directory. Default: None.

    Returns:
        tuple: ``(result_files, tmp_dir)`` where ``result_files`` is the
        dict returned by ``results2json`` and ``tmp_dir`` is the
        ``tempfile.TemporaryDirectory`` that was created, or ``None`` when
        ``jsonfile_prefix`` was given.
    """
    assert isinstance(results, list), 'results must be a list'
    num_results, num_samples = len(results), len(self)
    assert num_results == num_samples, (
        'The length of results is not equal to the dataset len: {} != {}'.
        format(num_results, num_samples))

    tmp_dir = None
    if jsonfile_prefix is None:
        # The caller owns the directory and is expected to clean it up.
        tmp_dir = tempfile.TemporaryDirectory()
        jsonfile_prefix = osp.join(tmp_dir.name, 'results')

    return self.results2json(results, jsonfile_prefix), tmp_dir
classwise (bool): Whether to evaluating the AP for each class. proposal_nums (Sequence[int]): Proposal number used for evaluating recalls, such as recall@100, recall@1000. Default: (100, 300, 1000). iou_thrs (Sequence[float], optional): IoU threshold used for evaluating recalls/mAPs. If set to a list, the average of all IoUs will also be computed. If not specified, [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] will be used. Default: None. metric_items (list[str] | str, optional): Metric items that will be returned. If not specified, ``['AR@100', 'AR@300', 'AR@1000', 'AR_s@1000', 'AR_m@1000', 'AR_l@1000' ]`` will be used when ``metric=='proposal'``, ``['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l']`` will be used when ``metric=='bbox' or metric=='segm'``. Returns: dict[str, float]: COCO style evaluation metric. """ metrics = metric if isinstance(metric, list) else [metric] allowed_metrics = [ 'bbox', 'segm', 'proposal', 'proposal_fast', 'panoptic' ] for metric in metrics: if metric not in allowed_metrics: raise KeyError(f'metric {metric} is not supported') if iou_thrs is None: iou_thrs = np.linspace( .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) if metric_items is not None: if not isinstance(metric_items, list): metric_items = [metric_items] result_files, tmp_dir = self.format_results(results, jsonfile_prefix) eval_results = OrderedDict() cocoGt = self.coco for metric in metrics: msg = f'Evaluating {metric}...' 
if logger is None: msg = '\n' + msg print_log(msg, logger=logger) if metric == 'proposal_fast': ar = self.fast_eval_recall( results, proposal_nums, iou_thrs, logger='silent') log_msg = [] for i, num in enumerate(proposal_nums): eval_results[f'AR@{num}'] = ar[i] log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}') log_msg = ''.join(log_msg) print_log(log_msg, logger=logger) continue if metric == 'panoptic': from panopticapi.evaluation import pq_compute with contextlib.redirect_stdout(io.StringIO()): pq_res = pq_compute( self.ann_file['panoptic_ann'], result_files['panoptic'], gt_folder=self.seg_prefix, pred_folder=result_files['panoptic'].split('.')[0]) results = parse_pq_results(pq_res) for k, v in results.items(): eval_results[f'{metric}_{k}'] = f'{float(v):0.3f}' print_log( 'Panoptic Evaluation Results:\n' + _print_panoptic_results(pq_res), logger=logger) continue iou_type = 'bbox' if metric == 'proposal' else metric if metric not in result_files: raise KeyError(f'{metric} is not in results') try: predictions = mmcv.load(result_files[metric]) if iou_type == 'segm': # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331 # noqa # When evaluating mask AP, if the results contain bbox, # cocoapi will use the box area instead of the mask area # for calculating the instance area. Though the overall AP # is not affected, this leads to different small, medium, # and large mask AP results. 
for x in predictions: x.pop('bbox') cocoDt = cocoGt.loadRes(predictions) except IndexError: print_log( 'The testing results of the whole dataset is empty.', logger=logger, level=logging.ERROR) break cocoEval = COCOeval(cocoGt, cocoDt, iou_type) cocoEval.params.catIds = self.cat_ids cocoEval.params.imgIds = self.img_ids cocoEval.params.maxDets = list(proposal_nums) cocoEval.params.iouThrs = iou_thrs # mapping of cocoEval.stats coco_metric_names = { 'mAP': 0, 'mAP_50': 1, 'mAP_75': 2, 'mAP_s': 3, 'mAP_m': 4, 'mAP_l': 5, 'AR@100': 6, 'AR@300': 7, 'AR@1000': 8, 'AR_s@1000': 9, 'AR_m@1000': 10, 'AR_l@1000': 11 } if metric_items is not None: for metric_item in metric_items: if metric_item not in coco_metric_names: raise KeyError( f'metric item {metric_item} is not supported') if metric == 'proposal': cocoEval.params.useCats = 0 cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() if metric_items is None: metric_items = [ 'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000', 'AR_m@1000', 'AR_l@1000' ] for item in metric_items: val = float( f'{cocoEval.stats[coco_metric_names[item]]:.3f}') eval_results[item] = val else: cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() if classwise: # Compute per-category AP # Compute per-category AP # from https://github.com/facebookresearch/detectron2/ precisions = cocoEval.eval['precision'] # precision: (iou, recall, cls, area range, max dets) assert len(self.cat_ids) == precisions.shape[2] results_per_category = [] for idx, catId in enumerate(self.cat_ids): # area range index 0: all area ranges # max dets index -1: typically 100 per image nm = self.coco.loadCats(catId)[0] precision = precisions[:, :, idx, 0, -1] precision = precision[precision > -1] if precision.size: ap = np.mean(precision) else: ap = float('nan') results_per_category.append( (f'{nm["name"]}', f'{float(ap):0.3f}')) num_columns = min(6, len(results_per_category) * 2) results_flatten = list( itertools.chain(*results_per_category)) headers = ['category', 
def parse_pq_results(pq_res):
    """Flatten a panoptic-quality result into percentage scores.

    Args:
        pq_res (dict): Result with the groups 'All', 'Things' and 'Stuff',
            each holding the entries 'pq', 'sq' and 'rq'.

    Returns:
        dict: The keys 'PQ', 'SQ', 'RQ' (all classes), 'PQ_th', 'SQ_th',
        'RQ_th' (things) and 'PQ_st', 'SQ_st', 'RQ_st' (stuff), each
        holding the corresponding entry multiplied by 100.
    """
    group_suffixes = (('All', ''), ('Things', '_th'), ('Stuff', '_st'))
    parsed = dict()
    for group, suffix in group_suffixes:
        scores = pq_res[group]
        for score_name in ('pq', 'sq', 'rq'):
            parsed[score_name.upper() + suffix] = 100 * scores[score_name]
    return parsed
@PIPELINES.register_module()
class LoadImgDirect:
    """Load the image referenced by ``results['img']`` in place.

    Args:
        to_float32 (bool): Whether to convert the loaded image to
            ``np.float32``. Default: False.
        color_type (str): Passed as ``flag`` to ``mmcv.imread``.
            Default: 'color'.
    """

    def __init__(self, to_float32=False, color_type='color'):
        self.to_float32 = to_float32
        self.color_type = color_type

    def __call__(self, results):
        """Load the image and record its meta information.

        Args:
            results (dict): Result dict whose "img" entry is handed to
                ``mmcv.imread`` (read with RGB channel order).

        Returns:
            dict: The same dict, updated with
                'img' : the loaded image
                'img_shape' : shape of the loaded image
                'ori_shape' : identical to 'img_shape'
                'img_fields' : ``['img']``
        """
        img = mmcv.imread(
            results['img'], channel_order='rgb', flag=self.color_type)
        if self.to_float32:
            img = img.astype(np.float32)

        results['img'] = img
        results['img_shape'] = img.shape
        results['ori_shape'] = img.shape
        results['img_fields'] = ['img']
        return results

    def __repr__(self):
        # The closing parenthesis was missing and the string ended with a
        # dangling ", ".
        return (f'{self.__class__.__name__}('
                f'to_float32={self.to_float32}, '
                f"color_type='{self.color_type}')")
""" outs = [] for _results in results: _results = super().__call__(_results) outs.append(_results) return outs @PIPELINES.register_module() class LoadAnnotationsDirect: """Go ahead and just load image """ def __init__(self, with_depth=True, divisor: int = 1000, cherry_pick=False, cherry=None, viper=False, vipseg=False ): self.with_depth = with_depth self.panseg_divisor = divisor self.cherry_pick = cherry_pick self.cherry = cherry self.viper = viper self.vipseg=vipseg if self.vipseg: self.panseg_divisor = 1000 def __call__(self, results): """Call functions to load image and get image meta information. Args: results (dict): Result dict requires "img" which is the img path. Returns: dict: The dict contains loaded image and meta information. 'depth_fields' : the depth fields for supporting depth aug """ if self.with_depth: depth = mmcv.imread(results['depth'], flag='unchanged').astype(np.float32) / 256. del results['depth'] depth[depth >= 80.] = 80. results['gt_depth'] = depth results['depth_fields'] = ['gt_depth'] local_divisor = 10000 if self.panseg_divisor == 0: # The seperate file to store class id and inst id gt_semantic_seg = mmcv.imread(results['ann_class'], flag='unchanged').astype(np.float32) inst_map = mmcv.imread(results['ann_inst'], flag='unchanged').astype(np.float32) ps_id = gt_semantic_seg * local_divisor + inst_map del results['ann_class'] del results['ann_inst'] elif self.panseg_divisor == -1: # KITTI step mode which means the panseg is stored with RGB id_map = mmcv.imread(results['ann'], flag='color', channel_order='rgb') gt_semantic_seg = id_map[..., 0].astype(np.float32) inst_map = id_map[..., 1].astype(np.float32) * 256 + id_map[..., 2].astype(np.float32) ps_id = gt_semantic_seg * local_divisor + inst_map del results['ann'] else: ps_id = mmcv.imread(results['ann'], flag='unchanged').astype(np.float32) if self.vipseg: ps_id = results['pre_hook'](ps_id) del results['pre_hook'] # This is for viper if self.viper or self.vipseg: ps_id[ps_id < 1000] *= 
1000 del results['ann'] gt_semantic_seg = ps_id // self.panseg_divisor if self.viper: gt_semantic_seg[gt_semantic_seg >= results['thing_upper']] = results['no_obj_class'] results['gt_semantic_seg'] = gt_semantic_seg.astype(np.int) results['seg_fields'] = ['gt_semantic_seg'] classes = [] masks = [] instance_ids = [] no_obj_class = results['no_obj_class'] for pan_seg_id in np.unique(ps_id): classes.append(pan_seg_id // self.panseg_divisor if self.panseg_divisor > 0 else pan_seg_id // local_divisor) masks.append((ps_id == pan_seg_id).astype(np.int)) instance_ids.append(pan_seg_id) gt_labels = np.stack(classes).astype(np.int) gt_instance_ids = np.stack(instance_ids).astype(np.int) gt_masks = BitmapMasks(masks, height=results['img_shape'][0], width=results['img_shape'][1]) # check the sanity of gt_masks verify = np.sum(gt_masks.masks.astype(np.int), axis=0) assert (verify == np.ones(gt_masks.masks.shape[-2:], dtype=verify.dtype)).all() # now delete the no_obj_class gt_masks.masks = np.delete(gt_masks.masks, gt_labels == no_obj_class, axis=0) gt_instance_ids = np.delete(gt_instance_ids, gt_labels == no_obj_class) gt_labels = np.delete(gt_labels, gt_labels == no_obj_class) if results['is_instance_only'] and not self.cherry_pick: gt_masks.masks = np.delete( gt_masks.masks, (gt_labels >= results['thing_upper']) | (gt_labels < results['thing_lower']), axis=0 ) gt_instance_ids = np.delete( gt_instance_ids, (gt_labels >= results['thing_upper']) | (gt_labels < results['thing_lower']) ) gt_labels = np.delete( gt_labels, (gt_labels >= results['thing_upper']) | (gt_labels < results['thing_lower']) ) gt_labels -= results['thing_lower'] elif results['is_instance_only'] and self.cherry_pick: gt_masks.masks = np.delete( gt_masks.masks, list(map(lambda x: x not in self.cherry, gt_labels)), axis=0 ) gt_instance_ids = np.delete( gt_instance_ids, list(map(lambda x: x not in self.cherry, gt_labels)), ) gt_labels = np.delete( gt_labels, list(map(lambda x: x not in self.cherry, gt_labels)), 
) gt_labels = np.array(list(map(lambda x: self.cherry.index(x), gt_labels))) if len(gt_labels) > 0 else [] if len(gt_labels) == 0: return None results['gt_labels'] = gt_labels results['gt_masks'] = gt_masks results['gt_instance_ids'] = gt_instance_ids results['mask_fields'] = ['gt_masks'] # generate boxes boxes = bitmasks2bboxes(gt_masks) results['gt_bboxes'] = boxes results['bbox_fields'] = ['gt_bboxes'] return results @PIPELINES.register_module() class LoadMultiAnnotationsDirect(LoadAnnotationsDirect): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def __call__(self, results): outs = [] for _results in results: _results = super().__call__(_results) if _results is None: return None outs.append(_results) return outs ================================================ FILE: external/dataset/dvps_pipelines/transforms.py ================================================ import mmcv import numpy as np from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import Resize, RandomFlip, Pad, Normalize @PIPELINES.register_module() class ResizeWithDepth(Resize): """This subclass of Resize is to support depth resize """ def __init__(self, *args, **kwargs): assert kwargs['keep_ratio'] super().__init__(*args, **kwargs) def _resize_depth(self, results): """Resize depth with ``results['scale']``""" # Although depth is not discrete, we use nearest to match the segmentation for key in results.get('depth_fields', []): if self.keep_ratio: results[key] = mmcv.imrescale( results[key], results['scale'], interpolation='nearest', backend=self.backend) else: results[key] = mmcv.imresize( results[key], results['scale'], interpolation='nearest', backend=self.backend) results[key] /= results['scale_factor'].mean() def __call__(self, results): super().__call__(results) self._resize_depth(results) return results @PIPELINES.register_module() class SeqResizeWithDepth(ResizeWithDepth): """Resize images. 
@PIPELINES.register_module()
class RandomFlipWithDepth(RandomFlip):
    """``RandomFlip`` that also flips every map listed in ``depth_fields``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Flip via the parent transform, then mirror the depth maps.

        Args:
            results (dict): Result dict from the loading pipeline.

        Returns:
            dict: The same dict with the entries named in
            ``results['depth_fields']`` flipped when ``results['flip']`` is
            set.
        """
        super().__call__(results)
        if not results['flip']:
            return results
        depth_keys = results.get('depth_fields', [])
        for depth_key in depth_keys:
            flipped = mmcv.imflip(
                results[depth_key], direction=results['flip_direction'])
            results[depth_key] = flipped
        return results
@PIPELINES.register_module()
class SeqRandomCropWithDepth(object):
    """Randomly crop a sequence of frames together with their annotations.

    Images, bboxes, masks, semantic segmentation maps and depth maps are
    cropped with the same window per frame; with ``share_params`` the window
    is also shared by all frames of the sequence.

    Args:
        crop_size (tuple | None): Height and width of the crop in pixels.
            None disables cropping.
        allow_negative_crop (bool, optional): Whether to allow a crop that
            does not contain any bbox area. Default False.
        share_params (bool, optional): Whether to share the cropping offsets
            between the frames.
        bbox_clip_border (bool, optional): Whether to clip boxes to the
            cropped image. Defaults to True.
        check_id_match (bool, optional): For two-frame sequences, reject the
            sample when the frames share no instance id. Defaults to True.

    Note:
        - If the image is smaller than the crop size, the offsets are 0 and
          the slice simply returns the available region.
        - The keys for bboxes, labels and masks must be aligned. That is,
          `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and
          `gt_bboxes_ignore` corresponds to `gt_labels_ignore` and
          `gt_masks_ignore`.
        - If the crop does not contain any gt-bbox region and
          `allow_negative_crop` is set to False, the sample is skipped.
    """

    def __init__(self,
                 crop_size,
                 allow_negative_crop=False,
                 share_params=False,
                 bbox_clip_border=True,
                 check_id_match=True,
                 ):
        assert crop_size is None or (crop_size[0] > 0 and crop_size[1] > 0)
        self.crop_size = crop_size
        self.allow_negative_crop = allow_negative_crop
        self.share_params = share_params
        self.bbox_clip_border = bbox_clip_border
        self.check_id_match = check_id_match
        # Which label / mask entries have to be filtered with each bbox entry.
        self.bbox2label = {
            'gt_bboxes': ['gt_labels', 'gt_instance_ids'],
            'gt_bboxes_ignore': ['gt_labels_ignore', 'gt_instance_ids_ignore']
        }
        self.bbox2mask = {
            'gt_bboxes': 'gt_masks',
            'gt_bboxes_ignore': 'gt_masks_ignore'
        }

    def get_offsets(self, img):
        """Draw random (offset_h, offset_w) so the crop fits inside ``img``."""
        img_h, img_w = img.shape[0], img.shape[1]
        margin_h = max(img_h - self.crop_size[0], 0)
        margin_w = max(img_w - self.crop_size[1], 0)
        # Height is sampled before width.
        offset_h = np.random.randint(0, margin_h + 1)
        offset_w = np.random.randint(0, margin_w + 1)
        return offset_h, offset_w

    def random_crop(self, results, offsets=None):
        """Crop one frame and all of its annotations.

        Args:
            results (dict): Result dict from loading pipeline.
            offsets (tuple, optional): Pre-defined (offset_h, offset_w).
                Sampled randomly when None.

        Returns:
            dict | None: Cropped results with 'img_shape' and 'crop_offsets'
            updated, or None when no valid ``gt_bboxes`` remain and negative
            crops are not allowed.
        """
        # Only supporting img
        assert results['img_fields'] == ['img']
        image = results['img']
        if offsets is None:
            top, left = self.get_offsets(image)
        else:
            top, left = offsets
        results['crop_offsets'] = (top, left)
        bottom = top + self.crop_size[0]
        right = left + self.crop_size[1]

        # crop the image
        image = image[top:bottom, left:right, ...]
        cropped_shape = image.shape
        results['img'] = image
        results['img_shape'] = cropped_shape

        # shift the boxes into the crop frame and filter the dependent fields
        for bbox_key in results.get('bbox_fields', []):
            shift = np.array([left, top, left, top], dtype=np.float32)
            shifted = results[bbox_key] - shift
            if self.bbox_clip_border:
                shifted[:, 0::2] = np.clip(shifted[:, 0::2], 0, cropped_shape[1])
                shifted[:, 1::2] = np.clip(shifted[:, 1::2], 0, cropped_shape[0])
            keep = (shifted[:, 2] > shifted[:, 0]) & (shifted[:, 3] > shifted[:, 1])

            # No gt box survives and negative crops are disallowed: skip.
            no_box_left = not keep.any()
            if bbox_key == 'gt_bboxes' and no_box_left and not self.allow_negative_crop:
                return None
            results[bbox_key] = shifted[keep, :]

            # label fields, e.g. gt_labels and gt_labels_ignore
            for label_key in self.bbox2label.get(bbox_key):
                if label_key in results:
                    results[label_key] = results[label_key][keep]

            # mask fields, e.g. gt_masks and gt_masks_ignore
            mask_key = self.bbox2mask.get(bbox_key)
            if mask_key in results:
                kept_masks = results[mask_key][keep.nonzero()[0]]
                results[mask_key] = kept_masks.crop(
                    np.asarray([left, top, right, bottom]))

        # crop semantic seg
        for seg_key in results.get('seg_fields', []):
            results[seg_key] = results[seg_key][top:bottom, left:right]

        # crop depth
        for depth_key in results.get('depth_fields', []):
            results[depth_key] = results[depth_key][top:bottom, left:right]
        return results

    def __call__(self, results):
        """Crop every frame of the sequence.

        Args:
            results (list[dict]): Result dicts from the loading pipeline.

        Returns:
            list[dict] | None: The cropped frames, or None when a frame is
            rejected or the two frames share no instance id.
        """
        do_crop = self.crop_size is not None
        if self.share_params and do_crop:
            # One window, sampled from the first frame, for all frames.
            offsets = self.get_offsets(results[0]['img'])
        else:
            offsets = None

        outs = []
        for frame in results:
            if do_crop:
                frame = self.random_crop(frame, offsets)
                if frame is None:
                    return None
            outs.append(frame)

        if len(outs) == 2 and self.check_id_match:
            key_frame, ref_frame = outs[0], outs[1]
            if self.check_match(ref_frame, key_frame):
                return None
        return outs

    def check_match(self, ref_results, results):
        """Return True when no instance id of ``results`` is in ``ref_results``."""
        ref_ids = ref_results['gt_instance_ids'].tolist()
        cur_ids = results['gt_instance_ids'].tolist()
        positions = []
        for inst_id in cur_ids:
            positions.append(ref_ids.index(inst_id) if inst_id in ref_ids else -1)
        return (np.array(positions) == -1).all()
# There is nothing new from SeqNormalize.
@PIPELINES.register_module()
class SeqNormalizeWithDepth(Normalize):
    """Normalize every image of a sequence.

    Please refer to `mmdet.datasets.pipelines.transfroms.py:Normalize` for
    detailed docstring.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Apply the parent ``Normalize`` to each dict of the sequence.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: The per-frame outputs of ``Normalize``, in input
            order.
        """
        # Bind the parent call once; zero-argument super() is not usable
        # inside a comprehension scope on older interpreters.
        normalize_one = super().__call__
        return [normalize_one(frame) for frame in results]
""" def __init__(self, policies): super().__init__(policies=policies) def __call__(self, results): transform = np.random.choice(self.transforms) outs = [] for _results in results: out = transform(_results) outs.append(out) return outs ================================================ FILE: external/dataset/forecasting_pipelines/__init__.py ================================================ ================================================ FILE: external/dataset/forecasting_pipelines/loading.py ================================================ import mmcv import numpy as np from mmdet.core import BitmapMasks from mmdet.datasets.builder import PIPELINES def bitmasks2bboxes(bitmasks): bitmasks_array = bitmasks.masks boxes = np.zeros((bitmasks_array.shape[0], 4), dtype=np.float32) x_any = np.any(bitmasks_array, axis=1) y_any = np.any(bitmasks_array, axis=2) for idx in range(bitmasks_array.shape[0]): x = np.where(x_any[idx, :])[0] y = np.where(y_any[idx, :])[0] if len(x) > 0 and len(y) > 0: boxes[idx, :] = np.array((x[0], y[0], x[-1], y[-1]), dtype=np.float32) return boxes @PIPELINES.register_module() class LoadMultiImagesFromFile: """Load an image from file. Required keys are "img_prefix" and "img_info" (a dict that must contain the key "filename"). Added or updated keys are "filename", "img", "img_shape", "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`), "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). Args: to_float32 (bool): Whether to convert the loaded image to a float32 numpy array. If set to False, the loaded image is an uint8 array. Defaults to False. color_type (str): The flag argument for :func:`mmcv.imfrombytes`. Defaults to 'color'. file_client_args (dict): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Defaults to ``dict(backend='disk')``. 
""" def __init__(self, to_float32=False, color_type='color', file_client_args=dict(backend='disk')): self.to_float32 = to_float32 self.color_type = color_type self.file_client_args = file_client_args.copy() self.file_client = None def __call__(self, results): """Call functions to load image and get image meta information. Args: results (dict): Result dict from :obj:`mmdet.CustomDataset`. Returns: dict: The dict contains loaded image and meta information. """ if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) filenames = results['img_info']['filename'] imgs = [] for filename in filenames: img_bytes = self.file_client.get(filename) img = mmcv.imfrombytes(img_bytes, flag=self.color_type) if self.to_float32: img = img.astype(np.float32) imgs.append(img) img = np.concatenate(imgs, axis=-1) results['img'] = img results['img_shape'] = img.shape results['ori_shape'] = img.shape results['img_fields'].append('img') return results def __repr__(self): repr_str = (f'{self.__class__.__name__}(' f'to_float32={self.to_float32}, ' f"color_type='{self.color_type}', " f'file_client_args={self.file_client_args})') return repr_str @PIPELINES.register_module() class LoadAnnotationsInstanceMasks: def __init__(self, with_mask=True, with_seg=True, with_inst=False, file_client_args=dict(backend='disk')): self.with_mask = with_mask self.with_seg = with_seg self.with_inst = with_inst self.file_client_args = file_client_args.copy() self.file_client = None def _load_masks(self, results): """Private function to load mask annotations. Args: results (dict): Result dict from :obj:`mmdet.CustomDataset`. Returns: dict: The dict contains loaded mask annotations. If ``self.poly2mask`` is set ``True``, `gt_mask` will contain :obj:`PolygonMasks`. Otherwise, :obj:`BitmapMasks` is used. 
""" img_bytes = self.file_client.get(results['ann_info']['inst_map']) inst_mask = mmcv.imfrombytes(img_bytes, flag='unchanged').squeeze() if self.with_inst: results['gt_instance_map'] = inst_mask.copy().astype(int) results['gt_instance_map'][inst_mask < 10000] *= 1000 if not self.with_mask: return results masks = [] labels = [] for inst_id in np.unique(inst_mask): if inst_id >= 10000: masks.append((inst_mask == inst_id).astype(int)) labels.append(inst_id // 1000) if len(masks) == 0: return None gt_masks = BitmapMasks(masks, height=inst_mask.shape[0], width=inst_mask.shape[1]) results['gt_masks'] = gt_masks results['mask_fields'].append('gt_masks') results['gt_labels'] = np.array(labels) boxes = bitmasks2bboxes(gt_masks) results['gt_bboxes'] = boxes results['bbox_fields'].append('gt_bboxes') return results def _load_semantic_seg(self, results): """Private function to load semantic segmentation annotations. Args: results (dict): Result dict from :obj:`dataset`. Returns: dict: The dict contains loaded semantic segmentation annotations. """ img_bytes = self.file_client.get(results['ann_info']['seg_map']) results['gt_semantic_seg'] = mmcv.imfrombytes( img_bytes, flag='unchanged').squeeze() results['seg_fields'].append('gt_semantic_seg') return results def __call__(self, results): """Call function to load multiple types annotations. Args: results (dict): Result dict from :obj:`mmdet.CustomDataset`. Returns: dict: The dict contains loaded bounding box, label, mask and semantic segmentation annotations. 
""" if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) if self.with_mask or self.with_inst: results = self._load_masks(results) if results is None: return None if self.with_seg: results = self._load_semantic_seg(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'with_mask={self.with_mask}, ' repr_str += f'with_seg={self.with_seg}, ' return repr_str ================================================ FILE: external/dataset/forecasting_pipelines/transforms.py ================================================ import mmcv import numpy as np import warnings from mmdet.datasets import PIPELINES @PIPELINES.register_module() class NormalizeMultiple: """Normalize the image. Added key is "img_norm_cfg". Args: mean (sequence): Mean values of 3 channels. std (sequence): Std values of 3 channels. to_rgb (bool): Whether to convert the image from BGR to RGB, default is true. """ def __init__(self, mean, std, to_rgb=True): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) self.to_rgb = to_rgb def __call__(self, results): """Call function to normalize images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Normalized results, 'img_norm_cfg' key is added into result dict. 
""" for key in results.get('img_fields', ['img']): if results[key].shape[-1] > 3: num_3 = results[key].shape[-1] assert num_3 % 3 == 0 num_img = num_3 // 3 img = np.ones_like(results[key]).astype(np.float32) for i in range(num_img): img[..., 3 * i:3 * i + 3] = mmcv.imnormalize( results[key][..., 3 * i:3 * i + 3], self.mean, self.std, self.to_rgb) results[key] = img else: results[key] = mmcv.imnormalize(results[key], self.mean, self.std, self.to_rgb) results['img_norm_cfg'] = dict( mean=self.mean, std=self.std, to_rgb=self.to_rgb) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' return repr_str @PIPELINES.register_module() class PadFutureMMDet: """Pad the image & masks & segmentation map. There are two padding modes: (1) pad to a fixed size and (2) pad to the minimum size that is divisible by some number. Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", Args: size (tuple, optional): Fixed padding size. size_divisor (int, optional): The divisor of padded size. pad_to_square (bool): Whether to pad the image into a square. Currently only used for YOLOX. Default: False. pad_val (dict, optional): A dict for padding value, the default value is `dict(img=0, masks=0, seg=255)`. 
""" def __init__(self, size=None, size_divisor=None, pad_to_square=False, pad_val=dict(img=0, masks=0, seg=255)): self.size = size self.size_divisor = size_divisor if isinstance(pad_val, float) or isinstance(pad_val, int): warnings.warn( 'pad_val of float type is deprecated now, ' f'please use pad_val=dict(img={pad_val}, ' f'masks={pad_val}, seg=255) instead.', DeprecationWarning) pad_val = dict(img=pad_val, masks=pad_val, seg=255) assert isinstance(pad_val, dict) self.pad_val = pad_val self.pad_to_square = pad_to_square if pad_to_square: assert size is None and size_divisor is None, \ 'The size and size_divisor must be None ' \ 'when pad2square is True' else: assert size is not None or size_divisor is not None, \ 'only one of size and size_divisor should be valid' assert size is None or size_divisor is None def _pad_img(self, results): """Pad images according to ``self.size``.""" pad_val = self.pad_val.get('img', 0) for key in results.get('img_fields', ['img']): if self.pad_to_square: max_size = max(results[key].shape[:2]) self.size = (max_size, max_size) if self.size is not None: padded_img = mmcv.impad( results[key], shape=self.size, pad_val=pad_val) elif self.size_divisor is not None: padded_img = mmcv.impad_to_multiple( results[key], self.size_divisor, pad_val=pad_val) results[key] = padded_img results['pad_shape'] = padded_img.shape results['pad_fixed_size'] = self.size results['pad_size_divisor'] = self.size_divisor def _pad_masks(self, results): """Pad masks according to ``results['pad_shape']``.""" pad_shape = results['pad_shape'][:2] pad_val = self.pad_val.get('masks', 0) for key in results.get('mask_fields', []): results[key] = results[key].pad(pad_shape, pad_val=pad_val) def _pad_seg(self, results): """Pad semantic segmentation map according to ``results['pad_shape']``.""" pad_val = self.pad_val.get('seg', 255) for key in results.get('seg_fields', []): results[key] = mmcv.impad( results[key], shape=results['pad_shape'][:2], pad_val=pad_val) def 
def eval_miou(results, targets, num_classes, ignore_index=255):
    """Compute the per-class IoU accumulated over a set of predictions.

    Args:
        results (Iterable[np.ndarray]): Predicted class-id maps.
        targets (Iterable[np.ndarray]): Ground-truth class-id maps, paired
            with ``results`` by position and indexable with the same mask.
        num_classes (int): Number of classes; ids are binned into
            ``num_classes`` histogram bins over ``[0, num_classes - 1]``.
        ignore_index (int): Target value excluded from the evaluation.
            Defaults to 255.

    Returns:
        np.ndarray: Float64 array of length ``num_classes`` holding
        intersection / union per class. Classes that never occur in either
        predictions or targets have an empty union and yield NaN.
    """
    total_area_intersect = np.zeros((num_classes,), dtype=np.float64)
    total_area_union = np.zeros((num_classes,), dtype=np.float64)
    # The former ``total_area_pred`` / ``total_area_label`` accumulators were
    # never read (and the first one was fed the intersection by mistake), so
    # they are dropped instead of being kept as misleading dead state.
    hist_kwargs = dict(bins=num_classes, range=(0, num_classes - 1))

    for result, target in zip(results, targets):
        valid = (target != ignore_index)
        pred = result[valid]
        label = target[valid]
        intersect = pred[pred == label]

        area_intersect, _ = np.histogram(intersect.astype(float), **hist_kwargs)
        area_pred, _ = np.histogram(pred.astype(float), **hist_kwargs)
        area_label, _ = np.histogram(label.astype(float), **hist_kwargs)

        total_area_intersect += area_intersect
        total_area_union += area_pred + area_label - area_intersect

    # 0 / 0 for absent classes is expected; keep the NaN but do not warn.
    with np.errstate(divide='ignore', invalid='ignore'):
        iou_per_class = total_area_intersect / total_area_union
    return iou_per_class
""" def __call__(self, results): assert (isinstance(results, list)), 'results must be list' outs = results[:1] for i, result in enumerate(results[1:], 1): if 'img' in result: img = result['img'] if len(img.shape) < 3: img = np.expand_dims(img, -1) if i == 1: result['img'] = np.expand_dims(img, -1) else: outs[1]['img'] = np.concatenate( (outs[1]['img'], np.expand_dims(img, -1)), axis=-1) for key in ['img_metas', 'gt_masks']: if key in result: if i == 1: result[key] = [result[key]] else: outs[1][key].append(result[key]) for key in [ 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', 'gt_instance_ids', ]: if key not in result: continue value = result[key] if value.ndim == 1: value = value[:, None] N = value.shape[0] value = np.concatenate((np.full( (N, 1), i - 1, dtype=np.float32), value), axis=1) if i == 1: result[key] = value else: outs[1][key] = np.concatenate((outs[1][key], value), axis=0) if 'gt_semantic_seg' in result: if i == 1: result['gt_semantic_seg'] = result['gt_semantic_seg'][..., None, None] else: outs[1]['gt_semantic_seg'] = np.concatenate( (outs[1]['gt_semantic_seg'], result['gt_semantic_seg'][..., None, None]), axis=-1) if 'gt_depth' in result: if i == 1: result['gt_depth'] = result['gt_depth'][..., None, None] else: outs[1]['gt_depth'] = np.concatenate( (outs[1]['gt_depth'], result['gt_depth'][..., None, None]), axis=-1) if i == 1: outs.append(result) return outs @PIPELINES.register_module() class ConcatVideos(object): """Concat video references. If the input list contains at least two dicts, concat the input list of dict to one dict from 2-nd dict of the input list. Args: results (list[dict]): List of dict that contain keys such as 'img', 'img_metas', 'gt_masks','proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels','gt_semantic_seg', 'gt_instance_ids'. Returns: list[dict]: The first dict of outputs is the same as the first dict of `results`. The second dict of outputs concats the dicts in `results[1:]`. 
""" def __call__(self, results): assert (isinstance(results, list)), 'results must be list' outs = results[:1] # outs = [] for i, result in enumerate(results[0:], 1): if 'img' in result: img = result['img'] if len(img.shape) < 3: img = np.expand_dims(img, -1) if i == 1: result['img'] = np.expand_dims(img, -1) else: outs[1]['img'] = np.concatenate( (outs[1]['img'], np.expand_dims(img, -1)), axis=-1) for key in ['img_metas', 'gt_masks']: if key in result: if i == 1: result[key] = [result[key]] else: outs[1][key].append(result[key]) for key in [ 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', 'gt_instance_ids' ]: if key not in result: continue value = result[key] if value.ndim == 1: value = value[:, None] N = value.shape[0] value = np.concatenate((np.full( (N, 1), i - 1, dtype=np.float32), value), axis=1) if i == 1: result[key] = value else: outs[1][key] = np.concatenate((outs[1][key], value), axis=0) if 'gt_semantic_seg' in result: if i == 1: result['gt_semantic_seg'] = result['gt_semantic_seg'][..., None, None] else: outs[1]['gt_semantic_seg'] = np.concatenate( (outs[1]['gt_semantic_seg'], result['gt_semantic_seg'][..., None, None]), axis=-1) if i == 1: outs.append(result) res = [] res.append(outs[1]) return res @PIPELINES.register_module() class MultiImagesToTensor(object): """Multi images to tensor. 1. Transpose and convert image/multi-images to Tensor. 2. Add prefix to every key in the second dict of the inputs. Then, add these keys and corresponding values into the outputs. Args: ref_prefix (str): The prefix of key added to the second dict of inputs. Defaults to 'ref'. """ def __init__(self, ref_prefix='ref'): self.ref_prefix = ref_prefix def __call__(self, results): """Multi images to tensor. 1. Transpose and convert image/multi-images to Tensor. 2. Add prefix to every key in the second dict of the inputs. Then, add these keys and corresponding values into the output dict. Args: results (list[dict]): List of two dicts. 
Returns: dict: Each key in the first dict of `results` remains unchanged. Each key in the second dict of `results` adds `self.ref_prefix` as prefix. """ outs = [] for _results in results: _results = self.images_to_tensor(_results) outs.append(_results) data = {} data.update(outs[0]) if len(outs) == 2: for k, v in outs[1].items(): data[f'{self.ref_prefix}_{k}'] = v return data def images_to_tensor(self, results): """Transpose and convert images/multi-images to Tensor.""" if 'img' in results: img = results['img'] if len(img.shape) == 3: # (H, W, 3) to (3, H, W) img = np.ascontiguousarray(img.transpose(2, 0, 1)) else: # (H, W, 3, N) to (N, 3, H, W) img = np.ascontiguousarray(img.transpose(3, 2, 0, 1)) results['img'] = to_tensor(img) if 'proposals' in results: results['proposals'] = to_tensor(results['proposals']) if 'img_metas' in results: results['img_metas'] = DC(results['img_metas'], cpu_only=True) return results @PIPELINES.register_module() class SeqDefaultFormatBundle(object): """Sequence Default formatting bundle. It simplifies the pipeline of formatting common fields, including "img", "img_metas", "proposals", "gt_bboxes", "gt_instance_ids", "gt_match_indices", "gt_bboxes_ignore", "gt_labels", "gt_masks" and "gt_semantic_seg". These fields are formatted as follows. - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True) - img_metas: (1) to DataContainer (cpu_only=True) - proposals: (1) to tensor, (2) to DataContainer - gt_bboxes: (1) to tensor, (2) to DataContainer - gt_instance_ids: (1) to tensor, (2) to DataContainer - gt_match_indices: (1) to tensor, (2) to DataContainer - gt_bboxes_ignore: (1) to tensor, (2) to DataContainer - gt_labels: (1) to tensor, (2) to DataContainer - gt_masks: (1) to DataContainer (cpu_only=True) - gt_semantic_seg: (1) unsqueeze dim-0 (2) to tensor, \ (3) to DataContainer (stack=True) Args: ref_prefix (str): The prefix of key added to the second dict of input list. Defaults to 'ref'. 
""" def __init__(self, ref_prefix='ref'): self.ref_prefix = ref_prefix def __call__(self, results): """Sequence Default formatting bundle call function. Args: results (list[dict]): List of two dicts. Returns: dict: The result dict contains the data that is formatted with default bundle. Each key in the second dict of the input list adds `self.ref_prefix` as prefix. """ outs = [] for _results in results: _results = self.default_format_bundle(_results) outs.append(_results) data = {} if self.ref_prefix == 'ref': # origin frames data.update(outs[0]) # reference frames if len(outs) == 1: # for k in outs[0]: # data[f'{self.ref_prefix}_{k}'] = None pass else: for k, v in outs[1].items(): data[f'{self.ref_prefix}_{k}'] = v elif self.ref_prefix is None: # origin frames data.update(outs[0]) return data def default_format_bundle(self, results): """Transform and format common fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with default bundle. """ if 'img' in results: img = results['img'] if len(img.shape) == 3: img = np.ascontiguousarray(img.transpose(2, 0, 1)) else: img = np.ascontiguousarray(img.transpose(3, 2, 0, 1)) results['img'] = DC(to_tensor(img), stack=True) for key in [ 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', 'gt_instance_ids', 'gt_match_indices', ]: if key not in results: continue results[key] = DC(to_tensor(results[key])) for key in ['img_metas', 'gt_masks']: if key in results: results[key] = DC(results[key], cpu_only=True) if 'gt_semantic_seg' in results: semantic_seg = results['gt_semantic_seg'] if len(semantic_seg.shape) == 2: semantic_seg = semantic_seg[None, ...] else: semantic_seg = np.ascontiguousarray( semantic_seg.transpose(3, 2, 0, 1)) results['gt_semantic_seg'] = DC( to_tensor(semantic_seg), stack=True) if 'gt_depth' in results: gt_depth = results['gt_depth'] if len(gt_depth.shape) == 2: gt_depth = gt_depth[None, ...] 
@PIPELINES.register_module()
class VideoCollect(object):
    """Collect data from the loader relevant to the specific task.

    Args:
        keys (Sequence[str]): Keys of results to be collected in ``data``.
        meta_keys (str | tuple[str], optional): Extra meta keys to be
            collected in ``data['img_metas']`` in addition to
            ``default_meta_keys``. Defaults to None.
        reject_empty (bool): If True, ``__call__`` returns None when the
            first result dict has an empty ``gt_labels``. Defaults to False.
        num_ref_imgs (int): If greater than 0, ``__call__`` returns None
            unless exactly ``num_ref_imgs + 1`` result dicts were given.
            Defaults to 0.
        default_meta_keys (Sequence[str]): Default meta keys. Defaults to
            ('filename', 'ori_filename', 'ori_shape', 'img_shape',
            'pad_shape', 'scale_factor', 'flip', 'flip_direction',
            'img_norm_cfg', 'video_id', 'frame_id', 'is_video_data',
            'no_obj_class').
    """

    def __init__(self,
                 keys,
                 meta_keys=None,
                 reject_empty=False,
                 num_ref_imgs=0,
                 # no_obj_class is added for handling non-0 no-obj class
                 default_meta_keys=('filename', 'ori_filename', 'ori_shape',
                                    'img_shape', 'pad_shape', 'scale_factor',
                                    'flip', 'flip_direction', 'img_norm_cfg',
                                    'video_id', 'frame_id', 'is_video_data',
                                    'no_obj_class')):
        self.keys = keys
        # Work on a private tuple: with a list argument (e.g. coming from a
        # config) the ``+=`` below would otherwise extend the caller's list
        # in place.
        self.meta_keys = tuple(default_meta_keys)
        if meta_keys is not None:
            if isinstance(meta_keys, str):
                meta_keys = (meta_keys,)
            else:
                assert isinstance(meta_keys, tuple), \
                    'meta_keys must be str or tuple'
            self.meta_keys += meta_keys
        self.reject_empty = reject_empty
        self.num_ref_imgs = num_ref_imgs

    def __call__(self, results):
        """Call function to collect keys in results.

        The keys in ``meta_keys`` and ``default_meta_keys`` are gathered
        into ``img_metas``. For a single dict input, ``img_metas`` is
        additionally wrapped in a CPU-only :obj:`mmcv.DataContainer`.

        Args:
            results (list[dict] | dict): List of dict or dict which contains
                the data to collect.

        Returns:
            list[dict] | dict | None: List of dict or dict that contains the
            following keys:

            - keys in ``self.keys``
            - ``img_metas``

            None is returned when the sample is rejected by ``reject_empty``
            or ``num_ref_imgs``.
        """
        results_is_dict = isinstance(results, dict)
        if results_is_dict:
            results = [results]

        outs = []
        for _results in results:
            _results = self._add_default_meta_keys(_results)
            outs.append(self._collect_meta_keys(_results))

        if results_is_dict:
            outs[0]['img_metas'] = DC(outs[0]['img_metas'], cpu_only=True)

        # Reject samples whose key frame carries no labels.
        if self.reject_empty and len(results[0]['gt_labels']) == 0:
            return None
        # Reject samples that do not have the expected number of frames.
        if self.num_ref_imgs > 0 and len(results) != self.num_ref_imgs + 1:
            return None

        return outs[0] if results_is_dict else outs

    def _collect_meta_keys(self, results):
        """Collect `self.keys` and `self.meta_keys` from `results` (dict).

        A meta key is looked up in ``results`` first and in
        ``results['img_info']`` second; meta keys found in neither place are
        skipped. A missing ``img_info`` entry is treated as empty instead of
        raising ``KeyError``. Keys in ``self.keys`` are mandatory.
        """
        img_info = results.get('img_info', {})
        img_meta = {}
        for key in self.meta_keys:
            if key in results:
                img_meta[key] = results[key]
            elif key in img_info:
                img_meta[key] = img_info[key]

        data = {'img_metas': img_meta}
        for key in self.keys:
            data[key] = results[key]
        return data

    def _add_default_meta_keys(self, results):
        """Add default meta keys.

        We set default meta keys including `pad_shape`, `scale_factor` and
        `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and
        `Pad` are implemented during the whole pipeline.

        Args:
            results (dict): Result dict contains the data to convert.

        Returns:
            results (dict): Updated result dict contains the data to convert.
        """
        img = results['img']
        results.setdefault('pad_shape', img.shape)
        results.setdefault('scale_factor', 1.0)
        # Images with fewer than three dimensions count as single-channel.
        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
        results.setdefault(
            'img_norm_cfg',
            dict(
                mean=np.zeros(num_channels, dtype=np.float32),
                std=np.ones(num_channels, dtype=np.float32),
                to_rgb=False))
        return results
@PIPELINES.register_module()
class ReIDFormatBundle(object):
    """ReID formatting bundle.

    Gathers the "img" and "gt_label" fields (stacking them first when a list
    of result dicts is given) and formats them as follows.

    - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True)
    - gt_label: (1) to tensor, (2) to DataContainer (stack=True)
    """

    def __init__(self, *args, **kwargs):
        # Any arguments are accepted and ignored.
        super().__init__()

    def __call__(self, results):
        """ReID formatting bundle call function.

        Args:
            results (list[dict] or dict): List of dicts or dict.

        Returns:
            dict: The result dict contains the data that is formatted with
            ReID bundle.
        """
        if isinstance(results, list):
            assert len(results) > 1, \
                'the \'results\' only have one item, ' \
                'please directly use normal pipeline not \'Seq\' pipeline.'
            # Images are stacked along a new trailing axis, labels along a
            # new leading axis.
            stacked_img = np.stack([item['img'] for item in results], axis=3)
            stacked_label = np.stack(
                [item['gt_label'] for item in results], axis=0)
            inputs = dict(img=stacked_img, gt_label=stacked_label)
        elif isinstance(results, dict):
            inputs = dict(img=results['img'], gt_label=results['gt_label'])
        else:
            raise TypeError('results must be a list or a dict.')

        return self.reid_format_bundle(inputs)

    def reid_format_bundle(self, results):
        """Transform and format the "img" and "gt_label" fields in results.

        Args:
            results (dict): Result dict contains the data to convert.

        Returns:
            dict: The result dict contains the data that is formatted with
            ReID bundle.

        Raises:
            KeyError: If ``results`` holds a key other than "img" or
                "gt_label".
        """
        for key in results:
            if key == 'gt_label':
                results[key] = DC(
                    to_tensor(results[key]), stack=True, pad_dims=None)
                continue
            if key != 'img':
                raise KeyError(f'key {key} is not supported')

            img = results[key]
            # 3-D images become channel-first; 4-D stacks additionally move
            # the trailing stack axis to the front.
            axes = (2, 0, 1) if img.ndim == 3 else (3, 2, 0, 1)
            img = np.ascontiguousarray(img.transpose(axes))
            results['img'] = DC(to_tensor(img), stack=True)
        return results
""" def __init__(self, num_frames=5): self.num_frames = num_frames def __call__(self, results): ref_gt_instance_ids = results['ref_gt_instance_ids'].data ins_mul_nframe = ref_gt_instance_ids.size(0) if ins_mul_nframe % self.num_frames != 0: return None num_ins = ins_mul_nframe // self.num_frames ins_id_bucket = torch.zeros((num_ins,), dtype=torch.float) for i in range(ins_mul_nframe): frame_cur = i // num_ins ins_cur = i % num_ins if ref_gt_instance_ids[i][0] != frame_cur: return None if frame_cur == 0: ins_id_bucket[ins_cur] = ref_gt_instance_ids[i][1] else: if ref_gt_instance_ids[i][1] != ins_id_bucket[ins_cur]: return None return results ================================================ FILE: external/dataset/pipelines/loading.py ================================================ import os.path as osp import numpy as np import mmcv from mmdet.core import BitmapMasks from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile @PIPELINES.register_module() class LoadMultiImagesFromFile(LoadImageFromFile): """Load multi images from file. Please refer to `mmdet.datasets.pipelines.loading.py:LoadImageFromFile` for detailed docstring. """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def __call__(self, results): """Call function. For each dict in `results`, call the call function of `LoadImageFromFile` to load image. Args: results (list[dict]): List of dict from :obj:`mmtrack.CocoVideoDataset`. Returns: list[dict]: List of dict that contains loaded image. """ outs = [] for _results in results: _results = super().__call__(_results) outs.append(_results) return outs @PIPELINES.register_module() class SeqLoadAnnotations(LoadAnnotations): """Sequence load annotations. Please refer to `mmdet.datasets.pipelines.loading.py:LoadAnnotations` for detailed docstring. Args: with_track (bool): If True, load instance ids of bboxes. 
def _load_track(self, results):
    """Private function to load instance id annotations.

    Args:
        results (dict): Result dict from :obj:`mmtrack.CocoVideoDataset`.

    Returns:
        dict: The same dict, with a copy of
        ``results['ann_info']['instance_ids']`` stored under
        ``gt_instance_ids``.
    """

    # Copy so that later pipeline steps do not modify the annotation info.
    results['gt_instance_ids'] = results['ann_info']['instance_ids'].copy()

    return results
def bitmasks2bboxes(bitmasks):
    """Compute one bounding box per instance mask.

    Args:
        bitmasks: Object whose ``masks`` attribute is an array indexed as
            (instance, row, column).

    Returns:
        np.ndarray: float32 array of shape (num_masks, 4) holding
        ``(x_min, y_min, x_max, y_max)`` as the indices of the first and
        last non-zero column and row of each mask. Masks without any
        non-zero element keep an all-zero box.
    """
    masks = bitmasks.masks
    boxes = np.zeros((masks.shape[0], 4), dtype=np.float32)

    # Collapse rows / columns: which columns and which rows contain any
    # foreground element, per instance.
    cols_hit = np.any(masks, axis=1)
    rows_hit = np.any(masks, axis=2)

    for box, cols, rows in zip(boxes, cols_hit, rows_hit):
        xs = np.flatnonzero(cols)
        ys = np.flatnonzero(rows)
        if xs.size == 0 or ys.size == 0:
            continue
        # ``box`` is a view into ``boxes``, so this fills the output row.
        box[:] = (xs[0], ys[0], xs[-1], ys[-1])
    return boxes
def _load_semantic_seg(self, results):
    """Private function to load the semantic segmentation map.

    The file named by ``results['ann_info']['seg_map']`` is fetched through
    ``self.file_client``, decoded with ``flag='unchanged'`` and squeezed.

    Args:
        results (dict): Result dict from :obj:`dataset`.

    Returns:
        dict: The same dict with ``gt_semantic_seg`` set and
        ``'gt_semantic_seg'`` appended to ``seg_fields``.
    """
    seg_path = results['ann_info']['seg_map']
    raw = self.file_client.get(seg_path)
    seg_map = mmcv.imfrombytes(raw, flag='unchanged').squeeze()

    results['gt_semantic_seg'] = seg_map
    results['seg_fields'].append('gt_semantic_seg')
    return results
""" if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) if self.with_mask or self.with_inst: results = self._load_masks(results) if results is None: return None if self.with_seg: results = self._load_semantic_seg(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'with_mask={self.with_mask}, ' repr_str += f'with_seg={self.with_seg}, ' return repr_str ================================================ FILE: external/dataset/pipelines/test_time_aug.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings import mmcv from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import Compose @PIPELINES.register_module() class MultiScaleFlipAugVideo: """Test-time augmentation with multiple scales and flipping. An example configuration is as followed: .. code-block:: img_scale=[(1333, 400), (1333, 800)], flip=True, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ] After MultiScaleFLipAug with above configuration, the results are wrapped into lists of the same length as followed: .. code-block:: dict( img=[...], img_shape=[...], scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)] flip=[False, True, False, True] ... ) Args: transforms (list[dict]): Transforms to apply in each augmentation. img_scale (tuple | list[tuple] | None): Images scales for resizing. scale_factor (float | list[float] | None): Scale factors for resizing. flip (bool): Whether apply flip augmentation. Default: False. flip_direction (str | list[str]): Flip augmentation directions, options are "horizontal", "vertical" and "diagonal". If flip_direction is a list, multiple flip augmentations will be applied. It has no effect when flip == False. Default: "horizontal". 
def __init__(self,
             transforms,
             img_scale=None,
             scale_factor=None,
             flip=False,
             flip_direction='horizontal'):
    """Build the augmentation pipeline and normalise its arguments.

    Args:
        transforms (list[dict]): Transforms to apply in each augmentation.
        img_scale (tuple | list[tuple] | None): Image scales for resizing.
        scale_factor (float | list[float] | None): Scale factors for
            resizing. Exactly one of ``img_scale`` and ``scale_factor``
            must be given.
        flip (bool): Whether to apply flip augmentation. Default: False.
        flip_direction (str | list[str]): Flip augmentation directions.
            Has no effect when ``flip`` is False. Default: "horizontal".
    """
    self.transforms = Compose(transforms)
    assert (img_scale is None) ^ (scale_factor is None), (
        'Must have but only one variable can be set')

    # Both variants are stored in ``self.img_scale``; ``self.scale_key``
    # records which key of the result dict they are written to.
    if img_scale is not None:
        self.img_scale = img_scale if isinstance(img_scale,
                                                 list) else [img_scale]
        self.scale_key = 'scale'
        assert mmcv.is_list_of(self.img_scale, tuple)
    else:
        self.img_scale = scale_factor if isinstance(
            scale_factor, list) else [scale_factor]
        self.scale_key = 'scale_factor'

    self.flip = flip
    self.flip_direction = flip_direction if isinstance(
        flip_direction, list) else [flip_direction]
    assert mmcv.is_list_of(self.flip_direction, str)

    if not self.flip and self.flip_direction != ['horizontal']:
        warnings.warn(
            'flip_direction has no effect when flip is set to False')

    # This class feeds a list of result dicts to ``transforms``, so the
    # flip step may be the sequence variant; accept both names to avoid a
    # spurious warning.
    flip_types = ('RandomFlip', 'SeqRandomFlip')
    if self.flip and not any(t['type'] in flip_types for t in transforms):
        warnings.warn(
            'flip has no effect when RandomFlip is not in transforms')
""" aug_data = [] flip_args = [(False, None)] if self.flip: flip_args += [(True, direction) for direction in self.flip_direction] for scale in self.img_scale: for flip, direction in flip_args: _results = [] for results_single in results: _results_single = results_single.copy() _results_single[self.scale_key] = scale _results_single['flip'] = flip _results_single['flip_direction'] = direction _results.append(_results_single) data = self.transforms(_results) aug_data.append(data) # list of dict to dict of list aug_data_dict = {key: [] for key in aug_data[0]} for data in aug_data: for key, val in data.items(): aug_data_dict[key].append(val) return aug_data_dict def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(transforms={self.transforms}, ' repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' repr_str += f'flip_direction={self.flip_direction})' return repr_str ================================================ FILE: external/dataset/pipelines/transforms.py ================================================ import cv2 import mmcv import numpy as np import warnings from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import Normalize, Pad, RandomFlip, Resize @PIPELINES.register_module() class SeqColorAug(object): """Color augmention for images. Args: prob (list[float]): The probability to perform color augmention for each image. Defaults to [1.0, 1.0]. rgb_var (list[list]]): The values of color augmentaion. Defaults to [[-0.55919361, 0.98062831, -0.41940627], [1.72091413, 0.19879334, -1.82968581], [4.64467907, 4.73710203, 4.88324118]]. """ def __init__(self, prob=[1.0, 1.0], rgb_var=[[-0.55919361, 0.98062831, -0.41940627], [1.72091413, 0.19879334, -1.82968581], [4.64467907, 4.73710203, 4.88324118]]): self.prob = prob self.rgb_var = np.array(rgb_var, dtype=np.float32) def __call__(self, results): """Call function. For each dict in results, perform color augmention for image in the dict. 
@PIPELINES.register_module()
class SeqBlurAug(object):
    """Blur augmention for images.

    Args:
        prob (Sequence[float]): The probability to perform blur augmention
            for each image, indexed by the position of the image in the
            sequence. Images beyond the end of ``prob`` reuse its last
            entry. Defaults to (0.0, 0.2).
    """

    def __init__(self, prob=(0.0, 0.2)):
        # Keep a private copy so that neither the default nor a caller's
        # list is shared with this instance.
        self.prob = list(prob)

    def __call__(self, results):
        """Call function.

        For each dict in results, perform blur augmention for image in the
        dict.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict that contains augmented blur image.
        """
        # Candidate kernel sizes: odd numbers from 5 to 45.
        sizes = np.arange(5, 46, 2)
        last = len(self.prob) - 1

        outs = []
        for i, _results in enumerate(results):
            image = _results['img']
            # Sequences longer than ``prob`` used to raise IndexError here;
            # extra frames now fall back to the last probability.
            if self.prob[min(i, last)] > np.random.random():
                size = np.random.choice(sizes)
                # Cross-shaped kernel: the centre column and the centre row
                # each sum to ``wx`` and ``1 - wx`` respectively.
                kernel = np.zeros((size, size))
                c = int(size / 2)
                wx = np.random.random()
                kernel[:, c] += 1. / size * wx
                kernel[c, :] += 1. / size * (1 - wx)
                image = cv2.filter2D(image, -1, kernel)
            _results['img'] = image
            outs.append(_results)
        return outs
@PIPELINES.register_module()
class SeqRandomFlip(RandomFlip):
    """Randomly flip for images.

    Please refer to `mmdet.datasets.pipelines.transfroms.py:RandomFlip` for
    detailed docstring.

    Args:
        share_params (bool): If True, share the flip parameters for all
            images. Defaults to True. (The default was documented but
            missing from the signature; it now matches ``SeqResize``.)
    """

    def __init__(self, share_params=True, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.share_params = share_params

    def __call__(self, results):
        """Call function.

        For each dict in results, call `RandomFlip` to randomly flip image.
        With ``share_params`` one flip decision is drawn here and written
        into every dict as ``flip`` / ``flip_direction`` before the parent
        transform runs.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict that contains flipped results,
            'flip', 'flip_direction' keys are added into the dict.
        """
        if self.share_params:
            # ``None`` is appended as the "do not flip" outcome.
            if isinstance(self.direction, list):
                direction_list = self.direction + [None]
            else:
                direction_list = [self.direction, None]

            if isinstance(self.flip_ratio, list):
                non_flip_ratio = 1 - sum(self.flip_ratio)
                flip_ratio_list = self.flip_ratio + [non_flip_ratio]
            else:
                non_flip_ratio = 1 - self.flip_ratio
                # A scalar ratio is split evenly over the real directions
                # (the trailing ``None`` is excluded).
                num_directions = len(direction_list) - 1
                single_ratio = self.flip_ratio / num_directions
                flip_ratio_list = [single_ratio] * num_directions + [
                    non_flip_ratio
                ]

            cur_dir = np.random.choice(direction_list, p=flip_ratio_list)
            flip = cur_dir is not None
            for _results in results:
                _results['flip'] = flip
                _results['flip_direction'] = cur_dir

        # Bind the parent call first: zero-argument ``super()`` is not
        # usable inside a comprehension on older Python versions.
        flip_single = super().__call__
        return [flip_single(_results) for _results in results]
def random_crop(self, results, offsets=None):
    """Crop the images, bounding boxes, masks and semantic maps of one frame.

    Args:
        results (dict): Result dict from loading pipeline.
        offsets (tuple, optional): Pre-defined ``(offset_h, offset_w)`` for
            cropping. When None they are drawn with :meth:`get_offsets`.
            Default to None.

    Returns:
        dict | None: The cropped results, with 'img_shape' updated to the
        cropped image shape and the offsets stored in
        ``results['img_info']['crop_offsets']``. None when no 'gt_bboxes'
        box keeps a positive area and ``allow_negative_crop`` is False.
    """
    crop_h, crop_w = self.crop_size[0], self.crop_size[1]

    for img_key in results.get('img_fields', ['img']):
        image = results[img_key]
        if offsets is None:
            offset_h, offset_w = self.get_offsets(image)
        else:
            offset_h, offset_w = offsets
        results['img_info']['crop_offsets'] = (offset_h, offset_w)

        crop_y1, crop_y2 = offset_h, offset_h + crop_h
        crop_x1, crop_x2 = offset_w, offset_w + crop_w

        # crop the image
        image = image[crop_y1:crop_y2, crop_x1:crop_x2, ...]
        img_shape = image.shape
        results[img_key] = image
    results['img_shape'] = img_shape

    # Boxes are (x1, y1, x2, y2), hence the (w, h, w, h) shift.
    shift = np.array([offset_w, offset_h, offset_w, offset_h],
                     dtype=np.float32)
    for box_key in results.get('bbox_fields', []):
        # e.g. gt_bboxes and gt_bboxes_ignore
        boxes = results[box_key] - shift
        if self.bbox_clip_border:
            boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, img_shape[1])
            boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, img_shape[0])

        # Only boxes that still have positive width and height survive.
        keep = (boxes[:, 2] > boxes[:, 0]) & (boxes[:, 3] > boxes[:, 1])
        if box_key == 'gt_bboxes' and not (keep.any()
                                           or self.allow_negative_crop):
            # Nothing left in the crop and negative crops are not allowed:
            # skip this image.
            return None
        results[box_key] = boxes[keep, :]

        # label fields. e.g. gt_labels and gt_labels_ignore
        for label_key in self.bbox2label.get(box_key):
            if label_key in results:
                results[label_key] = results[label_key][keep]

        # mask fields, e.g. gt_masks and gt_masks_ignore
        mask_key = self.bbox2mask.get(box_key)
        if mask_key in results:
            kept_masks = results[mask_key][keep.nonzero()[0]]
            results[mask_key] = kept_masks.crop(
                np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))

    # crop semantic seg
    for seg_key in results.get('seg_fields', []):
        results[seg_key] = results[seg_key][crop_y1:crop_y2, crop_x1:crop_x2]

    return results
def check_match(self, ref_results, results):
    """Tell whether no instance of ``results`` appears in ``ref_results``.

    Args:
        ref_results (dict): Result dict providing the reference
            ``gt_instance_ids`` array.
        results (dict): Result dict providing the ``gt_instance_ids`` array
            to look up.

    Returns:
        numpy.bool_: True when none of the ids in ``results`` is found among
        the ids in ``ref_results`` (including when ``results`` has no ids).
    """
    reference_ids = ref_results['gt_instance_ids'].tolist()

    # Position of each id inside the reference list, -1 when absent.
    positions = []
    for instance_id in results['gt_instance_ids'].tolist():
        if instance_id in reference_ids:
            positions.append(reference_ids.index(instance_id))
        else:
            positions.append(-1)

    return np.all(np.array(positions) == -1)
def get_params(self):
    """Draw one set of photometric distortion parameters.

    Each of 'delta', 'alpha', 'saturation', 'hue' and 'permutation' is
    sampled with probability 0.5 and is None otherwise. 'contrast_first' is
    ``True`` when the drawn mode is 1 and ``0`` otherwise.

    Returns:
        dict: The sampled parameters.
    """

    def maybe(sample):
        # One coin flip per parameter; the sampler only consumes random
        # numbers when the coin comes up 1.
        return sample() if np.random.randint(2) else None

    params = dict()
    # delta
    params['delta'] = maybe(lambda: np.random.uniform(
        -self.brightness_delta, self.brightness_delta))
    # mode
    mode = np.random.randint(2)
    params['contrast_first'] = True if mode == 1 else 0
    # alpha
    params['alpha'] = maybe(lambda: np.random.uniform(
        self.contrast_lower, self.contrast_upper))
    # saturation
    params['saturation'] = maybe(lambda: np.random.uniform(
        self.saturation_lower, self.saturation_upper))
    # hue
    params['hue'] = maybe(lambda: np.random.uniform(
        -self.hue_delta, self.hue_delta))
    # swap
    params['permutation'] = maybe(lambda: np.random.permutation(3))
    return params
""" if params is None: params = self.get_params() results['img_info']['color_jitter'] = params if 'img_fields' in results: assert results['img_fields'] == ['img'], \ 'Only single img_fields is allowed' img = results['img'] assert img.dtype == np.float32, \ 'PhotoMetricDistortion needs the input image of dtype np.float32,' \ ' please set "to_float32=True" in "LoadImageFromFile" pipeline' # random brightness if params['delta'] is not None: img += params['delta'] # mode == 0 --> do random contrast first # mode == 1 --> do random contrast last if params['contrast_first']: if params['alpha'] is not None: img *= params['alpha'] # convert color from BGR to HSV img = mmcv.bgr2hsv(img) # random saturation if params['saturation'] is not None: img[..., 1] *= params['saturation'] # random hue if params['hue'] is not None: img[..., 0] += params['hue'] img[..., 0][img[..., 0] > 360] -= 360 img[..., 0][img[..., 0] < 0] += 360 # convert color from HSV to BGR img = mmcv.hsv2bgr(img) # random contrast if not params['contrast_first']: if params['alpha'] is not None: img *= params['alpha'] # randomly swap channels if params['permutation'] is not None: img = img[..., params['permutation']] results['img'] = img return results def __call__(self, results): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with images distorted. 
""" if self.share_params: params = self.get_params() else: params = None outs = [] for _results in results: _results = self.photo_metric_distortion(_results, params) outs.append(_results) return outs def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' repr_str += 'contrast_range=' repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' repr_str += 'saturation_range=' repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' repr_str += f'hue_delta={self.hue_delta})' return repr_str @PIPELINES.register_module() class ResizeWithRef(object): """Resize images & bbox & mask. This transform resizes the input image to some scale. Bboxes and masks are then resized with the same scale factor. If the input dict contains the key "scale", then the scale in the input dict is used, otherwise the specified scale in the init method is used. `img_scale` can either be a tuple (single-scale) or a list of tuple (multi-scale). There are 3 multiscale modes: - `ratio_range` is not None: randomly sample a ratio from the ratio range and multiply it with the image scale. - `ratio_range` is None and `multiscale_mode` == "range": randomly sample a scale from the a range. - `ratio_range` is None and `multiscale_mode` == "value": randomly sample a scale from multiple scales. Args: img_scale (tuple or list[tuple]): Images scales for resizing. multiscale_mode (str): Either "range" or "value". ratio_range (tuple[float]): (min_ratio, max_ratio) keep_ratio (bool): Whether to keep the aspect ratio when resizing the image. 
""" def __init__(self, img_scale=None, multiscale_mode='range', ratio_range=None, keep_ratio=True): if img_scale is None: self.img_scale = None else: if isinstance(img_scale, list): self.img_scale = img_scale else: self.img_scale = [img_scale] assert mmcv.is_list_of(self.img_scale, tuple) if ratio_range is not None: # mode 1: given a scale and a range of image ratio assert len(self.img_scale) == 1 else: # mode 2: given multiple scales or a range of scales assert multiscale_mode in ['value', 'range'] self.multiscale_mode = multiscale_mode self.ratio_range = ratio_range self.keep_ratio = keep_ratio @staticmethod def random_select(img_scales): assert mmcv.is_list_of(img_scales, tuple) scale_idx = np.random.randint(len(img_scales)) img_scale = img_scales[scale_idx] return img_scale, scale_idx @staticmethod def random_sample(img_scales): assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 img_scale_long = [max(s) for s in img_scales] img_scale_short = [min(s) for s in img_scales] long_edge = np.random.randint( min(img_scale_long), max(img_scale_long) + 1) short_edge = np.random.randint( min(img_scale_short), max(img_scale_short) + 1) img_scale = (long_edge, short_edge) return img_scale, None @staticmethod def random_sample_ratio(img_scale, ratio_range): assert isinstance(img_scale, tuple) and len(img_scale) == 2 min_ratio, max_ratio = ratio_range assert min_ratio <= max_ratio ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) return scale, None def _random_scale(self, results): if self.ratio_range is not None: scale, scale_idx = self.random_sample_ratio( self.img_scale[0], self.ratio_range) elif len(self.img_scale) == 1: scale, scale_idx = self.img_scale[0], 0 elif self.multiscale_mode == 'range': scale, scale_idx = self.random_sample(self.img_scale) elif self.multiscale_mode == 'value': scale, scale_idx = self.random_select(self.img_scale) else: raise NotImplementedError 
results['scale'] = scale results['scale_idx'] = scale_idx def _resize_img(self, results): els = ['ref_img', 'img'] if 'ref_img' in results else ['img'] for el in els: if self.keep_ratio: img, scale_factor = mmcv.imrescale( results[el], results['scale'], return_scale=True) else: img, w_scale, h_scale = mmcv.imresize( results[el], results['scale'], return_scale=True) scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], dtype=np.float32) results[el] = img results['img_shape'] = img.shape results['pad_shape'] = img.shape # in case that there is no padding results['scale_factor'] = scale_factor results['keep_ratio'] = self.keep_ratio def _resize_bboxes(self, results): els = ['ref_bbox_fields', 'bbox_fields'] if 'ref_bbox_fields' in results else ['bbox_fields'] for el in els: img_shape = results['img_shape'] for key in results.get(el, []): bboxes = results[key] * results['scale_factor'] bboxes[:, 0::2] = np.clip( bboxes[:, 0::2], 0, img_shape[1] - 1) bboxes[:, 1::2] = np.clip( bboxes[:, 1::2], 0, img_shape[0] - 1) results[key] = bboxes def _resize_masks(self, results): els = ['ref_mask_fields', 'mask_fields'] if 'ref_mask_fields' in results else ['mask_fields'] for el in els: for key in results.get(el, []): if results[key] is None: continue if self.keep_ratio: masks = [ mmcv.imrescale( mask, results['scale_factor'], interpolation='nearest') for mask in results[key] ] else: mask_size = (results['img_shape'][1], results['img_shape'][0]) masks = [ mmcv.imresize(mask, mask_size, interpolation='nearest') for mask in results[key] ] results[key] = masks def __call__(self, results): if 'scale' not in results: self._random_scale(results) self._resize_img(results) self._resize_bboxes(results) self._resize_masks(results) # self._resize_semantic_seg(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += ('(img_scale={}, multiscale_mode={}, ratio_range={}, ' 'keep_ratio={})').format(self.img_scale, self.multiscale_mode, self.ratio_range, 
@PIPELINES.register_module()
class RandomFlipWithRef(object):
    """Flip the image, the reference image, bboxes and masks.

    If the input dict contains the key "flip", that flag is used;
    otherwise the flip is decided randomly with probability
    ``flip_ratio``.

    Args:
        flip_ratio (float, optional): The flipping probability, in
            ``[0, 1]``. When None, inputs that carry no "flip" key are
            never flipped.
    """

    def __init__(self, flip_ratio=None):
        self.flip_ratio = flip_ratio
        if flip_ratio is not None:
            assert flip_ratio >= 0 and flip_ratio <= 1

    def bbox_flip(self, bboxes, img_shape):
        """Flip bboxes horizontally.

        Args:
            bboxes (ndarray): shape (..., 4*k)
            img_shape (tuple): (height, width)

        Returns:
            ndarray: A flipped copy; the input array is left untouched.
        """
        assert bboxes.shape[-1] % 4 == 0
        w = img_shape[1]
        flipped = bboxes.copy()
        # x1 and x2 swap roles after mirroring around the image width.
        flipped[..., 0::4] = w - bboxes[..., 2::4] - 1
        flipped[..., 2::4] = w - bboxes[..., 0::4] - 1
        return flipped

    def __call__(self, results):
        """Flip all image-like entries of ``results`` when required.

        Args:
            results (dict): Result dict from the loading pipeline.

        Returns:
            dict: The same dict, with ``'flip'`` set and, when it is
            true, image, reference image, bboxes and masks flipped.
        """
        if 'flip' not in results:
            if self.flip_ratio is None:
                # ``np.random.rand() < None`` raises TypeError on
                # Python 3, so a missing ratio means "never flip".
                results['flip'] = False
            else:
                results['flip'] = bool(np.random.rand() < self.flip_ratio)
        if results['flip']:
            # flip image
            results['img'] = mmcv.imflip(results['img'])
            if 'ref_img' in results:
                results['ref_img'] = mmcv.imflip(results['ref_img'])
            # flip bboxes
            for key in results.get('bbox_fields', []):
                results[key] = self.bbox_flip(results[key],
                                              results['img_shape'])
            for key in results.get('ref_bbox_fields', []):
                results[key] = self.bbox_flip(results[key],
                                              results['img_shape'])
            # flip masks
            for key in results.get('mask_fields', []):
                results[key] = [mask[:, ::-1] for mask in results[key]]
            for key in results.get('ref_mask_fields', []):
                results[key] = [mask[:, ::-1] for mask in results[key]]
        return results

    def __repr__(self):
        return self.__class__.__name__ + '(flip_ratio={})'.format(
            self.flip_ratio)
def _pad_img(self, results):
    """Pad ``results['img']`` and, when present, ``results['ref_img']``.

    Pads to ``self.size`` when a fixed size is configured, otherwise to
    the next multiple of ``self.size_divisor``. Also records
    ``'pad_shape'``, ``'pad_fixed_size'`` and ``'pad_size_divisor'`` in
    ``results``.
    """
    els = ['ref_img', 'img'] if 'ref_img' in results else ['img']
    for el in els:
        if self.size is not None:
            # Pad the element being processed: the fixed-size branch
            # used to always pad results['img'], which overwrote
            # 'ref_img' with a padded copy of 'img'. ``shape`` is passed
            # by keyword and ``pad_val`` is forwarded so both modes
            # behave the same way.
            padded_img = mmcv.impad(
                results[el], shape=self.size, pad_val=self.pad_val)
        elif self.size_divisor is not None:
            padded_img = mmcv.impad_to_multiple(
                results[el], self.size_divisor, pad_val=self.pad_val)
        results[el] = padded_img
    # 'img' is processed last, so its padded shape is the one reported.
    results['pad_shape'] = padded_img.shape
    results['pad_fixed_size'] = self.size
    results['pad_size_divisor'] = self.size_divisor
""" def __init__(self, mean, std, to_rgb=True): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) self.to_rgb = to_rgb def __call__(self, results): results['img'] = mmcv.imnormalize( results['img'], self.mean, self.std, self.to_rgb) if 'ref_img' in results: results['ref_img'] = mmcv.imnormalize( results['ref_img'], self.mean, self.std, self.to_rgb) results['img_norm_cfg'] = dict( mean=self.mean, std=self.std, to_rgb=self.to_rgb) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += '(mean={}, std={}, to_rgb={})'.format( self.mean, self.std, self.to_rgb) return repr_str @PIPELINES.register_module() class RandomCropWithRef(object): """Random crop the image & bboxes & masks. Args: crop_size (tuple): Expected size after cropping, (h, w). """ def __init__(self, crop_size): self.crop_size = crop_size def __call__(self, results): img = results['img'] margin_h = max(img.shape[0] - self.crop_size[0], 0) margin_w = max(img.shape[1] - self.crop_size[1], 0) offset_h = np.random.randint(0, margin_h + 1) offset_w = np.random.randint(0, margin_w + 1) crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0] crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1] # crop the image ori_shape = img.shape img = img[crop_y1:crop_y2, crop_x1:crop_x2, :] img_shape = img.shape results['img'] = img if 'ref_img' in results: ref_img = results['ref_img'] ref_img = ref_img[crop_y1:crop_y2, crop_x1:crop_x2, :] results['ref_img'] = ref_img results['img_shape'] = img_shape results['crop_coords'] = [crop_y1, crop_y2, crop_x1, crop_x2] # crop bboxes accordingly and clip to the image boundary els = ['ref_bbox_fields', 'bbox_fields'] if 'ref_bbox_fields' in results else ['bbox_fields'] for el in els: for key in results.get(el, []): bbox_offset = np.array( [offset_w, offset_h, offset_w, offset_h], dtype=np.float32) bboxes = results[key] - bbox_offset bboxes[:, 0::2] = np.clip( bboxes[:, 0::2], 0, img_shape[1] - 1) bboxes[:, 1::2] = 
np.clip( bboxes[:, 1::2], 0, img_shape[0] - 1) results[key] = bboxes # filter out the gt bboxes that are completely cropped els = ['ref_bboxes', 'gt_bboxes'] if 'ref_bboxes' in results else ['gt_bboxes'] for el in els: if el in results: gt_bboxes = results[el] valid_inds = (gt_bboxes[:, 2] > gt_bboxes[:, 0]) & ( gt_bboxes[:, 3] > gt_bboxes[:, 1]) # if no gt bbox remains after cropping, just skip this image if not np.any(valid_inds): return None results[el] = gt_bboxes[valid_inds, :] ell = el.replace('_bboxes', '_labels') if ell in results: results[ell] = results[ell][valid_inds] #### filter gt_obj_ids just like gt_labes. elo = el.replace('_bboxes', '_obj_ids') if elo in results: results[elo] = results[elo][valid_inds] # filter and crop the masks elm = el.replace('_bboxes', '_masks') if elm in results: valid_gt_masks = [] for i in np.where(valid_inds)[0]: gt_mask = results[elm][i][ crop_y1:crop_y2, crop_x1:crop_x2] valid_gt_masks.append(gt_mask) results[elm] = valid_gt_masks return results def __repr__(self): return self.__class__.__name__ + '(crop_size={})'.format( self.crop_size) @PIPELINES.register_module() class PadFutureMMDet: """Pad the image & masks & segmentation map. There are two padding modes: (1) pad to a fixed size and (2) pad to the minimum size that is divisible by some number. Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", Args: size (tuple, optional): Fixed padding size. size_divisor (int, optional): The divisor of padded size. pad_to_square (bool): Whether to pad the image into a square. Currently only used for YOLOX. Default: False. pad_val (dict, optional): A dict for padding value, the default value is `dict(img=0, masks=0, seg=255)`. 
def _pad_img(self, results):
    """Pad every image listed in ``results['img_fields']``.

    Falls back to the single key ``'img'`` when ``'img_fields'`` is
    absent. The target is a square of the image's longer side when
    ``self.pad_to_square`` is set, else ``self.size`` if given, else the
    next multiple of ``self.size_divisor``. Records ``'pad_shape'``,
    ``'pad_fixed_size'`` and ``'pad_size_divisor'`` in ``results``.
    """
    pad_val = self.pad_val.get('img', 0)
    # Work on a local copy of the target size: the square size depends
    # on the current image and must not be written back to the
    # transform's configuration (self.size).
    size = self.size
    for key in results.get('img_fields', ['img']):
        if self.pad_to_square:
            max_size = max(results[key].shape[:2])
            size = (max_size, max_size)
        if size is not None:
            padded_img = mmcv.impad(
                results[key], shape=size, pad_val=pad_val)
        elif self.size_divisor is not None:
            padded_img = mmcv.impad_to_multiple(
                results[key], self.size_divisor, pad_val=pad_val)
        results[key] = padded_img
    results['pad_shape'] = padded_img.shape
    # Same value as before: the size actually used for the last image.
    results['pad_fixed_size'] = size
    results['pad_size_divisor'] = self.size_divisor
@PIPELINES.register_module()
class KNetInsAdapterCherryPick:
    """Remap a hand-picked set of instance class ids, then shift to 0-based.

    Labels equal to the i-th entry of ``cherry`` are decreased by ``i``;
    afterwards every label is decreased by ``stuff_nums``. With the
    defaults ``stuff_nums=11`` and ``cherry=(11, 13)`` this maps
    11 -> 0 and 13 -> 1.

    NOTE(review): the per-entry shift is the entry's position, which
    yields consecutive ids only when the picked ids are spaced by 2 (as
    in the default) - confirm this is intended before using other
    ``cherry`` values.

    Args:
        stuff_nums (int): Offset subtracted from all labels.
        cherry (tuple[int]): The picked class ids, in order.
    """

    def __init__(self, stuff_nums=11, cherry=(11, 13)):
        self.stuff_nums = stuff_nums
        self.cherry = cherry

    def __call__(self, results):
        """Modify ``results['gt_labels']`` in place.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Updated result dict.
        """
        for shift, class_id in enumerate(self.cherry):
            labels = results['gt_labels']
            picked = labels == class_id
            labels[picked] -= shift
        results['gt_labels'] -= self.stuff_nums
        return results
""" rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} init_value_map = {'greater': -inf, 'less': inf} greater_keys = ['mAP', 'AR'] less_keys = ['loss'] def __init__(self, dataloader, start=None, interval=1, by_epoch=True, save_best=None, rule=None, **eval_kwargs): if not isinstance(dataloader, DataLoader): raise TypeError('dataloader must be a pytorch DataLoader, but got' f' {type(dataloader)}') if not interval > 0: raise ValueError(f'interval must be positive, but got {interval}') if start is not None and start < 0: warnings.warn( f'The evaluation start epoch {start} is smaller than 0, ' f'use 0 instead', UserWarning) start = 0 self.dataloader = dataloader self.interval = interval self.by_epoch = by_epoch self.start = start assert isinstance(save_best, str) or save_best is None self.save_best = save_best self.eval_kwargs = eval_kwargs self.initial_epoch_flag = True self.logger = get_root_logger() if self.save_best is not None: self._init_rule(rule, self.save_best) def _init_rule(self, rule, key_indicator): """Initialize rule, key_indicator, comparison_func, and best score. Args: rule (str | None): Comparison rule for best score. key_indicator (str | None): Key indicator to determine the comparison rule. """ if rule not in self.rule_map and rule is not None: raise KeyError(f'rule must be greater, less or None, ' f'but got {rule}.') if rule is None: if key_indicator != 'auto': if any(key in key_indicator for key in self.greater_keys): rule = 'greater' elif any(key in key_indicator for key in self.less_keys): rule = 'less' else: raise ValueError(f'Cannot infer the rule for key ' f'{key_indicator}, thus a specific rule ' f'must be specified.') self.rule = rule self.key_indicator = key_indicator if self.rule is not None: self.compare_func = self.rule_map[self.rule] def before_run(self, runner): if self.save_best is not None: if runner.meta is None: warnings.warn('runner.meta is None. 
def evaluation_flag(self, runner):
    """Judge whether to perform evaluation after this epoch.

    Without a ``start`` epoch the decision is left to
    ``self.every_n_epochs``. With a ``start`` epoch, evaluation happens
    at ``start`` and then every ``interval`` epochs, e.g. at epochs
    3, 5, 7... for ``start == 3`` and ``interval == 2``.

    Returns:
        bool: The flag indicating whether to perform evaluation.
    """
    if self.start is None:
        if self.every_n_epochs(runner, self.interval):
            return True
        # No evaluation during the interval epochs.
        return False

    finished_epochs = runner.epoch + 1
    if finished_epochs < self.start:
        # No evaluation if start is larger than the current epoch.
        return False
    return not ((finished_epochs - self.start) % self.interval)
def evaluate(self, runner, results):
    """Run the dataset evaluation and publish the metrics to the runner.

    Args:
        runner: Object providing ``logger`` and ``log_buffer``.
        results: Predictions handed to ``dataset.evaluate``.

    Returns:
        The value of the metric named ``self.key_indicator`` when
        ``self.save_best`` is set, otherwise None.
    """
    dataset = self.dataloader.dataset
    metrics = dataset.evaluate(
        results, logger=runner.logger, **self.eval_kwargs)

    log_buffer = runner.log_buffer
    for metric_name in metrics:
        log_buffer.output[metric_name] = metrics[metric_name]
    log_buffer.ready = True

    if self.save_best is None:
        return None
    if self.key_indicator == 'auto':
        # infer the indicator (and its rule) from the first metric
        self._init_rule(self.rule, list(metrics)[0])
    return metrics[self.key_indicator]
""" def __init__(self, dataloader, start=None, interval=1, by_epoch=True, tmpdir=None, gpu_collect=False, save_best=None, rule=None, broadcast_bn_buffer=True, **eval_kwargs): super().__init__( dataloader, start=start, interval=interval, by_epoch=by_epoch, save_best=save_best, rule=rule, **eval_kwargs) self.broadcast_bn_buffer = broadcast_bn_buffer self.tmpdir = tmpdir self.gpu_collect = gpu_collect def _broadcast_bn_buffer(self, runner): # Synchronization of BatchNorm's buffer (running_mean # and running_var) is not supported in the DDP of pytorch, # which may cause the inconsistent performance of models in # different ranks, so we broadcast BatchNorm's buffers # of rank 0 to other ranks to avoid this. if self.broadcast_bn_buffer: model = runner.model for name, module in model.named_modules(): if isinstance(module, _BatchNorm) and module.track_running_stats: dist.broadcast(module.running_var, 0) dist.broadcast(module.running_mean, 0) def after_train_epoch(self, runner): if not self.by_epoch or not self.evaluation_flag(runner): return if self.broadcast_bn_buffer: self._broadcast_bn_buffer(runner) tmpdir = self.tmpdir if tmpdir is None: tmpdir = osp.join(runner.work_dir, '.eval_hook') results = multi_gpu_test( runner.model, self.dataloader, tmpdir=tmpdir, gpu_collect=self.gpu_collect) if runner.rank == 0: print('\n') key_score = self.evaluate(runner, results) if self.save_best: self.save_best_checkpoint(runner, key_score) def after_train_iter(self, runner): if self.by_epoch or not self.every_n_iters(runner, self.interval): return if self.broadcast_bn_buffer: self._broadcast_bn_buffer(runner) tmpdir = self.tmpdir if tmpdir is None: tmpdir = osp.join(runner.work_dir, '.eval_hook') results = multi_gpu_test( runner.model, self.dataloader, tmpdir=tmpdir, gpu_collect=self.gpu_collect) if runner.rank == 0: print('\n') key_score = self.evaluate(runner, results) if self.save_best: self.save_best_checkpoint(runner, key_score) ================================================ 
FILE: external/ext/mask.py ================================================ __author__ = 'tsungyi' import pycocotools._mask as _mask # Interface for manipulating masks stored in RLE format. # # RLE is a simple yet efficient format for storing binary masks. RLE # first divides a vector (or vectorized image) into a series of piecewise # constant regions and then for each piece simply stores the length of # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] # (note that the odd counts are always the numbers of zeros). Instead of # storing the counts directly, additional compression is achieved with a # variable bitrate representation based on a common scheme called LEB128. # # Compression is greatest given large piecewise constant regions. # Specifically, the size of the RLE is proportional to the number of # *boundaries* in M (or for an image the number of boundaries in the y # direction). Assuming fairly simple shapes, the RLE representation is # O(sqrt(n)) where n is number of pixels in the object. Hence space usage # is substantially lower, especially for large simple objects (large n). # # Many common operations on masks can be computed directly using the RLE # (without need for decoding). This includes computations such as area, # union, intersection, etc. All of these operations are linear in the # size of the RLE, in other words they are O(sqrt(n)) where n is the area # of the object. Computing these operations on the original mask is O(n). # Thus, using the RLE can result in substantial computational savings. # # The following API functions are defined: # encode - Encode binary masks using RLE. # decode - Decode binary masks encoded via RLE. # merge - Compute union or intersection of encoded masks. # iou - Compute intersection over union between masks. # area - Compute area of encoded masks. # toBbox - Get bounding boxes surrounding encoded masks. 
# frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. # # Usage: # Rs = encode( masks ) # masks = decode( Rs ) # R = merge( Rs, intersect=false ) # o = iou( dt, gt, iscrowd ) # a = area( Rs ) # bbs = toBbox( Rs ) # Rs = frPyObjects( [pyObjects], h, w ) # # In the API the following formats are used: # Rs - [dict] Run-length encoding of binary masks # R - dict Run-length encoding of binary mask # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore # bbs - [nx4] Bounding box(es) stored as [x y w h] # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) # dt,gt - May be either bounding boxes or encoded masks # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). # # Finally, a note about the intersection over union (iou) computation. # The standard iou of a ground truth (gt) and detected (dt) object is # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) # For "crowd" regions, we use a modified criteria. If a gt object is # marked as "iscrowd", we allow a dt to match any subregion of the gt. # Choosing gt' in the crowd gt that best matches the dt can be done using # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) # For crowd gt regions we use this modified criteria above for the iou. # # To compile run "python setup.py build_ext --inplace" # Please do not contact us for help with compiling. # # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
def decode(rleObjs):
    """Decode one RLE object or a list of RLE objects via ``_mask.decode``.

    Args:
        rleObjs (dict | list[dict]): A single run-length encoding or a
            list of them.

    Returns:
        For a list, whatever ``_mask.decode`` returns for it. For a
        single object, the ``[:, :, 0]`` slice of the result obtained by
        decoding a one-element list.
    """
    # isinstance rather than ``type(...) == list`` so list subclasses
    # are treated as a batch too.
    if isinstance(rleObjs, list):
        return _mask.decode(rleObjs)
    return _mask.decode([rleObjs])[:, :, 0]
def _isArrayLike(obj):
    """Return True when ``obj`` is both iterable and sized (list, tuple, ndarray, ...)."""
    return hasattr(obj, '__iter__') and hasattr(obj, '__len__')


class YTVOS:
    """In-memory index over a YouTubeVIS-style annotation dictionary.

    The annotation dictionary is expected to provide the keys ``'videos'``,
    ``'categories'`` and ``'annotations'`` (each optional for indexing).
    After :meth:`createIndex` the following lookups are available:

    * ``anns``      - annotation id -> annotation dict
    * ``vids``      - video id -> video dict
    * ``cats``      - category id -> category dict
    * ``vidToAnns`` - video id -> list of annotation dicts
    * ``catToVids`` - category id -> list of video ids (one entry per annotation)
    """

    def __init__(self, annotation_file=None):
        """Load ``annotation_file`` (a JSON path) and build the index.

        :param annotation_file (str): location of the annotation file; when
            ``None`` an empty API object is created (used by :meth:`loadRes`).
        """
        self.dataset, self.anns, self.cats, self.vids = dict(), dict(), dict(), dict()
        self.vidToAnns, self.catToVids = defaultdict(list), defaultdict(list)
        if annotation_file is not None:
            print('loading annotations into memory...')
            tic = time.time()
            # Context manager so the file handle is closed deterministically.
            with open(annotation_file, 'r') as f:
                dataset = json.load(f)
            assert isinstance(dataset, dict), \
                'annotation file format {} not supported'.format(type(dataset))
            print('Done (t={:0.2f}s)'.format(time.time() - tic))
            self.dataset = dataset
            self.createIndex()

    def createIndex(self):
        """(Re)build all lookup tables from ``self.dataset``."""
        print('creating index...')
        anns, cats, vids = {}, {}, {}
        vidToAnns, catToVids = defaultdict(list), defaultdict(list)
        if 'annotations' in self.dataset:
            for ann in self.dataset['annotations']:
                vidToAnns[ann['video_id']].append(ann)
                anns[ann['id']] = ann

        if 'videos' in self.dataset:
            for vid in self.dataset['videos']:
                vids[vid['id']] = vid

        if 'categories' in self.dataset:
            for cat in self.dataset['categories']:
                cats[cat['id']] = cat

        if 'annotations' in self.dataset and 'categories' in self.dataset:
            for ann in self.dataset['annotations']:
                catToVids[ann['category_id']].append(ann['video_id'])

        print('index created!')

        # create class members
        self.anns = anns
        self.vidToAnns = vidToAnns
        self.catToVids = catToVids
        self.vids = vids
        self.cats = cats

    def info(self):
        """Print every key/value pair of ``self.dataset['info']``."""
        for key, value in self.dataset['info'].items():
            print('{}: {}'.format(key, value))

    def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None):
        """Get ann ids that satisfy given filter conditions. default skips that filter

        :param vidIds  (int array)   : get anns for given vids
               catIds  (int array)   : get anns for given cats
               areaRng (float array) : get anns whose ``avg_area`` lies strictly
                                       inside (areaRng[0], areaRng[1])
               iscrowd (boolean)     : get anns for given crowd label (False or True)
        :return: ids (int array)     : integer array of ann ids
        """
        vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        # len() rather than truthiness: the filters may be numpy arrays.
        if len(vidIds) != 0:
            lists = [self.vidToAnns[vidId] for vidId in vidIds if vidId in self.vidToAnns]
            anns = list(itertools.chain.from_iterable(lists))
        else:
            anns = self.dataset['annotations']
        if len(catIds) != 0:
            anns = [ann for ann in anns if ann['category_id'] in catIds]
        if len(areaRng) != 0:
            anns = [ann for ann in anns
                    if ann['avg_area'] > areaRng[0] and ann['avg_area'] < areaRng[1]]

        if iscrowd is not None:
            return [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
        return [ann['id'] for ann in anns]

    def getCatIds(self, catNms=[], supNms=[], catIds=[]):
        """Get cat ids that satisfy given filter conditions. default skips that filter.

        :param catNms (str array)  : get cats for given cat names
        :param supNms (str array)  : get cats for given supercategory names
        :param catIds (int array)  : get cats for given cat ids
        :return: ids (int array)   : integer array of cat ids
        """
        catNms = catNms if _isArrayLike(catNms) else [catNms]
        supNms = supNms if _isArrayLike(supNms) else [supNms]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        cats = self.dataset['categories']
        if len(catNms) != 0:
            cats = [cat for cat in cats if cat['name'] in catNms]
        if len(supNms) != 0:
            cats = [cat for cat in cats if cat['supercategory'] in supNms]
        if len(catIds) != 0:
            cats = [cat for cat in cats if cat['id'] in catIds]
        return [cat['id'] for cat in cats]

    def getVidIds(self, vidIds=[], catIds=[]):
        """Get vid ids that satisfy given filter conditions.

        :param vidIds (int array) : get vids for given ids
        :param catIds (int array) : get vids with all given cats
        :return: ids (int array)  : integer array of vid ids
        """
        vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        if len(vidIds) == len(catIds) == 0:
            return list(self.vids.keys())

        ids = set(vidIds)
        for i, catId in enumerate(catIds):
            # .get() avoids inserting empty entries into the defaultdict
            # when an unknown category id is queried.
            vids_with_cat = set(self.catToVids.get(catId, ()))
            if i == 0 and len(ids) == 0:
                ids = vids_with_cat
            else:
                ids &= vids_with_cat
        return list(ids)

    def _load(self, table, ids):
        """Shared lookup for loadAnns/loadCats/loadVids.

        Accepts an array-like of ids or a single integer id (Python ``int``
        or numpy integer). Any other scalar yields ``None``.
        """
        if _isArrayLike(ids):
            return [table[i] for i in ids]
        if isinstance(ids, (int, np.integer)):
            return [table[ids]]
        return None

    def loadAnns(self, ids=[]):
        """Load anns with the specified ids.

        :param ids (int array)       : integer ids specifying anns
        :return: anns (object array) : loaded ann objects
        """
        return self._load(self.anns, ids)

    def loadCats(self, ids=[]):
        """Load cats with the specified ids.

        :param ids (int array)       : integer ids specifying cats
        :return: cats (object array) : loaded cat objects
        """
        return self._load(self.cats, ids)

    def loadVids(self, ids=[]):
        """Load vids with the specified ids.

        :param ids (int array)       : integer ids specifying vid
        :return: vids (object array) : loaded vid objects
        """
        return self._load(self.vids, ids)

    def loadRes(self, resFile):
        """Load results and return a result api object.

        :param resFile (str | os.PathLike | list) : path of a JSON result file,
            or an already-loaded list of result dicts. The dicts are updated
            in place (``areas``, ``bboxes``, ``id``, ``avg_area``, ``iscrowd``).
        :return: res (obj) : result api object
        :raises NotImplementedError: for numpy array input, which this class
            has no converter for.
        """
        res = YTVOS()
        res.dataset['videos'] = list(self.dataset['videos'])

        print('Loading and preparing results...')
        tic = time.time()
        # The former ``type(resFile) == unicode`` test raised NameError on
        # Python 3 for every non-str input, including plain result lists.
        if isinstance(resFile, (str, os.PathLike)):
            with open(resFile) as f:
                anns = json.load(f)
        elif isinstance(resFile, np.ndarray):
            raise NotImplementedError(
                'YTVOS.loadRes does not support numpy array results; '
                'pass a list of result dicts or a JSON file path')
        else:
            anns = resFile
        assert type(anns) == list, 'results in not an array of objects'
        annsVidIds = [ann['video_id'] for ann in anns]
        assert set(annsVidIds) <= set(self.getVidIds()), \
            'Results do not correspond to current coco set'
        # Guard against an empty result list before peeking at anns[0].
        if anns and 'segmentations' in anns[0]:
            res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
            for idx, ann in enumerate(anns):
                ann['areas'] = []
                if 'bboxes' not in ann:
                    ann['bboxes'] = []
                for seg in ann['segmentations']:
                    # now only support compressed RLE format as segmentation results
                    if seg:
                        ann['areas'].append(maskUtils.area(seg))
                        if len(ann['bboxes']) < len(ann['areas']):
                            ann['bboxes'].append(maskUtils.toBbox(seg))
                    else:
                        # Frame without a mask: keep per-frame lists aligned.
                        ann['areas'].append(None)
                        if len(ann['bboxes']) < len(ann['areas']):
                            ann['bboxes'].append(None)
                ann['id'] = idx + 1
                valid_areas = [a for a in ann['areas'] if a]
                if len(valid_areas) == 0:
                    ann['avg_area'] = 0
                else:
                    ann['avg_area'] = np.array(valid_areas).mean()
                ann['iscrowd'] = 0
        print('DONE (t={:0.2f}s)'.format(time.time() - tic))

        res.dataset['annotations'] = anns
        res.createIndex()
        return res

    def annToRLE(self, ann, frameId):
        """Convert one frame's segmentation (polygons, uncompressed RLE or RLE) to RLE.

        :param ann (dict)    : annotation with ``video_id`` and ``segmentations``
        :param frameId (int) : index into ``ann['segmentations']``
        :return: rle (dict)  : run-length encoding of that frame's mask
        """
        t = self.vids[ann['video_id']]
        h, w = t['height'], t['width']
        segm = ann['segmentations'][frameId]
        if type(segm) == list:
            # polygon -- a single object might consist of multiple parts
            # we merge all parts into one mask rle code
            rles = maskUtils.frPyObjects(segm, h, w)
            rle = maskUtils.merge(rles)
        elif type(segm['counts']) == list:
            # uncompressed RLE
            rle = maskUtils.frPyObjects(segm, h, w)
        else:
            # rle
            rle = segm
        return rle

    def annToMask(self, ann, frameId):
        """Convert one frame's segmentation to a binary mask.

        :return: binary mask (numpy 2D array)
        """
        rle = self.annToRLE(ann, frameId)
        m = maskUtils.decode(rle)
        return m
@HEADS.register_module()
class InstanceMaskHead(FCNMaskHead):
    """FCN mask head whose test-time output stays a tensor.

    Only :meth:`get_seg_masks` is overridden; it returns the pasted masks as
    one tensor rather than per-class lists.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg,
                      ori_shape, scale_factor, rescale):
        """Paste per-RoI mask predictions into full-image masks.

        Args:
            mask_pred (Tensor or ndarray): shape (n, #class, h, w). A Tensor
                is passed through ``sigmoid``; any other input is converted
                with ``det_bboxes.new_tensor`` and used as is.
            det_bboxes (Tensor): shape (n, 4/5); only the first four columns
                are used.
            det_labels (Tensor): shape (n, ); selects the class channel when
                the head is not class agnostic.
            rcnn_test_cfg (dict): rcnn testing config; ``mask_thr_binary`` is
                read from it.
            ori_shape (Tuple): original image height and width.
            scale_factor (float | Tensor): with ``rescale=True`` the boxes are
                divided by it; otherwise it is only used to derive the output
                size from ``ori_shape``.
            rescale (bool): If True, masks are produced at ``ori_shape``.

        Returns:
            Tensor: masks of shape (n, img_h, img_w); ``torch.bool`` when
            ``mask_thr_binary >= 0``, otherwise ``torch.uint8`` holding the
            prediction scaled by 255. During ONNX export the result of
            torchvision's ``paste_masks_in_image`` is returned instead.
        """
        if not isinstance(mask_pred, torch.Tensor):
            mask_pred = det_bboxes.new_tensor(mask_pred)
        else:
            mask_pred = mask_pred.sigmoid()

        device = mask_pred.device
        bboxes = det_bboxes[:, :4]

        if rescale:
            img_h, img_w = ori_shape[:2]
        else:
            # Output lives in the network-input frame: scale the image size
            # up and leave the boxes untouched (divide by 1.0 below).
            if isinstance(scale_factor, float):
                h_ratio = w_ratio = scale_factor
            else:
                w_ratio = scale_factor[0].item()
                h_ratio = scale_factor[1].item()
            img_h = np.round(ori_shape[0] * h_ratio).astype(np.int32)
            img_w = np.round(ori_shape[1] * w_ratio).astype(np.int32)
            scale_factor = 1.0

        if not isinstance(scale_factor, (float, torch.Tensor)):
            scale_factor = bboxes.new_tensor(scale_factor)
        bboxes = bboxes / scale_factor

        if torch.onnx.is_in_onnx_export():
            # TODO: Remove after F.grid_sample is supported.
            from torchvision.models.detection.roi_heads \
                import paste_masks_in_image
            masks = paste_masks_in_image(mask_pred, bboxes, ori_shape[:2])
            thr = rcnn_test_cfg.get('mask_thr_binary', 0)
            if thr > 0:
                masks = masks >= thr
            return masks

        num_masks = len(mask_pred)
        on_cpu = device.type == 'cpu'
        if on_cpu:
            # One mask per chunk: with skip_empty=True this does the least work.
            num_chunks = num_masks
        else:
            # Larger chunks use GPU parallelism but are bounded by memory.
            num_chunks = int(
                np.ceil(num_masks * img_h * img_w * BYTES_PER_FLOAT /
                        GPU_MEM_LIMIT))
            assert (num_chunks <=
                    num_masks), 'Default GPU_MEM_LIMIT is too small; try increasing it'
        chunks = torch.chunk(torch.arange(num_masks, device=device), num_chunks)

        threshold = rcnn_test_cfg.mask_thr_binary
        binarize = threshold >= 0
        im_mask = torch.zeros(
            num_masks,
            img_h,
            img_w,
            device=device,
            dtype=torch.bool if binarize else torch.uint8)

        if not self.class_agnostic:
            # Keep only the channel of each detection's predicted class.
            mask_pred = mask_pred[range(num_masks), det_labels][:, None]

        for inds in chunks:
            pasted, spatial_inds = _do_paste_mask(
                mask_pred[inds],
                bboxes[inds],
                img_h,
                img_w,
                skip_empty=on_cpu)

            if binarize:
                pasted = (pasted >= threshold).to(dtype=torch.bool)
            else:
                # for visualization and debugging
                pasted = (pasted * 255).to(dtype=torch.uint8)

            im_mask[(inds, ) + spatial_inds] = pasted
        return im_mask
class SeqObj:
    """Frame record identified by the pair (``seq_id``, ``img_id``).

    The wrapped dict may carry any further keys; equality and hashing only
    look at ``seq_id`` and ``img_id``.
    """

    # This divisor is orthogonal with panoptic class-instance divisor.
    DIVISOR = 1000000

    def __init__(self, the_dict: Dict):
        self.dict = the_dict
        assert 'seq_id' in self.dict and 'img_id' in self.dict

    def __hash__(self):
        # Pack both ids into one integer; assumes img_id < DIVISOR so that
        # distinct frames get distinct values -- TODO confirm for new datasets.
        return self.dict['seq_id'] * self.DIVISOR + self.dict['img_id']

    def __eq__(self, other):
        # Defer to Python's default handling for foreign types instead of
        # failing with AttributeError on ``other.dict``.
        if not isinstance(other, SeqObj):
            return NotImplemented
        return (self.dict['seq_id'] == other.dict['seq_id']
                and self.dict['img_id'] == other.dict['img_id'])

    def __getitem__(self, attr):
        return self.dict[attr]

    def __repr__(self):
        return '{}(seq_id={!r}, img_id={!r})'.format(
            type(self).__name__, self.dict['seq_id'], self.dict['img_id'])
@DATASETS.register_module()
class KITTISTEPDVPSDataset:
    """KITTI-STEP video panoptic dataset, optionally with depth.

    Frames are discovered under ``<data_root>/video_sequence/<split>`` by
    looking for ``leftImg8bit`` in the file name; panoptic and depth paths
    are derived by replacing that substring with ``panoptic`` / ``depth``.
    Each sample is a list of frame dicts: the key frame and, when
    ``ref_seq_index`` is non-empty, one reference frame at a randomly chosen
    offset from that list.
    """
    CLASSES = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', 'traffic light', 'traffic sign',
               'vegetation', 'terrain', 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
               'bicycle')

    def __init__(self,
                 pipeline=None,
                 data_root=None,
                 test_mode=False,
                 split='train',
                 ref_seq_index: List[int] = None,
                 is_instance_only: bool = True,
                 with_depth: bool = False
                 ):
        """Scan the split directory and build the list of samples.

        Args:
            pipeline: pipeline config handed to ``Compose``.
            data_root: dataset root; ``~`` is expanded. Required.
            test_mode: selects ``prepare_test_img`` in ``__getitem__``.
            split: sub-directory of ``video_sequence`` to read.
            ref_seq_index: candidate frame offsets for the reference frame;
                ``None`` or empty means single-frame samples.
            is_instance_only: forwarded to the pipeline via the result dict.
            with_depth: also register (and require) a depth file per frame.
        """
        assert data_root is not None
        data_root = os.path.expanduser(data_root)
        video_seq_dir = os.path.join(data_root, 'video_sequence', split)
        assert os.path.exists(video_seq_dir)
        # The substring replacement below must only hit the file name.
        assert 'leftImg8bit' not in video_seq_dir

        self.num_thing_classes = 2
        self.num_stuff_classes = 17
        self.thing_before_stuff = False

        # ref_seq_index is None means no ref img
        if ref_seq_index is None:
            ref_seq_index = []

        filenames = list(map(lambda x: str(x), os.listdir(video_seq_dir)))
        img_names = sorted(list(filter(lambda x: 'leftImg8bit' in x, filenames)))

        images = []
        for item in img_names:
            # File names start with "<seq_id>_<img_id>_".
            seq_id, img_id, _ = item.split(sep="_", maxsplit=2)
            # These four frames of sequence 1 are skipped when depth is used
            # (NOTE(review): presumably their depth maps are missing -- confirm).
            if int(seq_id) == 1 and int(img_id) in [177, 178, 179, 180] and with_depth:
                continue
            item_full = os.path.join(video_seq_dir, item)
            images.append(SeqObj({
                'seq_id': int(seq_id),
                'img_id': int(img_id),
                'img': item_full,
                'depth': item_full.replace('leftImg8bit', 'depth') if with_depth else None,
                'ann': item_full.replace('leftImg8bit', 'panoptic'),
                # This should be modified carefully for each dataset. Usually 255.
                'no_obj_class': 255
            }))
            assert os.path.exists(images[-1]['img'])
            assert images[-1]['depth'] is None or os.path.exists(images[-1]['depth']), \
                "Missing depth : {}".format(images[-1]['depth'])
            # assert os.path.exists(images[-1]['ann'])

        # Lookup of frames by their (seq_id, img_id) hash.
        reference_images = {hash(image): image for image in images}

        sequences = []
        for img_cur in images:
            is_seq = True
            seq_now = [img_cur.dict]
            if ref_seq_index:
                # One offset is drawn at random, once, at construction time.
                for index in random.choices(ref_seq_index, k=1):
                    query_obj = SeqObj({
                        'seq_id': img_cur.dict['seq_id'],
                        'img_id': img_cur.dict['img_id'] + index
                    })
                    if hash(query_obj) in reference_images:
                        seq_now.append(reference_images[hash(query_obj)].dict)
                    else:
                        # Reference frame does not exist: drop this sample.
                        is_seq = False
                        break
            if is_seq:
                sequences.append(seq_now)

        self.sequences = sequences
        self.ref_seq_index = ref_seq_index

        # mmdet
        self.pipeline = Compose(pipeline)
        self.test_mode = test_mode

        # misc
        self.flag = self._set_groups()
        self.is_instance_only = is_instance_only

        # For evaluation
        self.max_ins = 10000
        self.no_obj_id = 255

    def pre_pipelines(self, results):
        """Add the bookkeeping keys the pipeline expects to every frame dict (in place)."""
        for _results in results:
            _results['img_info'] = []
            # [thing_lower, thing_upper) is the label range of thing classes.
            _results['thing_lower'] = 0 if self.thing_before_stuff else self.num_stuff_classes
            _results['thing_upper'] = self.num_thing_classes \
                if self.thing_before_stuff else self.num_stuff_classes + self.num_thing_classes
            _results['is_instance_only'] = self.is_instance_only
            _results['ori_filename'] = os.path.basename(_results['img'])

    def prepare_train_img(self, idx):
        """Get training data and annotations after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training data and annotation after pipeline with new keys \
                introduced by pipeline.
        """
        # Deep copy so the pipeline never mutates the stored frame dicts.
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        return self.pipeline(results)

    def prepare_test_img(self, idx):
        """Get test data after pipeline; a single frame dict when no references are used."""
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        # During test time, one image inference does not requires seq
        if not self.ref_seq_index:
            results = results[0]
        return self.pipeline(results)

    def _rand_another(self, idx):
        """Get another random index from the same group as the given index."""
        pool = np.where(self.flag == self.flag[idx])[0]
        return np.random.choice(pool)

    # Copy and Modify from mmdet
    def __getitem__(self, idx):
        """Get training/test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training/test data (with annotation if `test_mode` is set \
                True).
        """

        if self.test_mode:
            return self.prepare_test_img(idx)
        else:
            # Resample until the pipeline yields a usable sample.
            while True:
                cur_data = self.prepare_train_img(idx)
                if cur_data is None:
                    idx = self._rand_another(idx)
                    continue
                return cur_data

    def __len__(self):
        """Total number of samples of data."""
        return len(self.sequences)

    def _set_groups(self):
        """Return an all-zero group flag, i.e. every sample is in one group."""
        return np.zeros((len(self)), dtype=np.int64)

    # The evaluate func
    def evaluate(
            self,
            results,
            **kwargs
    ):
        """Compute PQ / SQ / RQ, mIoU and (when depth is predicted) abs-rel.

        Args:
            results: one 5-tuple per sample,
                ``(bbox_results, mask_results, seg_results, depth, depth_final)``,
                where ``seg_results`` is ``(inst_map, seg_info)``. The last
                entry is ``None`` when no depth was predicted.
                ``inst_map`` is modified in place (stuff segments are zeroed).

        Returns:
            dict: ``abs_rel``, ``abs_rel_final``, ``PQ``, ``Stuff PQ``,
            ``Things PQ`` and ``mIoU`` (the last four in percent).
        """
        # logger and metric
        # Model thing index -> dataset class id (11 'person', 13 'car' in CLASSES).
        thing_knet2real = [11, 13]
        pred_results_handled = []
        pred_depth = []
        pred_depth_final = []
        item_id = 0
        sem_preds = []
        for item in results:
            if item[-1] is not None:
                # With depth
                bbox_results, mask_results, seg_results, depth, depth_final = item
                pred_depth.append(depth)
                pred_depth_final.append(depth_final)
            else:
                bbox_results, mask_results, seg_results, _, _ = item
            # in seg_info id starts from 1
            inst_map, seg_info = seg_results
            # Pixels not covered by any segment get the extra "unlabeled" id.
            cat_map = np.zeros_like(inst_map) + self.num_thing_classes + self.num_stuff_classes
            for instance in seg_info:
                cat_cur = instance['category_id']
                if instance['isthing']:
                    cat_cur = thing_knet2real[cat_cur]
                else:
                    if self.thing_before_stuff:
                        raise NotImplementedError
                    else:
                        # stuff starts from 1 in the model
                        cat_cur -= 1
                        # Shift the stuff index past every thing id at or
                        # below it to obtain the dataset class id.
                        offset = 0
                        for thing_id in thing_knet2real:
                            if cat_cur + offset >= thing_id:
                                offset += 1
                        cat_cur += offset
                assert cat_cur < self.num_thing_classes + self.num_stuff_classes
                cat_map[inst_map == instance['id']] = cat_cur
                if not instance['isthing']:
                    # Stuff carries no instance id.
                    inst_map[inst_map == instance['id']] = 0
            # Panoptic id = class * max_ins + instance.
            pred_results_handled.append(cat_map.astype(np.int32) * self.max_ins + inst_map.astype(np.int32))
            item_id += 1
            sem_preds.append(cat_map)

        gt_panseg = []
        gt_depth = []
        sem_targets = []
        for item in self.sequences:
            # Only for single
            item = item[0]
            # Only for single
            # Annotation PNG read as RGB: R is used as the semantic class,
            # G * 256 + B as the instance id.
            id_map = mmcv.imread(item['ann'], flag='color', channel_order='rgb')
            gt_semantic_seg = id_map[..., 0].astype(np.int32)
            sem_targets.append(gt_semantic_seg)
            gt_inst_map = id_map[..., 1].astype(np.int32) * 256 + id_map[..., 2].astype(np.int32)
            ps_id = gt_semantic_seg * self.max_ins + gt_inst_map
            gt_panseg.append(ps_id)
            if len(pred_depth) > 0:
                gt_depth_cur = mmcv.imread(item['depth'], flag='unchanged').astype(np.float32) / 256.
                gt_depth.append(gt_depth_cur)

        vpq_results = []
        for pred, gt in zip(pred_results_handled, gt_panseg):
            vpq_result = vpq_eval([pred, gt])
            vpq_results.append(vpq_result)

        # Sum per-image statistics and drop the trailing "unlabeled" slot.
        iou_per_class = np.stack([result[0] for result in vpq_results]).sum(axis=0)[
                        :self.num_thing_classes + self.num_stuff_classes]
        tp_per_class = np.stack([result[1] for result in vpq_results]).sum(axis=0)[
                       :self.num_thing_classes + self.num_stuff_classes]
        fn_per_class = np.stack([result[2] for result in vpq_results]).sum(axis=0)[
                       :self.num_thing_classes + self.num_stuff_classes]
        fp_per_class = np.stack([result[3] for result in vpq_results]).sum(axis=0)[
                       :self.num_thing_classes + self.num_stuff_classes]

        abs_rels = []
        abs_rel_finals = []
        if len(pred_depth) > 0:
            for pred, pred_final, gt in zip(pred_depth, pred_depth_final, gt_depth):
                # Only pixels with positive ground-truth depth are scored.
                depth_mask = gt > 0.
                abs_rel_normal = np.mean(
                    np.abs(
                        pred[depth_mask] -
                        gt[depth_mask]) /
                    gt[depth_mask])
                abs_rel_final = np.mean(
                    np.abs(
                        pred_final[depth_mask] -
                        gt[depth_mask]) /
                    gt[depth_mask])
                abs_rels.append(abs_rel_normal)
                abs_rel_finals.append(abs_rel_final)
            abs_rel = np.stack(abs_rels).mean(axis=0)
            abs_rel_final = np.stack(abs_rel_finals).mean(axis=0)
        else:
            abs_rel = 0.
            abs_rel_final = 0.

        # calculate the PQs
        # With epsilon = 0 a class without matches yields NaN here; the
        # NaNs are replaced by 0 through nan_to_num in the returned means.
        epsilon = 0.
        sq = iou_per_class / (tp_per_class + epsilon)
        rq = tp_per_class / (tp_per_class + 0.5 * fn_per_class + 0.5 * fp_per_class + epsilon)
        pq = sq * rq
        things_index = np.zeros((19,)).astype(bool)
        things_index[11] = True
        things_index[13] = True
        stuff_pq = pq[np.logical_not(things_index)]
        things_pq = pq[things_index]

        miou_per_class = eval_miou(sem_preds, sem_targets, num_classes=self.num_thing_classes + self.num_stuff_classes)
        print("class pq\t\tsq\t\trq\t\ttp\t\tfp\t\tfn\t\tmIoU")
        for i in range(len(self.CLASSES)):
            print("{}{}{:.3f}\t\t{:.3f}\t\t{:.3f}\t\t{:.0f}\t\t{:.0f}\t\t{:.0f}\t\t{:.3f}".format(
                self.CLASSES[i],
                ' ' * (13 - len(self.CLASSES[i])),
                pq[i],
                sq[i],
                rq[i],
                tp_per_class[i],
                fp_per_class[i],
                fn_per_class[i],
                miou_per_class[i]
            ))

        return {
            "abs_rel": abs_rel,
            "abs_rel_final": abs_rel_final,
            "PQ": np.nan_to_num(pq).mean() * 100,
            "Stuff PQ": np.nan_to_num(stuff_pq).mean() * 100,
            "Things PQ": np.nan_to_num(things_pq).mean() * 100,
            "mIoU": np.nan_to_num(miou_per_class).mean() * 100,
        }
def vpq_eval(element, num_classes=19, max_ins=10000, ign_id=255):
    """Accumulate panoptic-quality statistics for one prediction/ground-truth pair.

    Both maps encode every pixel as ``category * max_ins + instance``.

    Args:
        element: pair ``(pred_ids, gt_ids)`` of equally shaped integer arrays.
        num_classes (int): number of real classes; one extra slot is appended
            for the "unlabeled" prediction category ``num_classes``.
        max_ins (int): divisor separating category from instance id.
        ign_id (int): ground-truth category that is ignored (void).

    Returns:
        tuple[np.ndarray, ...]: ``(iou, tp, fn, fp)`` per class, each of
        length ``num_classes + 1``. ``iou`` holds the summed IoU of matched
        segments. A category index above ``num_classes`` (other than
        ``ign_id`` in the ground truth) raises ``IndexError``.
    """
    pred_ids, gt_ids = element
    # Multiplier used to pack a (gt id, pred id) pair into one int64.
    offset = 2 ** 30
    num_cat = num_classes + 1

    iou_per_class = np.zeros(num_cat, dtype=np.float64)
    tp_per_class = np.zeros(num_cat, dtype=np.float64)
    fn_per_class = np.zeros(num_cat, dtype=np.float64)
    fp_per_class = np.zeros(num_cat, dtype=np.float64)

    def _ids_to_counts(id_array):
        """Map every distinct id to its pixel count, using Python ints."""
        ids, counts = np.unique(id_array, return_counts=True)
        # tolist() yields Python ints: arithmetic such as ``gt_id * offset``
        # is then exact instead of wrapping in a 32-bit NumPy scalar type.
        return dict(zip(ids.tolist(), counts.tolist()))

    pred_areas = _ids_to_counts(pred_ids)
    gt_areas = _ids_to_counts(gt_ids)
    int_areas = _ids_to_counts(
        gt_ids.astype(np.int64) * offset + pred_ids.astype(np.int64))

    void_id = ign_id * max_ins
    ign_ids = [gt_id for gt_id in gt_areas if gt_id // max_ins == ign_id]

    def prediction_void_overlap(pred_id):
        """Pixels of ``pred_id`` that fall on the instance-less void id."""
        return int_areas.get(void_id * offset + pred_id, 0)

    def prediction_ignored_overlap(pred_id):
        """Pixels of ``pred_id`` that fall on any ignored ground-truth id."""
        return sum(int_areas.get(gt_id * offset + pred_id, 0)
                   for gt_id in ign_ids)

    gt_matched = set()
    pred_matched = set()

    for int_id, int_area in int_areas.items():
        gt_id, pred_id = divmod(int_id, offset)
        gt_cat = gt_id // max_ins
        pred_cat = pred_id // max_ins
        if gt_cat != pred_cat:
            continue
        # Void pixels under the prediction do not count towards the union.
        union = (
            gt_areas[gt_id] + pred_areas[pred_id] - int_area -
            prediction_void_overlap(pred_id)
        )
        iou = int_area / union
        if iou > 0.5:
            tp_per_class[gt_cat] += 1
            iou_per_class[gt_cat] += iou
            gt_matched.add(gt_id)
            pred_matched.add(pred_id)

    for gt_id in gt_areas:
        if gt_id in gt_matched:
            continue
        cat_id = gt_id // max_ins
        if cat_id == ign_id:
            continue
        fn_per_class[cat_id] += 1

    for pred_id, pred_area in pred_areas.items():
        if pred_id in pred_matched:
            continue
        # A prediction lying mostly on ignored ground truth is not penalised.
        if (prediction_ignored_overlap(pred_id) / pred_area) > 0.5:
            continue
        fp_per_class[pred_id // max_ins] += 1

    return iou_per_class, tp_per_class, fn_per_class, fp_per_class
@DETECTORS.register_module()
class PanopticFPN(TwoStageDetector):
    """Implementation of `Panoptic FPN <https://arxiv.org/abs/1901.02446>`_.

    A two-stage detector whose only addition is the ``with_semantic``
    property; all constructor arguments are forwarded to
    ``TwoStageDetector`` unchanged.
    """

    def __init__(self,
                 backbone,
                 rpn_head,
                 roi_head,
                 train_cfg,
                 test_cfg,
                 neck=None,
                 pretrained=None):
        super().__init__(
            backbone=backbone,
            neck=neck,
            rpn_head=rpn_head,
            roi_head=roi_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            pretrained=pretrained)

    @property
    def with_semantic(self):
        """bool: whether the detector has a semantic head"""
        # Preferred source: the RoI head reporting its own semantic branch.
        via_roi_head = hasattr(self, 'roi_head') and self.roi_head.with_semantic
        if via_roi_head:
            return via_roi_head
        # Otherwise look for a semantic head attached to the detector itself.
        return hasattr(self, 'semantic_head') and self.semantic_head is not None
class PanopticTestMixin(object):
    """Test-time helpers for heads that own a ``semantic_head``."""

    def simple_test_semantic(self, x, img_metas):
        """Run the semantic head once and post-process each image separately.

        Returns:
            list: one entry per image, the first element of what
            ``semantic_head.get_semantic_seg`` returns for that image.
        """
        segm_feature_pred = self.semantic_head(x)
        semantic_seg_results = []
        for i, img_meta in enumerate(img_metas):
            # Slice with i:i + 1 to keep the batch dimension.
            semantic_seg_results.append(
                self.semantic_head.get_semantic_seg(segm_feature_pred[i:i + 1],
                                                    img_meta['ori_shape'],
                                                    img_meta['img_shape'])[0])
        return semantic_seg_results

    def generate_panoptic(self, det_bboxes, det_labels, mask_preds, sem_seg,
                          img_metas, merge_cfg):
        """Merge instance and semantic predictions image by image.

        Returns:
            list: one ``merge_stuff_thing`` result per entry of ``img_metas``.
        """
        panoptic_results = []
        for i in range(len(img_metas)):
            panoptic_results.append(
                merge_stuff_thing(det_bboxes[i], det_labels[i], mask_preds[i],
                                  sem_seg[i], merge_cfg))
        return panoptic_results


@HEADS.register_module()
class PanopticHead(StandardRoIHead, PanopticTestMixin):
    """Panoptic Segmentation Head for Panoptic Seg."""

    def __init__(self, *args, semantic_head, **kwargs):
        """Build the RoI head and the semantic head.

        Args:
            semantic_head (dict): config passed to ``build_head``
                (keyword-only, required).
        """
        super(PanopticHead, self).__init__(*args, **kwargs)
        self.semantic_head = build_head(semantic_head)

    @property
    def with_semantic(self):
        """bool: whether the head has semantic head"""
        if hasattr(self, 'semantic_head') and self.semantic_head is not None:
            return True
        else:
            return False

    def init_weights(self, pretrained):
        """Initialize the weights in head.

        Args:
            pretrained (str, optional): Path to pre-trained weights,
                forwarded to the parent ``init_weights``. The argument is
                required here (it has no default value).
        """
        super().init_weights(pretrained)
        if self.with_semantic:
            self.semantic_head.init_weights()

    def forward_train(self,
                      x,
                      img_metas,
                      proposal_list,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None):
        """
        Args:
            x (list[Tensor]): list of multi-level img features.
            img_metas (list[dict]): list of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                `mmdet/datasets/pipelines/formatting.py:Collect`.
            proposal_list (list[Tensors]): list of region proposals.
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): class indices corresponding to each box
            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
                boxes can be ignored when computing the loss.
            gt_masks (None | Tensor) : true segmentation masks for each box
                used if the architecture supports a segmentation task.
            gt_semantic_seg (None | Tensor): semantic segmentation target,
                indexed below as (batch, channel, height, width). Required
                when a semantic head is present. It is modified in place:
                everything outside each image's 'img_shape' is set to the
                semantic head's ignore label.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        # assign gts and sample proposals
        if self.with_bbox or self.with_mask:
            num_imgs = len(img_metas)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []
            for i in range(num_imgs):
                assign_result = self.bbox_assigner.assign(
                    proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i],
                    gt_labels[i])
                sampling_result = self.bbox_sampler.sample(
                    assign_result,
                    proposal_list[i],
                    gt_bboxes[i],
                    gt_labels[i],
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        losses = dict()
        # bbox head forward and loss
        if self.with_bbox:
            bbox_results = self._bbox_forward_train(x, sampling_results,
                                                    gt_bboxes, gt_labels,
                                                    img_metas)
            losses.update(bbox_results['loss_bbox'])

        # mask head forward and loss
        # NOTE: relies on bbox_results from the branch above, i.e. a mask
        # head without a bbox head is not supported here.
        if self.with_mask:
            mask_results = self._mask_forward_train(x, sampling_results,
                                                    bbox_results['bbox_feats'],
                                                    gt_masks, img_metas)
            losses.update(mask_results['loss_mask'])

        if self.with_semantic:
            # Mask out the rows/columns beyond 'img_shape' (batch padding)
            # so they do not contribute to the semantic loss.
            for i in range(gt_semantic_seg.shape[0]):
                gt_semantic_seg[i, :, img_metas[i]['img_shape']
                                [0]:, :] = self.semantic_head.ignore_label
                gt_semantic_seg[i, :, :, img_metas[i]['img_shape']
                                [1]:] = self.semantic_head.ignore_label
            seg_preds = self.semantic_head(x)
            seg_losses = self.semantic_head.loss(seg_preds, gt_semantic_seg)
            losses.update(seg_losses)
        return losses

    async def async_simple_test(self,
                                x,
                                proposal_list,
                                img_metas,
                                proposals=None,
                                rescale=False):
        """Async test without augmentation."""
        raise NotImplementedError('PanopticHead does not support async test')

    def simple_test(self,
                    x,
                    proposal_list,
                    img_metas,
                    proposals=None,
                    rescale=False):
        """Test without augmentation.

        Returns:
            * ONNX export: ``(det_bboxes, det_labels)`` plus ``segm_results``
              when a mask head exists.
            * no mask head: the list of per-image bbox results.
            * mask head: a list of per-image tuples
              ``(bbox_result, segm_result)``, extended by the panoptic
              result when a semantic head exists.
        """
        assert self.with_bbox, 'Bbox head must be implemented.'

        det_bboxes, det_labels = self.simple_test_bboxes(
            x, img_metas, proposal_list, self.test_cfg, rescale=rescale)
        if torch.onnx.is_in_onnx_export():
            if self.with_mask:
                segm_results = self.simple_test_mask(
                    x, img_metas, det_bboxes, det_labels, rescale=rescale)
                return det_bboxes, det_labels, segm_results
            else:
                return det_bboxes, det_labels

        bbox_results = [
            bbox2result(det_bboxes[i], det_labels[i],
                        self.bbox_head.num_classes)
            for i in range(len(det_bboxes))
        ]

        if not self.with_mask:
            return bbox_results
        else:
            mask_preds = self.simple_test_mask(
                x, img_metas, det_bboxes, det_labels, rescale=rescale)
            # Per-class numpy masks for the standard instance output.
            segm_results = mask2result(mask_preds, det_labels,
                                       self.mask_head.num_classes)

            if self.with_semantic:
                sem_seg = self.simple_test_semantic(x, img_metas)
                panoptic_results = self.generate_panoptic(
                    det_bboxes, det_labels, mask_preds, sem_seg, img_metas,
                    self.test_cfg.merge_stuff_thing)
                return list(zip(bbox_results, segm_results, panoptic_results))

            return list(zip(bbox_results, segm_results))
def mask2result(mask_preds, labels, num_classes):
    """Group every image's instance masks by predicted class.

    Args:
        mask_preds (list): one entry per image. An entry that is already a
            list is passed through untouched; otherwise it is indexed along
            its first axis, one mask per instance.
        labels (list): per-image class index of every instance.
        num_classes (int): number of per-class buckets to create.

    Returns:
        list[list[list[np.ndarray]]]: ``result[img][cls]`` holds the masks of
        that class as numpy arrays.
    """
    grouped = []
    for batch_id, mask_pred in enumerate(mask_preds):
        if isinstance(mask_pred, list):
            per_class = mask_pred
        else:
            per_class = [[] for _ in range(num_classes)]
            img_labels = labels[batch_id]
            for inst_idx in range(mask_pred.shape[0]):
                per_class[img_labels[inst_idx]].append(
                    mask_pred[inst_idx].detach().cpu().numpy())
        grouped.append(per_class)
    return grouped


def merge_stuff_thing(det_bboxes,
                      det_labels,
                      mask_preds,
                      sem_seg,
                      merge_cfg=None):
    """Merge stuff and thing segmentation maps.

    This function is modified from
    https://github.com/facebookresearch/detectron2/blob/master/detectron2/modeling/meta_arch/panoptic_fpn.py#L183  # noqa

    Instances are placed in descending score order; afterwards every
    semantic label except 0 fills the pixels that are still free.

    Args:
        det_bboxes (torch.Tensor): Bounding boxes in shape (n, 5); the last
            column is used as score.
        det_labels (torch.Tensor): Labels of bounding boxes in shape (n, ).
        mask_preds (torch.Tensor | list): Instance masks indexed by
            detection; a list is treated as "no instance masks".
        sem_seg (torch.Tensor): Semantic scores; reduced with
            ``argmax(dim=0)``.
        merge_cfg: object exposing ``instance_score_thr``, ``iou_thr`` and
            ``stuff_max_area`` as attributes.

    Returns:
        tuple[np.ndarray, list[dict]]: int32 segment-id map (0 = unassigned)
        and one info dict per segment.
    """
    sem_seg = sem_seg.argmax(dim=0)
    panoptic_seg = torch.zeros_like(sem_seg, dtype=torch.int32)

    scores = det_bboxes[:, -1]
    # Highest score first.
    order = torch.argsort(-scores)

    instance_masks = None
    if not isinstance(mask_preds, list):
        instance_masks = mask_preds.to(
            dtype=torch.bool, device=panoptic_seg.device)

    segments_info = []
    segment_id = 0

    # Things: add instances one by one, resolving overlaps in favour of the
    # segments that are already placed.
    for inst_id in order:
        score = scores[inst_id].item()
        if score < merge_cfg.instance_score_thr:
            # Sorted by score, so every remaining instance fails as well.
            break
        mask = instance_masks[inst_id]  # H,W
        mask_area = mask.sum().item()
        if mask_area == 0:
            continue

        overlap_area = ((mask > 0) & (panoptic_seg > 0)).sum().item()
        if overlap_area * 1.0 / mask_area > merge_cfg.iou_thr:
            continue
        if overlap_area > 0:
            mask = mask & (panoptic_seg == 0)

        segment_id += 1
        panoptic_seg[mask] = segment_id
        segments_info.append({
            'id': segment_id,
            'isthing': True,
            'score': score,
            'category_id': det_labels[inst_id].item(),
            'instance_id': inst_id.item(),
        })

    # Stuff: fill the remaining empty areas with the semantic prediction.
    for semantic_label in torch.unique(sem_seg).cpu().tolist():
        if semantic_label == 0:  # 0 is a special "thing" class
            continue
        free_mask = (sem_seg == semantic_label) & (panoptic_seg == 0)
        free_area = free_mask.sum().item()
        # Regions smaller than the configured area are dropped.
        if free_area < merge_cfg.stuff_max_area:
            continue

        segment_id += 1
        panoptic_seg[free_mask] = segment_id
        segments_info.append({
            'id': segment_id,
            'isthing': False,
            'category_id': semantic_label,
            'area': free_area,
        })

    return panoptic_seg.cpu().numpy(), segments_info
@HEADS.register_module()
class SemanticHead(FusedSemanticHead):
    """Semantic segmentation head that can be used in panoptic segmentation.

    A decoder (built with ``build_neck``) turns the input features into a
    single feature map, which a 1x1 convolution maps to per-class logits.

    Args:
        semantic_decoder (dict): Config dict of the decoder. It usually is a
            neck, like semantic FPN.
        in_channels (int, optional): Input channels of the 1x1 logits
            convolution. Defaults to 256.
        num_classes (int, optional): Number of semantic classes including
            the background. Defaults to 183.
        ignore_label (int, optional): Label value ignored by the loss.
            Defaults to 255.
        pred_stride (int, optional): Factor by which predictions are
            upsampled before the loss and before cropping at inference.
            Defaults to 4.
        loss_seg (dict, optional): Config dict of loss. Defaults to
            `dict(type='CrossEntropyLoss', use_sigmoid=False, \
                loss_weight=1.0)`.
        conv_cfg (dict, optional): Config of convolutional layers.
            Defaults to None.
        norm_cfg (dict, optional): Config of normalization layers.
            Defaults to None.
    """

    def __init__(self,
                 semantic_decoder,
                 in_channels=256,
                 num_classes=183,
                 ignore_label=255,
                 pred_stride=4,
                 loss_seg=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=False,
                     loss_weight=1.0),
                 conv_cfg=None,
                 norm_cfg=None):
        # NOTE: starts the MRO lookup *after* FusedSemanticHead, so that
        # class' own __init__ is bypassed (presumably to avoid building its
        # layers -- confirm against mmdet).
        super(FusedSemanticHead, self).__init__()

        self.in_channels = in_channels
        self.num_classes = num_classes
        self.ignore_label = ignore_label
        self.pred_stride = pred_stride
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.fp16_enabled = False

        self.semantic_decoder = build_neck(semantic_decoder)
        self.conv_logits = nn.Conv2d(in_channels, num_classes, 1)
        self.loss_seg = build_loss(loss_seg)

    def init_weights(self):
        """Kaiming-initialise the logits convolution."""
        kaiming_init(self.conv_logits)

    @auto_fp16()
    def forward(self, feats):
        """Decode ``feats`` and return the per-class logits."""
        decoded = self.semantic_decoder(feats)
        return self.conv_logits(decoded)

    @force_fp32(apply_to=('mask_pred', ))
    def loss(self, mask_pred, labels):
        """Compute the weighted cross-entropy segmentation loss.

        Args:
            mask_pred (torch.Tensor): Logits produced by :meth:`forward`;
                upsampled bilinearly by ``pred_stride`` before the loss.
            labels (torch.Tensor): Ground truth; dimension 1 is squeezed and
                the result is cast to ``long``.

        Returns:
            dict: ``{'loss_sem_seg': loss}``.
        """
        upsampled = F.interpolate(
            mask_pred,
            scale_factor=self.pred_stride,
            mode='bilinear',
            align_corners=False)
        target = labels.squeeze(1).long()
        cross_entropy = F.cross_entropy(
            upsampled,
            target,
            reduction='mean',
            ignore_index=self.ignore_label)
        # Only the weight of the configured loss module is used here; the
        # cross entropy itself comes from torch.nn.functional.
        return dict(loss_sem_seg=self.loss_seg.loss_weight * cross_entropy)

    def get_semantic_seg(self, seg_preds, ori_shape, img_shape_withoutpad):
        """Obtain the semantic score map used for panoptic segmentation.

        The prediction is upsampled by ``pred_stride``, cropped to
        ``img_shape_withoutpad`` and finally resized to ``ori_shape``.

        Args:
            seg_preds (torch.Tensor): Segmentation prediction (logits).
            ori_shape (tuple[int]): Target shape; only its first two entries
                (height, width) are used.
            img_shape_withoutpad (tuple[int]): Height and width kept when
                cropping the upsampled prediction.

        Returns:
            torch.Tensor: Bilinearly resized logits.
        """
        # Written with a single-image batch in mind.
        upsampled = F.interpolate(
            seg_preds,
            scale_factor=self.pred_stride,
            mode='bilinear',
            align_corners=False)
        valid_h, valid_w = img_shape_withoutpad[0], img_shape_withoutpad[1]
        cropped = upsampled[:, :, 0:valid_h, 0:valid_w]
        return F.interpolate(
            cropped,
            size=ori_shape[0:2],
            mode='bilinear',
            align_corners=False)
class SeqObj:
    """Hashable view of a frame dict, identified by ``(seq_id, img_id)``.

    Two objects compare equal when both ids match; every other key of the
    wrapped dict is ignored for equality and hashing.

    Args:
        the_dict (Dict): Frame description. Must contain the keys
            ``'seq_id'`` and ``'img_id'``; the dict is stored by reference.
    """

    # This divisor is orthogonal with panoptic class-instance divisor.
    DIVISOR = 1000000

    def __init__(self, the_dict: Dict):
        self.dict = the_dict
        assert 'seq_id' in self.dict and 'img_id' in self.dict

    def __hash__(self):
        # seq_id is placed in the high digits, img_id in the low ones.
        return self.dict['seq_id'] * self.DIVISOR + self.dict['img_id']

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing on a missing ``.dict`` attribute.
        if not isinstance(other, SeqObj):
            return NotImplemented
        return (self.dict['seq_id'] == other.dict['seq_id']
                and self.dict['img_id'] == other.dict['img_id'])

    def __getitem__(self, attr):
        """Return ``self.dict[attr]``."""
        return self.dict[attr]
@DATASETS.register_module()
class KITTIDVPSDataset:
    """Video panoptic dataset read from ``<data_root>/video_sequence/<split>``.

    Files are expected to be named ``<seq>_<img>_<suffix>``; for every frame
    the image (``leftImg8bit``), depth, class map (``gtFine_class``) and
    instance map (``gtFine_instance``) paths are derived from that prefix.

    Args:
        pipeline (list[dict]): Pipeline config passed to ``Compose``.
        data_root (str): Dataset root; ``~`` is expanded.
        test_mode (bool): When True, depth/annotation files are not required
            to exist and ``__getitem__`` uses the test preparation path.
        split (str): Sub-directory of ``video_sequence`` to read.
        ref_seq_index (List[int], optional): Candidate frame offsets for the
            reference frame. One offset is drawn at random per frame when
            the dataset is built; ``None`` means no reference frame.
        is_instance_only (bool): Forwarded to the pipeline unchanged.
    """

    CLASSES = (
        'car',
        'bicycle',
        'motorcycle',
        'truck',
        'other-vehicle',
        'person',
        'bicyclist',
        'motorcyclist'
    )

    def __init__(self,
                 pipeline=None,
                 data_root=None,
                 test_mode=False,
                 split='train',
                 ref_seq_index: List[int] = None,
                 is_instance_only: bool = True,
                 ):
        assert data_root is not None
        data_root = os.path.expanduser(data_root)
        video_seq_dir = os.path.join(data_root, 'video_sequence', split)
        assert os.path.exists(video_seq_dir)
        assert 'leftImg8bit' not in video_seq_dir

        self.num_thing_classes = 8
        self.num_stuff_classes = 11
        self.thing_before_stuff = True

        # ref_seq_index is None means no ref img
        if ref_seq_index is None:
            ref_seq_index = []

        filenames = list(map(lambda x: str(x), os.listdir(video_seq_dir)))
        depth_names = sorted(list(filter(lambda x: 'depth' in x, filenames)))
        # No depth annotation: enumerate frames through the images instead.
        if not depth_names:
            depth_names = sorted(
                list(filter(lambda x: 'leftImg8bit' in x, filenames)))

        images = []
        for item in depth_names:
            seq_id, img_id, _ = item.split(sep="_", maxsplit=2)
            item_full = os.path.join(video_seq_dir, item)
            images.append(SeqObj({
                'seq_id': int(seq_id),
                'img_id': int(img_id),
                'img': os.path.join(
                    video_seq_dir,
                    "{}_{}_{}.png".format(seq_id, img_id, 'leftImg8bit')),
                'depth': item_full,
                'ann_class': os.path.join(
                    video_seq_dir,
                    "{}_{}_{}.png".format(seq_id, img_id, 'gtFine_class')),
                'ann_inst': os.path.join(
                    video_seq_dir,
                    "{}_{}_{}.png".format(seq_id, img_id, 'gtFine_instance')),
                # This should be modified carefully for each dataset.
                # Usually 255.
                'no_obj_class': 255
            }))
            assert os.path.exists(images[-1]['img'])
            if not test_mode:
                assert os.path.exists(images[-1]['depth'])
                assert os.path.exists(images[-1]['ann_class'])
                assert os.path.exists(images[-1]['ann_inst'])

        # Lookup table used to find the reference frame of each frame.
        reference_images = {hash(image): image for image in images}

        sequences = []
        for img_cur in images:
            is_seq = True
            seq_now = [img_cur.dict]
            if ref_seq_index:
                # A single offset is sampled here, once, at build time.
                for index in random.choices(ref_seq_index, k=1):
                    query_obj = SeqObj({
                        'seq_id': img_cur.dict['seq_id'],
                        'img_id': img_cur.dict['img_id'] + index
                    })
                    if hash(query_obj) in reference_images:
                        seq_now.append(
                            reference_images[hash(query_obj)].dict)
                    else:
                        # Frames without their reference are dropped.
                        is_seq = False
                        break
            if is_seq:
                sequences.append(seq_now)

        self.sequences = sequences
        self.ref_seq_index = ref_seq_index

        # mmdet
        self.pipeline = Compose(pipeline)
        self.test_mode = test_mode

        # misc
        self.flag = self._set_groups()
        self.is_instance_only = is_instance_only

        # For evaluation
        self.max_ins = 1000
        self.no_obj_id = 255

    def pre_pipelines(self, results):
        """Add the dataset-level keys the pipeline reads to every frame dict.

        Args:
            results (list[dict]): Frame dicts of one sequence; modified in
                place.
        """
        for _results in results:
            _results['img_info'] = []
            _results['thing_lower'] = \
                0 if self.thing_before_stuff else self.num_stuff_classes
            _results['thing_upper'] = self.num_thing_classes \
                if self.thing_before_stuff \
                else self.num_stuff_classes + self.num_thing_classes
            _results['is_instance_only'] = self.is_instance_only
            _results['ori_filename'] = os.path.basename(_results['img'])

    def prepare_train_img(self, idx):
        """Get training data and annotations after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training data and annotation after pipeline with new keys \
                introduced by pipeline.
        """
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        return self.pipeline(results)

    def prepare_test_img(self, idx):
        """Get test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            The pipeline output. The pipeline receives a single frame dict
            when no reference offsets are configured, otherwise the list of
            frame dicts.
        """
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        # During test time, single-image inference does not require a seq.
        if not self.ref_seq_index:
            results = results[0]
        return self.pipeline(results)

    def _rand_another(self, idx):
        """Get another random index from the same group as the given index."""
        pool = np.where(self.flag == self.flag[idx])[0]
        return np.random.choice(pool)

    # Copy and Modify from mmdet
    def __getitem__(self, idx):
        """Get training/test data after pipeline.

        In training mode a sample for which the pipeline returns ``None`` is
        replaced by another randomly chosen sample.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training/test data.
        """
        if self.test_mode:
            return self.prepare_test_img(idx)
        else:
            while True:
                cur_data = self.prepare_train_img(idx)
                if cur_data is None:
                    idx = self._rand_another(idx)
                    continue
                return cur_data

    def __len__(self):
        """Total number of samples of data."""
        return len(self.sequences)

    def _set_groups(self):
        """Put every sample into the same group (flag 0)."""
        return np.zeros((len(self)), dtype=np.int64)

    # The evaluate func
    def evaluate(
            self,
            results,
            **kwargs
    ):
        """Compute depth error and panoptic quality against the ground truth.

        Args:
            results (list[tuple]): One 5-tuple per sample:
                ``(bbox_results, mask_results, seg_results, depth,
                depth_final)`` where ``seg_results`` is
                ``(inst_map, seg_info)``. ``inst_map`` is modified in place.
            **kwargs: Ignored.

        Returns:
            dict: ``abs_rel``, ``abs_rel_final``, ``PQ``, ``Stuff PQ`` and
            ``Things PQ`` (the PQ values are percentages).
        """
        thing_lower = 0 if self.thing_before_stuff else self.num_stuff_classes
        thing_upper = self.num_thing_classes \
            if self.thing_before_stuff \
            else self.num_stuff_classes + self.num_thing_classes
        num_classes = self.num_thing_classes + self.num_stuff_classes

        pred_results_handled = []
        pred_depth = []
        pred_depth_final = []
        for item in results:
            bbox_results, mask_results, seg_results, depth, depth_final = item
            pred_depth.append(depth)
            pred_depth_final.append(depth_final)
            # in seg_info id starts from 1
            inst_map, seg_info = seg_results
            # Pixels not covered by any segment get the extra class id
            # ``num_classes``.
            cat_map = np.zeros_like(inst_map) + num_classes
            for instance in seg_info:
                cat_cur = instance['category_id']
                if instance['isthing']:
                    cat_cur += thing_lower
                else:
                    if self.thing_before_stuff:
                        cat_cur = cat_cur - 1 + thing_upper
                    else:
                        # stuff starts from 1 in the model
                        cat_cur -= 1
                assert cat_cur < num_classes
                cat_map[inst_map == instance['id']] = cat_cur
                if not instance['isthing']:
                    # Stuff has no instance id.
                    inst_map[inst_map == instance['id']] = 0
            pred_results_handled.append(
                cat_map.astype(np.int32) * 10000 + inst_map.astype(np.int32))

        gt_panseg = []
        gt_depth = []
        for item in self.sequences:
            # Only for single
            item = item[0]
            cat_id = mmcv.imread(
                item['ann_class'], flag='unchanged').astype(np.int32)
            inst_id = mmcv.imread(
                item['ann_inst'], flag='unchanged').astype(np.int32)
            ps_id = cat_id * 10000 + inst_id
            gt_panseg.append(ps_id)
            gt_depth_cur = mmcv.imread(
                item['depth'], flag='unchanged').astype(np.float32) / 256.
            gt_depth.append(gt_depth_cur)

        vpq_results = []
        for pred, gt in zip(pred_results_handled, gt_panseg):
            vpq_result = vpq_eval([pred, gt])
            vpq_results.append(vpq_result)

        iou_per_class = np.stack(
            [result[0] for result in vpq_results]).sum(axis=0)[:num_classes]
        tp_per_class = np.stack(
            [result[1] for result in vpq_results]).sum(axis=0)[:num_classes]
        fn_per_class = np.stack(
            [result[2] for result in vpq_results]).sum(axis=0)[:num_classes]
        fp_per_class = np.stack(
            [result[3] for result in vpq_results]).sum(axis=0)[:num_classes]

        abs_rels = []
        abs_rel_finals = []
        for pred, pred_final, gt in zip(pred_depth, pred_depth_final,
                                        gt_depth):
            # Only pixels with a positive ground-truth depth are scored.
            depth_mask = gt > 0.
            abs_rel_normal = np.mean(
                np.abs(pred[depth_mask] - gt[depth_mask]) / gt[depth_mask])
            abs_rel_final = np.mean(
                np.abs(pred_final[depth_mask] - gt[depth_mask])
                / gt[depth_mask])
            abs_rels.append(abs_rel_normal)
            abs_rel_finals.append(abs_rel_final)
        abs_rel = np.stack(abs_rels).mean(axis=0)
        abs_rel_final = np.stack(abs_rel_finals).mean(axis=0)

        # calculate the PQs; classes without any TP yield NaN here, which
        # is turned into 0 by ``np.nan_to_num`` below.
        epsilon = 0.
        sq = iou_per_class / (tp_per_class + epsilon)
        rq = tp_per_class / (
            tp_per_class + 0.5 * fn_per_class + 0.5 * fp_per_class + epsilon)
        print("tp per class")
        print(tp_per_class)
        print("fp per class")
        print(fp_per_class)
        print("fn per class")
        print(fn_per_class)
        pq = sq * rq
        print("PQ")
        print(pq[:thing_upper])
        print(pq[thing_upper:])
        print("SQ")
        print(sq)
        print("RQ")
        print(rq)

        # Thing classes occupy [thing_lower, thing_upper); every other class
        # index is stuff.
        things_pq = pq[thing_lower:thing_upper]
        stuff_pq = np.concatenate([pq[:thing_lower], pq[thing_upper:]])
        return {
            "abs_rel": abs_rel,
            "abs_rel_final": abs_rel_final,
            "PQ": np.nan_to_num(pq).mean() * 100,
            "Stuff PQ": np.nan_to_num(stuff_pq).mean() * 100,
            "Things PQ": np.nan_to_num(things_pq).mean() * 100,
        }
def vpq_eval(element):
    """Accumulate per-class panoptic statistics for one prediction/GT pair.

    Both maps encode a segment as ``category * 10000 + instance``. A
    prediction and a ground-truth segment of the same category match when
    their IoU exceeds 0.5; the part of a prediction lying on the void id
    (``255 * 10000``) is excluded from the union.

    Args:
        element (Sequence): ``(pred_ids, gt_ids)``, two integer numpy arrays
            of identical shape.

    Returns:
        tuple[np.ndarray]: ``(iou_per_class, tp_per_class, fn_per_class,
        fp_per_class)``, each a float64 array of length 20.
    """
    pred_ids, gt_ids = element
    max_ins = 10000
    ign_id = 255
    offset = 2 ** 30
    num_cat = 19 + 1

    iou_per_class = np.zeros(num_cat, dtype=np.float64)
    tp_per_class = np.zeros(num_cat, dtype=np.float64)
    fn_per_class = np.zeros(num_cat, dtype=np.float64)
    fp_per_class = np.zeros(num_cat, dtype=np.float64)

    def _ids_to_counts(id_array):
        ids, counts = np.unique(id_array, return_counts=True)
        # Keys are converted to Python ints: the ids are later multiplied
        # by ``offset`` and a fixed-width (e.g. int32) key would overflow.
        return dict(zip(ids.tolist(), counts))

    pred_areas = _ids_to_counts(pred_ids)
    gt_areas = _ids_to_counts(gt_ids)

    void_id = ign_id * max_ins
    ign_ids = {
        gt_id for gt_id in gt_areas
        if (gt_id // max_ins) == ign_id
    }

    # Every (gt, pred) pair is packed into one 64-bit id.
    int_ids = gt_ids.astype(np.int64) * offset + pred_ids.astype(np.int64)
    int_areas = _ids_to_counts(int_ids)

    def prediction_void_overlap(pred_id):
        void_int_id = void_id * offset + pred_id
        return int_areas.get(void_int_id, 0)

    def prediction_ignored_overlap(pred_id):
        total_ignored_overlap = 0
        for _ign_id in ign_ids:
            int_id = _ign_id * offset + pred_id
            total_ignored_overlap += int_areas.get(int_id, 0)
        return total_ignored_overlap

    gt_matched = set()
    pred_matched = set()

    for int_id, int_area in int_areas.items():
        gt_id = int(int_id // offset)
        gt_cat = int(gt_id // max_ins)
        pred_id = int(int_id % offset)
        pred_cat = int(pred_id // max_ins)
        if gt_cat != pred_cat:
            continue
        union = (
            gt_areas[gt_id] + pred_areas[pred_id] - int_area
            - prediction_void_overlap(pred_id)
        )
        iou = int_area / union
        if iou > 0.5:
            tp_per_class[gt_cat] += 1
            iou_per_class[gt_cat] += iou
            gt_matched.add(gt_id)
            pred_matched.add(pred_id)

    # Unmatched ground-truth segments are false negatives (void excluded).
    for gt_id in gt_areas:
        if gt_id in gt_matched:
            continue
        cat_id = gt_id // max_ins
        if cat_id == ign_id:
            continue
        fn_per_class[cat_id] += 1

    # Unmatched predictions are false positives unless they mostly lie on
    # ignored ground truth.
    for pred_id in pred_areas:
        if pred_id in pred_matched:
            continue
        if (prediction_ignored_overlap(pred_id) / pred_areas[pred_id]) > 0.5:
            continue
        cat = pred_id // max_ins
        fp_per_class[cat] += 1

    return iou_per_class, tp_per_class, fn_per_class, fp_per_class
if __name__ == '__main__':
    # Smoke test: build the validation split with a reference frame and
    # print every sample.
    # These modules are imported only for their side effects (presumably
    # registering the pipeline steps named below -- confirm).
    import dataset.dvps_pipelines.loading
    import dataset.dvps_pipelines.transforms
    import dataset.pipelines.formatting

    img_norm_cfg = dict(
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=False)

    collected_keys = [
        'img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg',
        'gt_depth', 'gt_instance_ids'
    ]
    demo_pipeline = [
        dict(type='LoadMultiImagesDirect'),
        dict(type='LoadMultiAnnotationsDirect', with_depth=True, divisor=0),
        dict(
            type='SeqResizeWithDepth',
            img_scale=(1024, 2048),
            ratio_range=[1.0, 2.0],
            keep_ratio=True),
        dict(type='SeqFlipWithDepth', flip_ratio=0.5),
        dict(
            type='SeqRandomCropWithDepth',
            crop_size=(1024, 2048),
            share_params=True),
        dict(type='SeqNormalizeWithDepth', **img_norm_cfg),
        dict(type='SeqPadWithDepth', size_divisor=32),
        dict(type='VideoCollect', keys=collected_keys),
        dict(type='ConcatVideoReferences'),
        dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
    ]

    data = KITTIDVPSDataset(
        pipeline=demo_pipeline,
        data_root=os.path.expanduser('~/datasets/kitti-dvps'),
        split='val',
        ref_seq_index=[-1, 1]
    )

    # Print arrays/tensors compactly (shape and dtype only for arrays).
    np.set_string_function(lambda x: '<{} ; {}>'.format(x.shape, x.dtype))
    torch.set_printoptions(profile='short')
    for item in data:
        print(item)
def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3):
    """Run inference over a data loader on a single GPU.

    Args:
        model (nn.Module): Wrapped model; called with ``return_loss=False``
            and ``rescale=True``. ``model.module.show_result`` is used for
            visualisation.
        data_loader: Loader whose ``dataset`` defines the number of samples.
        show (bool): Forwarded to ``show_result``.
        out_dir (str, optional): Directory visualisations are written to.
        show_score_thr (float): Score threshold forwarded to ``show_result``.

    Returns:
        list: Per-sample results. Tuple results are post-processed by their
        length: 2-tuples get their masks encoded, 5-tuples are kept as they
        are, any other tuple is treated as (bbox, mask, panoptic) with mask
        and panoptic parts encoded.
    """
    model.eval()
    results = []
    prog_bar = mmcv.ProgressBar(len(data_loader.dataset))

    for data in data_loader:
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
        batch_size = len(result)

        if show or out_dir:
            first_img = data['img'][0]
            if batch_size == 1 and isinstance(first_img, torch.Tensor):
                img_tensor = first_img
            else:
                img_tensor = first_img.data[0]
            img_metas = data['img_metas'][0].data[0]
            imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
            assert len(imgs) == len(img_metas)

            for idx, (img, img_meta) in enumerate(zip(imgs, img_metas)):
                # Remove the padding, then go back to the original size.
                h, w, _ = img_meta['img_shape']
                img_show = img[:h, :w, :]
                ori_h, ori_w = img_meta['ori_shape'][:-1]
                img_show = mmcv.imresize(img_show, (ori_w, ori_h))

                out_file = None
                if out_dir:
                    out_file = osp.join(out_dir, img_meta['ori_filename'])

                model.module.show_result(
                    img_show,
                    result[idx],
                    show=show,
                    out_file=out_file,
                    score_thr=show_score_thr)

        # encode mask results
        if isinstance(result[0], tuple):
            num_fields = len(result[0])
            if num_fields == 2:
                result = [(bbox_results, encode_mask_results(mask_results))
                          for bbox_results, mask_results in result]
            elif num_fields == 5:
                # Supporting depth here: nothing is encoded.
                result = [
                    (bbox_results, mask_results, seg_results, depth,
                     depth_final)
                    for bbox_results, mask_results, seg_results, depth,
                    depth_final in result
                ]
            else:
                result = [
                    (bbox_results, encode_mask_results(mask_results),
                     encode_panoptic(seg_results))
                    for bbox_results, mask_results, seg_results in result
                ]
        results.extend(result)

        for _ in range(batch_size):
            prog_bar.update()
    return results
def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
    """Test model with multiple gpus.

    Every rank runs inference on its share of the data; the partial results
    are then gathered either through GPU communication
    (``gpu_collect=True``) or through files written to ``tmpdir``.

    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        tmpdir (str): Path of directory to save the temporary results from
            different gpus under cpu mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect
            results.

    Returns:
        list: The prediction results.
    """
    model.eval()
    results = []
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    is_main = rank == 0
    if is_main:
        prog_bar = mmcv.ProgressBar(len(dataset))
    time.sleep(2)  # This line can prevent deadlock problem in some cases.

    for data in data_loader:
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)

        # encode mask results
        if isinstance(result[0], tuple):
            num_fields = len(result[0])
            if num_fields == 2:
                result = [(bbox_results, encode_mask_results(mask_results))
                          for bbox_results, mask_results in result]
            elif num_fields == 5:
                # Supporting depth here: nothing is encoded.
                result = [
                    (bbox_results, mask_results, seg_results, depth,
                     depth_final)
                    for bbox_results, mask_results, seg_results, depth,
                    depth_final in result
                ]
            else:
                result = [
                    (bbox_results, encode_mask_results(mask_results),
                     encode_panoptic(seg_results))
                    for bbox_results, mask_results, seg_results in result
                ]
        results.extend(result)

        if is_main:
            # Advance the bar as if all ranks progressed in lock-step.
            for _ in range(len(result) * world_size):
                prog_bar.update()

    # collect results from all ranks
    if gpu_collect:
        return collect_results_gpu(results, len(dataset))
    return collect_results_cpu(results, len(dataset), tmpdir)
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    """Build loaders, runner and hooks from ``cfg`` and start training.

    Args:
        model (nn.Module): Model to train; moved to GPU and wrapped here.
        dataset: A dataset or a list/tuple of datasets, one per workflow
            stage.
        cfg: Config object; may be modified (``samples_per_gpu``, ``runner``,
            ``data.val``, ``evaluation``).
        distributed (bool): Use ``MMDistributedDataParallel`` and the
            distributed hooks instead of ``MMDataParallel``.
        validate (bool): Register an evaluation hook on ``cfg.data.val``.
        timestamp (str, optional): Stored on the runner.
        meta (dict, optional): Forwarded to the runner.
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]

    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = []
    for ds in dataset:
        data_loaders.append(
            build_dataloader(
                ds,
                cfg.data.samples_per_gpu,
                cfg.data.workers_per_gpu,
                # cfg.gpus will be ignored if distributed
                len(cfg.gpu_ids),
                dist=distributed,
                seed=cfg.seed))

    # put model on gpus
    if distributed:
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if 'runner' not in cfg:
        # Legacy configs: derive the runner section from `total_epochs`.
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)
    elif 'total_epochs' in cfg:
        assert cfg.total_epochs == cfg.runner.max_epochs

    runner_args = dict(
        model=model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    runner = build_runner(cfg.runner, default_args=runner_args)

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed and isinstance(runner, EpochBasedRunner):
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        hook_cls = DistEvalHook if distributed else EvalHook
        runner.register_hook(hook_cls(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            # Work on a copy so that popping `priority` leaves cfg intact.
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            runner.register_hook(
                build_from_cfg(hook_cfg, HOOKS), priority=priority)

    # Resuming takes precedence over merely loading weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
def encode_panoptic(panoptic_results):
    """Serialise a panoptic id map as PNG bytes.

    Args:
        panoptic_results (tuple): ``(panoptic_img, segments_info)``; the id
            map is converted with ``panopticapi.utils.id2rgb`` before it is
            written as PNG.

    Returns:
        tuple: ``(png_bytes, segments_info)`` with ``segments_info`` passed
        through unchanged.
    """
    id_map, segments_info = panoptic_results
    rgb_image = Image.fromarray(id2rgb(id_map))
    with io.BytesIO() as buffer:
        rgb_image.save(buffer, format='PNG')
        png_bytes = buffer.getvalue()
    return png_bytes, segments_info
"pole", "isthing": 0, "color": [150, 5, 61]}, {"id": 13, "name": "floor", "isthing": 0, "color": [120, 120, 70]}, {"id": 14, "name": "ground", "isthing": 0, "color": [8, 255, 51]}, {"id": 15, "name": "grass", "isthing": 0, "color": [255, 6, 82]}, {"id": 16, "name": "sand", "isthing": 0, "color": [143, 255, 140]}, {"id": 17, "name": "athletic_field", "isthing": 0, "color": [204, 255, 4]}, {"id": 18, "name": "road", "isthing": 0, "color": [255, 51, 7]}, {"id": 19, "name": "path", "isthing": 0, "color": [204, 70, 3]}, {"id": 20, "name": "crosswalk", "isthing": 0, "color": [0, 102, 200]}, {"id": 21, "name": "building", "isthing": 0, "color": [61, 230, 250]}, {"id": 22, "name": "house", "isthing": 0, "color": [255, 6, 51]}, {"id": 23, "name": "bridge", "isthing": 0, "color": [11, 102, 255]}, {"id": 24, "name": "tower", "isthing": 0, "color": [255, 7, 71]}, {"id": 25, "name": "windmill", "isthing": 0, "color": [255, 9, 224]}, {"id": 26, "name": "well_or_well_lid", "isthing": 0, "color": [9, 7, 230]}, {"id": 27, "name": "other_construction", "isthing": 0, "color": [220, 220, 220]}, {"id": 28, "name": "sky", "isthing": 0, "color": [255, 9, 92]}, {"id": 29, "name": "mountain", "isthing": 0, "color": [112, 9, 255]}, {"id": 30, "name": "stone", "isthing": 0, "color": [8, 255, 214]}, {"id": 31, "name": "wood", "isthing": 0, "color": [7, 255, 224]}, {"id": 32, "name": "ice", "isthing": 0, "color": [255, 184, 6]}, {"id": 33, "name": "snowfield", "isthing": 0, "color": [10, 255, 71]}, {"id": 34, "name": "grandstand", "isthing": 0, "color": [255, 41, 10]}, {"id": 35, "name": "sea", "isthing": 0, "color": [7, 255, 255]}, {"id": 36, "name": "river", "isthing": 0, "color": [224, 255, 8]}, {"id": 37, "name": "lake", "isthing": 0, "color": [102, 8, 255]}, {"id": 38, "name": "waterfall", "isthing": 0, "color": [255, 61, 6]}, {"id": 39, "name": "water", "isthing": 0, "color": [255, 194, 7]}, {"id": 40, "name": "billboard_or_Bulletin_Board", "isthing": 0, "color": [255, 122, 8]}, {"id": 
41, "name": "sculpture", "isthing": 1, "color": [0, 255, 20]}, {"id": 42, "name": "pipeline", "isthing": 0, "color": [255, 8, 41]}, {"id": 43, "name": "flag", "isthing": 1, "color": [255, 5, 153]}, {"id": 44, "name": "parasol_or_umbrella", "isthing": 1, "color": [6, 51, 255]}, {"id": 45, "name": "cushion_or_carpet", "isthing": 0, "color": [235, 12, 255]}, {"id": 46, "name": "tent", "isthing": 1, "color": [160, 150, 20]}, {"id": 47, "name": "roadblock", "isthing": 1, "color": [0, 163, 255]}, {"id": 48, "name": "car", "isthing": 1, "color": [140, 140, 140]}, {"id": 49, "name": "bus", "isthing": 1, "color": [250, 10, 15]}, {"id": 50, "name": "truck", "isthing": 1, "color": [20, 255, 0]}, {"id": 51, "name": "bicycle", "isthing": 1, "color": [31, 255, 0]}, {"id": 52, "name": "motorcycle", "isthing": 1, "color": [255, 31, 0]}, {"id": 53, "name": "wheeled_machine", "isthing": 0, "color": [255, 224, 0]}, {"id": 54, "name": "ship_or_boat", "isthing": 1, "color": [153, 255, 0]}, {"id": 55, "name": "raft", "isthing": 1, "color": [0, 0, 255]}, {"id": 56, "name": "airplane", "isthing": 1, "color": [255, 71, 0]}, {"id": 57, "name": "tyre", "isthing": 0, "color": [0, 235, 255]}, {"id": 58, "name": "traffic_light", "isthing": 0, "color": [0, 173, 255]}, {"id": 59, "name": "lamp", "isthing": 0, "color": [31, 0, 255]}, {"id": 60, "name": "person", "isthing": 1, "color": [11, 200, 200]}, {"id": 61, "name": "cat", "isthing": 1, "color": [255, 82, 0]}, {"id": 62, "name": "dog", "isthing": 1, "color": [0, 255, 245]}, {"id": 63, "name": "horse", "isthing": 1, "color": [0, 61, 255]}, {"id": 64, "name": "cattle", "isthing": 1, "color": [0, 255, 112]}, {"id": 65, "name": "other_animal", "isthing": 1, "color": [0, 255, 133]}, {"id": 66, "name": "tree", "isthing": 0, "color": [255, 0, 0]}, {"id": 67, "name": "flower", "isthing": 0, "color": [255, 163, 0]}, {"id": 68, "name": "other_plant", "isthing": 0, "color": [255, 102, 0]}, {"id": 69, "name": "toy", "isthing": 0, "color": [194, 255, 0]}, 
{"id": 70, "name": "ball_net", "isthing": 0, "color": [0, 143, 255]}, {"id": 71, "name": "backboard", "isthing": 0, "color": [51, 255, 0]}, {"id": 72, "name": "skateboard", "isthing": 1, "color": [0, 82, 255]}, {"id": 73, "name": "bat", "isthing": 0, "color": [0, 255, 41]}, {"id": 74, "name": "ball", "isthing": 1, "color": [0, 255, 173]}, {"id": 75, "name": "cupboard_or_showcase_or_storage_rack", "isthing": 0, "color": [10, 0, 255]}, {"id": 76, "name": "box", "isthing": 1, "color": [173, 255, 0]}, {"id": 77, "name": "traveling_case_or_trolley_case", "isthing": 1, "color": [0, 255, 153]}, {"id": 78, "name": "basket", "isthing": 1, "color": [255, 92, 0]}, {"id": 79, "name": "bag_or_package", "isthing": 1, "color": [255, 0, 255]}, {"id": 80, "name": "trash_can", "isthing": 0, "color": [255, 0, 245]}, {"id": 81, "name": "cage", "isthing": 0, "color": [255, 0, 102]}, {"id": 82, "name": "plate", "isthing": 1, "color": [255, 173, 0]}, {"id": 83, "name": "tub_or_bowl_or_pot", "isthing": 1, "color": [255, 0, 20]}, {"id": 84, "name": "bottle_or_cup", "isthing": 1, "color": [255, 184, 184]}, {"id": 85, "name": "barrel", "isthing": 1, "color": [0, 31, 255]}, {"id": 86, "name": "fishbowl", "isthing": 1, "color": [0, 255, 61]}, {"id": 87, "name": "bed", "isthing": 1, "color": [0, 71, 255]}, {"id": 88, "name": "pillow", "isthing": 1, "color": [255, 0, 204]}, {"id": 89, "name": "table_or_desk", "isthing": 1, "color": [0, 255, 194]}, {"id": 90, "name": "chair_or_seat", "isthing": 1, "color": [0, 255, 82]}, {"id": 91, "name": "bench", "isthing": 1, "color": [0, 10, 255]}, {"id": 92, "name": "sofa", "isthing": 1, "color": [0, 112, 255]}, {"id": 93, "name": "shelf", "isthing": 0, "color": [51, 0, 255]}, {"id": 94, "name": "bathtub", "isthing": 0, "color": [0, 194, 255]}, {"id": 95, "name": "gun", "isthing": 1, "color": [0, 122, 255]}, {"id": 96, "name": "commode", "isthing": 1, "color": [0, 255, 163]}, {"id": 97, "name": "roaster", "isthing": 1, "color": [255, 153, 0]}, {"id": 98, 
# Thing categories, derived from CLASSES so that the two tables cannot
# drift apart. Every entry (and its colour list) is copied, so the result
# is independent of CLASSES, exactly like the former hand-written literal.
CLASSES_THING = [
    dict(category, color=list(category['color']))
    for category in CLASSES
    if category['isthing'] == 1
]
CLASSES_STUFF = [ {'id': 0, 'name': 'wall', 'isthing': 0, 'color': [120, 120, 120]}, {'id': 1, 'name': 'ceiling', 'isthing': 0, 'color': [180, 120, 120]}, {'id': 3, 'name': 'stair', 'isthing': 0, 'color': [80, 50, 50]}, {'id': 5, 'name': 'escalator', 'isthing': 0, 'color': [120, 120, 80]}, {'id': 6, 'name': 'Playground_slide', 'isthing': 0, 'color': [140, 140, 140]}, {'id': 7, 'name': 'handrail_or_fence', 'isthing': 0, 'color': [204, 5, 255]}, {'id': 9, 'name': 'rail', 'isthing': 0, 'color': [4, 250, 7]}, {'id': 11, 'name': 'pillar', 'isthing': 0, 'color': [235, 255, 7]}, {'id': 12, 'name': 'pole', 'isthing': 0, 'color': [150, 5, 61]}, {'id': 13, 'name': 'floor', 'isthing': 0, 'color': [120, 120, 70]}, {'id': 14, 'name': 'ground', 'isthing': 0, 'color': [8, 255, 51]}, {'id': 15, 'name': 'grass', 'isthing': 0, 'color': [255, 6, 82]}, {'id': 16, 'name': 'sand', 'isthing': 0, 'color': [143, 255, 140]}, {'id': 17, 'name': 'athletic_field', 'isthing': 0, 'color': [204, 255, 4]}, {'id': 18, 'name': 'road', 'isthing': 0, 'color': [255, 51, 7]}, {'id': 19, 'name': 'path', 'isthing': 0, 'color': [204, 70, 3]}, {'id': 20, 'name': 'crosswalk', 'isthing': 0, 'color': [0, 102, 200]}, {'id': 21, 'name': 'building', 'isthing': 0, 'color': [61, 230, 250]}, {'id': 22, 'name': 'house', 'isthing': 0, 'color': [255, 6, 51]}, {'id': 23, 'name': 'bridge', 'isthing': 0, 'color': [11, 102, 255]}, {'id': 24, 'name': 'tower', 'isthing': 0, 'color': [255, 7, 71]}, {'id': 25, 'name': 'windmill', 'isthing': 0, 'color': [255, 9, 224]}, {'id': 26, 'name': 'well_or_well_lid', 'isthing': 0, 'color': [9, 7, 230]}, {'id': 27, 'name': 'other_construction', 'isthing': 0, 'color': [220, 220, 220]}, {'id': 28, 'name': 'sky', 'isthing': 0, 'color': [255, 9, 92]}, {'id': 29, 'name': 'mountain', 'isthing': 0, 'color': [112, 9, 255]}, {'id': 30, 'name': 'stone', 'isthing': 0, 'color': [8, 255, 214]}, {'id': 31, 'name': 'wood', 'isthing': 0, 'color': [7, 255, 224]}, {'id': 32, 'name': 'ice', 'isthing': 0, 
'color': [255, 184, 6]}, {'id': 33, 'name': 'snowfield', 'isthing': 0, 'color': [10, 255, 71]}, {'id': 34, 'name': 'grandstand', 'isthing': 0, 'color': [255, 41, 10]}, {'id': 35, 'name': 'sea', 'isthing': 0, 'color': [7, 255, 255]}, {'id': 36, 'name': 'river', 'isthing': 0, 'color': [224, 255, 8]}, {'id': 37, 'name': 'lake', 'isthing': 0, 'color': [102, 8, 255]}, {'id': 38, 'name': 'waterfall', 'isthing': 0, 'color': [255, 61, 6]}, {'id': 39, 'name': 'water', 'isthing': 0, 'color': [255, 194, 7]}, {'id': 40, 'name': 'billboard_or_Bulletin_Board', 'isthing': 0, 'color': [255, 122, 8]}, {'id': 42, 'name': 'pipeline', 'isthing': 0, 'color': [255, 8, 41]}, {'id': 45, 'name': 'cushion_or_carpet', 'isthing': 0, 'color': [235, 12, 255]}, {'id': 53, 'name': 'wheeled_machine', 'isthing': 0, 'color': [255, 224, 0]}, {'id': 57, 'name': 'tyre', 'isthing': 0, 'color': [0, 235, 255]}, {'id': 58, 'name': 'traffic_light', 'isthing': 0, 'color': [0, 173, 255]}, {'id': 59, 'name': 'lamp', 'isthing': 0, 'color': [31, 0, 255]}, {'id': 66, 'name': 'tree', 'isthing': 0, 'color': [255, 0, 0]}, {'id': 67, 'name': 'flower', 'isthing': 0, 'color': [255, 163, 0]}, {'id': 68, 'name': 'other_plant', 'isthing': 0, 'color': [255, 102, 0]}, {'id': 69, 'name': 'toy', 'isthing': 0, 'color': [194, 255, 0]}, {'id': 70, 'name': 'ball_net', 'isthing': 0, 'color': [0, 143, 255]}, {'id': 71, 'name': 'backboard', 'isthing': 0, 'color': [51, 255, 0]}, {'id': 73, 'name': 'bat', 'isthing': 0, 'color': [0, 255, 41]}, {'id': 75, 'name': 'cupboard_or_showcase_or_storage_rack', 'isthing': 0, 'color': [10, 0, 255]}, {'id': 80, 'name': 'trash_can', 'isthing': 0, 'color': [255, 0, 245]}, {'id': 81, 'name': 'cage', 'isthing': 0, 'color': [255, 0, 102]}, {'id': 93, 'name': 'shelf', 'isthing': 0, 'color': [51, 0, 255]}, {'id': 94, 'name': 'bathtub', 'isthing': 0, 'color': [0, 194, 255]}, {'id': 98, 'name': 'other_machine', 'isthing': 0, 'color': [0, 255, 10]}, {'id': 103, 'name': 'curtain', 'isthing': 0, 'color': 
def vip2hb(pan_map):
    """Re-encode a VIPSeg panoptic id map.

    Every distinct id found in ``pan_map`` is rewritten as follows:

    * ``NO_OBJ`` and the value 200 become ``NO_OBJ_HB``;
    * ids above 128 are read as ``(category + 1) * DIVISOR_PAN + instance``
      for a thing category and become
      ``(thing_index + NUM_STUFF) * DIVISOR_NEW + instance``, where
      ``thing_index`` is the position of the category in ``CLASSES_THING``;
    * any other id is read as ``category + 1`` for a stuff category and
      becomes the position of that category in ``CLASSES_STUFF``.

    Args:
        pan_map: numpy array of panoptic ids.
            NOTE(review): presumably a signed integer array wide enough for
            ``class * DIVISOR_NEW + instance`` -- confirm against the loader.

    Returns:
        A new array shaped like ``pan_map`` holding the re-encoded ids.

    Raises:
        AssertionError: if ``THING_B_STUFF`` is set, or if any pixel is still
            marked ``-1`` after the conversion.
        KeyError: if an id does not correspond to a known category.
    """
    assert not THING_B_STUFF, "VIPSeg only supports stuff -> thing"

    # Annotation ids are the category ids shifted by one (0 means no object).
    thing_index = {cat['id'] + 1: pos for pos, cat in enumerate(CLASSES_THING)}
    stuff_index = {cat['id'] + 1: pos for pos, cat in enumerate(CLASSES_STUFF)}

    # -1 marks pixels that have not been re-encoded yet.
    pan_new = - np.ones_like(pan_map)
    for raw_id in np.unique(pan_map):
        region = pan_map == raw_id

        if raw_id == NO_OBJ or raw_id == 200:
            pan_new[region] = NO_OBJ_HB
            continue

        if raw_id > 128:
            # Thing segment: stuff classes come first, so shift by NUM_STUFF.
            shifted_cls = thing_index[raw_id // DIVISOR_PAN] + NUM_STUFF
            inst_id = raw_id % DIVISOR_PAN
            pan_new[region] = shifted_cls * DIVISOR_NEW + inst_id
            continue

        pan_new[region] = stuff_index[raw_id]

    assert -1. not in np.unique(pan_new)
    return pan_new
@DATASETS.register_module()
class VIPSegDVPSDataset:
    """Clip-level VIPSeg dataset.

    Each sample is a list of per-frame dicts: the key frame first, optionally
    followed by one reference frame of the same video. The list is handed to
    the configured pipeline.

    Args:
        pipeline: Pipeline config passed to ``Compose``.
        data_root: Directory containing ``images/``, ``panomasks/`` and the
            split file ``<split>.txt`` (one video folder name per line).
        test_mode: If True, ``__getitem__`` uses the test path.
        split: Name of the split file without the ``.txt`` suffix.
        ref_seq_index: Candidate frame offsets for the reference frame. One
            offset is drawn at random per key frame while the dataset is
            built; key frames whose drawn reference frame does not exist are
            dropped. ``None`` or an empty list disables reference frames.
        is_instance_only: Forwarded to the pipeline through
            ``results['is_instance_only']``.
    """

    # Placeholder class names. This must be a one-element tuple: the bare
    # parenthesised form ``('dummy')`` is just the string 'dummy'.
    CLASSES = ('dummy',)

    def __init__(self,
                 pipeline=None,
                 data_root=None,
                 test_mode=False,
                 split='train',
                 ref_seq_index: List[int] = None,
                 is_instance_only: bool = True,
                 ):
        logger = get_root_logger()
        assert data_root is not None
        data_root = os.path.expanduser(data_root)
        img_root = os.path.join(data_root, 'images')
        seg_root = os.path.join(data_root, 'panomasks')
        assert os.path.exists(img_root)
        assert os.path.exists(seg_root)

        # read split file
        split_file = os.path.join(data_root, split + '.txt')
        video_folders = mmcv.list_from_file(split_file, prefix=img_root + '/')
        ann_folders = mmcv.list_from_file(split_file, prefix=seg_root + '/')
        logger.info("VIPSegDVPSDataset : There are totally {} videos in {} split.".format(len(video_folders), split))

        # 58 things and 66 stuff, totally 124 classes
        self.num_thing_classes = 58
        self.num_stuff_classes = 66
        assert len(CLASSES_THING) == self.num_thing_classes
        assert len(CLASSES_STUFF) == self.num_stuff_classes
        assert len(CLASSES) == self.num_thing_classes + self.num_stuff_classes
        self.thing_before_stuff = False

        # ref_seq_index is None means no ref img
        if ref_seq_index is None:
            ref_seq_index = []

        # Both seq_id and img_id start from 0.
        images = []
        for seq_id, (vid_folder, ann_folder) in enumerate(zip(video_folders, ann_folders)):
            assert os.path.basename(vid_folder) == os.path.basename(ann_folder)
            imgs_cur = sorted(map(str, mmcv.scandir(vid_folder, recursive=False, suffix='.jpg')))
            pans_cur = sorted(map(str, mmcv.scandir(ann_folder, recursive=False, suffix='.png')))
            for img_id, (img_cur, pan_cur) in enumerate(zip(imgs_cur, pans_cur)):
                # Image and annotation must share the same file stem.
                assert img_cur.split('.')[0] == pan_cur.split('.')[0]
                images.append(SeqObj({
                    'seq_id': int(seq_id),
                    'img_id': int(img_id),
                    'img': os.path.join(vid_folder, img_cur),
                    'ann': os.path.join(ann_folder, pan_cur),
                    'no_obj_class': 255
                }))
                assert os.path.exists(images[-1]['img'])
                assert os.path.exists(images[-1]['ann'])

        # Warning from Haobo: the following codes are dangerous
        # because they rely on a consistent seed among different
        # processes. Please contact me before using it.
        reference_images = {hash(image): image for image in images}

        sequences = []
        for image in images:
            clip = [image.dict]
            complete = True
            if ref_seq_index:
                # Exactly one reference offset is drawn per key frame.
                for offset in random.choices(ref_seq_index, k=1):
                    ref_key = hash(SeqObj({
                        'seq_id': image.dict['seq_id'],
                        'img_id': image.dict['img_id'] + offset
                    }))
                    if ref_key not in reference_images:
                        complete = False
                        break
                    clip.append(reference_images[ref_key].dict)
            if complete:
                sequences.append(clip)

        self.sequences = sequences
        self.ref_seq_index = ref_seq_index

        logger.info("VIPSegDVPSDataset : There are totally {} clips in {} split for training.".format(
            len(self.sequences), split))

        # mmdet
        self.pipeline = Compose(pipeline)
        self.test_mode = test_mode

        # misc
        self.flag = self._set_groups()
        self.is_instance_only = is_instance_only

        # For evaluation
        self.max_ins = 1000
        self.no_obj_id = 255

    def pre_pipelines(self, results):
        """Add the per-frame metadata the pipeline reads, in place.

        Args:
            results (list[dict]): Frame dicts of one clip.
        """
        if self.thing_before_stuff:
            thing_lower = 0
            thing_upper = self.num_thing_classes
        else:
            thing_lower = self.num_stuff_classes
            thing_upper = self.num_stuff_classes + self.num_thing_classes
        for _results in results:
            _results['img_info'] = []
            _results['thing_lower'] = thing_lower
            _results['thing_upper'] = thing_upper
            _results['is_instance_only'] = self.is_instance_only
            _results['ori_filename'] = os.path.basename(_results['img'])
            _results['filename'] = _results['img']
            _results['pre_hook'] = vip2hb

    def prepare_train_img(self, idx):
        """Get training data and annotations after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            The output of the pipeline for a deep copy of clip ``idx``.
        """
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        return self.pipeline(results)

    def prepare_test_img(self, idx):
        """Get test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            The output of the pipeline. Without reference offsets the
            pipeline receives the single frame dict instead of a list.
        """
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        # During test time, single-image inference does not require a sequence
        if not self.ref_seq_index:
            results = results[0]
        return self.pipeline(results)

    def _rand_another(self, idx):
        """Get another random index from the same group as the given index."""
        pool = np.where(self.flag == self.flag[idx])[0]
        return np.random.choice(pool)

    # Copy and Modify from mmdet
    def __getitem__(self, idx):
        """Get training/test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            Pipeline output. In training mode, an index whose pipeline output
            is ``None`` is replaced by a random index of the same group until
            a sample is produced.
        """
        if self.test_mode:
            return self.prepare_test_img(idx)
        while True:
            cur_data = self.prepare_train_img(idx)
            if cur_data is not None:
                return cur_data
            idx = self._rand_another(idx)

    def __len__(self):
        """Total number of samples of data."""
        return len(self.sequences)

    def _set_groups(self):
        """Put every sample into group 0."""
        return np.zeros((len(self)), dtype=np.int64)

    # The evaluate func
    def evaluate(
            self,
            results,
            **kwargs
    ):
        """Evaluation is not implemented for this dataset."""
        raise NotImplementedError
ref_prefix='ref'), ] _auto_aug_polices = [ [ dict(type='ColorTransform', prob=0.5, level=3), dict(type='EqualizeTransform', prob=0.5), dict(type='BrightnessTransform', prob=0.5, level=3), dict(type='ContrastTransform', prob=0.5, level=3), ], [ dict(type='EqualizeTransform', prob=0), ] ] data = VIPSegDVPSDataset( pipeline=[ dict(type='LoadMultiImagesDirect'), dict(type='LoadMultiAnnotationsDirect', with_depth=False, vipseg=True), dict(type='SeqAutoAug', policies=_auto_aug_polices), dict(type='SeqResizeWithDepth', img_scale=(720, 100000), ratio_range=[1., 2.], keep_ratio=True), dict(type='SeqFlipWithDepth', flip_ratio=0.5), dict(type='SeqRandomCropWithDepth', crop_size=(736, 736), share_params=True), dict(type='SeqPadWithDepth', size_divisor=32), dict(type='SeqNormalize', **img_norm_cfg), dict( type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), dict(type='ConcatVideoReferences'), dict(type='SeqDefaultFormatBundle', ref_prefix='ref'), ], data_root="data/VIPSeg", test_mode=False, split='train', ref_seq_index=[-1, 1], is_instance_only=False, ) np.set_string_function(lambda x: '<{} ; {}>'.format(x.shape, x.dtype)) torch.set_printoptions(profile='short') for item in data: print(item) ================================================ FILE: knet/__init__.py ================================================ ================================================ FILE: knet/cross_entropy_loss.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from mmdet.models.builder import LOSSES from mmdet.models.losses.utils import weight_reduce_loss def cross_entropy(pred, label, weight=None, reduction='mean', avg_factor=None, class_weight=None, ignore_index=-100): """Calculate the CrossEntropy loss. Args: pred (torch.Tensor): The prediction with shape (N, C), C is the number of classes. label (torch.Tensor): The learning label of the prediction. 
def _expand_onehot_labels(labels, label_weights, label_channels):
    """Turn class indices into one-hot rows and broadcast the sample weights.

    Labels inside ``[0, label_channels)`` get a 1 in their own column. Any
    other label (negative, or ``>= label_channels``) yields an all-zero row.

    Args:
        labels (torch.Tensor): Class index per sample.
        label_weights (torch.Tensor | None): Weight per sample.
        label_channels (int): Number of columns of the one-hot matrix.

    Returns:
        tuple: ``(bin_labels, bin_label_weights)``. ``bin_labels`` has shape
        ``(labels.size(0), label_channels)`` and the dtype of ``labels``.
        ``bin_label_weights`` is ``None`` when ``label_weights`` is ``None``,
        otherwise the weights expanded to ``label_channels`` columns.
    """
    num_samples = labels.size(0)
    onehot = labels.new_zeros((num_samples, label_channels))

    in_range = (labels >= 0) & (labels < label_channels)
    rows = torch.nonzero(in_range, as_tuple=False).squeeze()
    if rows.numel() > 0:
        onehot[rows, labels[rows]] = 1

    if label_weights is None:
        return onehot, None

    expanded_weights = label_weights.view(-1, 1).expand(
        label_weights.size(0), label_channels)
    return onehot, expanded_weights
def mask_cross_entropy(pred,
                       target,
                       label,
                       reduction='mean',
                       avg_factor=None,
                       class_weight=None):
    """Binary cross entropy between each object's class-specific mask and its target.

    For sample ``i`` the mask predicted for class ``label[i]`` is picked out
    of ``pred`` and compared with ``target[i]``.

    Args:
        pred (torch.Tensor): Mask logits of shape (N, C, *), C being the
            number of classes.
        target (torch.Tensor): Mask targets, one per sample.
        label (torch.Tensor): Class index of each sample; selects which of
            the C masks is used.
        reduction (str, optional): Reserved; must be ``'mean'``.
        avg_factor (int, optional): Reserved; must be ``None``.
        class_weight (optional): Passed as ``weight`` to
            ``F.binary_cross_entropy_with_logits``.

    Returns:
        torch.Tensor: The mean loss as a tensor of shape (1,).

    Raises:
        AssertionError: If ``reduction`` is not ``'mean'`` or ``avg_factor``
            is not ``None``.

    Example:
        >>> pred = torch.randn(3, 11, 2, 2) * 1000
        >>> target = torch.rand(3, 2, 2)
        >>> label = torch.randint(0, 11, size=(3,))
        >>> loss = mask_cross_entropy(pred, target, label)
        >>> assert loss.shape == (1,)
    """
    # TODO: handle these two reserved arguments
    assert reduction == 'mean' and avg_factor is None

    roi_index = torch.arange(
        pred.size(0), dtype=torch.long, device=pred.device)
    # One mask per RoI: the one belonging to that RoI's class.
    selected = pred[roi_index, label].squeeze(1)
    mean_loss = F.binary_cross_entropy_with_logits(
        selected, target, weight=class_weight, reduction='mean')
    return mean_loss[None]
@weighted_loss
def dice_loss(input, target, eps=1e-3, numerator_eps=0):
    """Per-sample Dice loss.

    Both tensors are flattened to ``(num_samples, -1)``; the loss of a sample
    is ``1 - (2 * sum(x * y) + numerator_eps) /
    ((sum(x * x) + eps) + (sum(y * y) + eps))``.

    Args:
        input (torch.Tensor): Predictions; the first dimension indexes samples.
        target (torch.Tensor): Targets with the same number of samples;
            converted to float.
        eps (float): Added to each of the two squared sums.
        numerator_eps (float): Added to the numerator.

    Returns:
        torch.Tensor: One loss value per sample, before the weighting and
        reduction applied by the ``weighted_loss`` decorator.
    """
    flat_input = input.reshape(input.size()[0], -1)
    flat_target = target.reshape(target.size()[0], -1).float()

    overlap = torch.sum(flat_input * flat_target, 1)
    input_energy = torch.sum(flat_input * flat_input, 1) + eps
    target_energy = torch.sum(flat_target * flat_target, 1) + eps

    dice = (2 * overlap + numerator_eps) / (input_energy + target_energy)
    return 1 - dice
def __init__(self,
             num_proposals=100,
             in_channels=256,
             out_channels=256,
             num_heads=8,
             num_cls_fcs=1,
             num_seg_convs=1,
             num_loc_convs=1,
             att_dropout=False,
             localization_fpn=None,
             conv_kernel_size=1,
             norm_cfg=dict(type='GN', num_groups=32),
             semantic_fpn=True,
             train_cfg=None,
             num_classes=80,
             xavier_init_kernel=False,
             kernel_init_std=0.01,
             use_binary=False,
             proposal_feats_with_obj=False,
             loss_mask=None,
             loss_seg=None,
             loss_cls=None,
             loss_dice=None,
             loss_rank=None,
             feat_downsample_stride=1,
             feat_refine_stride=1,
             feat_refine=True,
             with_embed=False,
             feat_embed_only=False,
             conv_normal_init=False,
             mask_out_stride=4,
             hard_target=False,
             num_thing_classes=80,
             num_stuff_classes=53,
             mask_assign_stride=4,
             ignore_label=255,
             thing_label_in_seg=0,
             cat_stuff_mask=False,
             **kwargs):
    """Store the configuration, build sub-modules and create the layers.

    Every keyword argument is stored on the instance under its own name,
    except that ``localization_fpn`` is replaced by the neck built from it
    and each ``loss_*`` config is replaced by the loss built from it
    (``None`` stays ``None``). When ``train_cfg`` is given, an assigner is
    built from ``train_cfg.assigner`` and a ``MaskPseudoSampler`` is used,
    because ``self.sampling`` is fixed to ``False``. ``**kwargs`` is
    accepted and ignored. The layers themselves are created by
    ``_init_layers``.
    """
    super(ConvKernelHead, self).__init__()

    # Channel / proposal sizes.
    self.num_proposals = num_proposals
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.num_classes = num_classes
    self.num_thing_classes = num_thing_classes
    self.num_stuff_classes = num_stuff_classes
    self.num_heads = num_heads
    self.num_cls_fcs = num_cls_fcs
    self.num_loc_convs = num_loc_convs
    self.num_seg_convs = num_seg_convs
    self.conv_kernel_size = conv_kernel_size

    # Behaviour switches.
    self.train_cfg = train_cfg
    self.proposal_feats_with_obj = proposal_feats_with_obj
    self.sampling = False
    self.semantic_fpn = semantic_fpn
    self.norm_cfg = norm_cfg
    self.att_dropout = att_dropout
    self.hard_target = hard_target
    self.use_binary = use_binary
    self.with_embed = with_embed
    self.feat_embed_only = feat_embed_only
    self.cat_stuff_mask = cat_stuff_mask

    # Initialisation and stride settings.
    self.xavier_init_kernel = xavier_init_kernel
    self.kernel_init_std = kernel_init_std
    self.conv_normal_init = conv_normal_init
    self.feat_downsample_stride = feat_downsample_stride
    self.feat_refine_stride = feat_refine_stride
    self.feat_refine = feat_refine
    self.mask_out_stride = mask_out_stride
    self.mask_assign_stride = mask_assign_stride
    self.ignore_label = ignore_label
    self.thing_label_in_seg = thing_label_in_seg

    # Sub-modules: the neck first, then the losses in a fixed order.
    self.localization_fpn = build_neck(localization_fpn)
    loss_cfgs = (
        ('loss_mask', loss_mask),
        ('loss_dice', loss_dice),
        ('loss_seg', loss_seg),
        ('loss_cls', loss_cls),
        ('loss_rank', loss_rank),
    )
    for attr_name, loss_cfg in loss_cfgs:
        built = None if loss_cfg is None else build_loss(loss_cfg)
        setattr(self, attr_name, built)

    if self.train_cfg:
        self.assigner = build_assigner(self.train_cfg.assigner)
        # use PseudoSampler when sampling is False
        use_cfg_sampler = self.sampling and hasattr(self.train_cfg, 'sampler')
        if use_cfg_sampler:
            sampler_cfg = self.train_cfg.sampler
        else:
            sampler_cfg = dict(type='MaskPseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)

    self._init_layers()
def _decode_init_proposals(self, img, img_metas):
    """Produce the initial kernels, features and mask predictions.

    Args:
        img: Input passed to ``self.localization_fpn``.
        img_metas (list): One entry per image; only its length is used.

    Returns:
        tuple: ``(proposal_feats, x_feats, mask_preds, cls_scores,
        seg_preds)``.

        * ``proposal_feats``: the ``init_kernels`` weights repeated for every
          image, plus mask-pooled features when
          ``proposal_feats_with_obj`` is set.
        * ``x_feats``: localization features, summed with the semantic
          features when the semantic branch is enabled.
        * ``mask_preds``: output of ``init_kernels`` on the localization
          features.
        * ``cls_scores``: always ``None``.
        * ``seg_preds``: output of ``conv_seg``, or ``None`` without the
          semantic branch.

        With ``cat_stuff_mask`` in eval mode the stuff channels of
        ``seg_preds`` and the matching ``conv_seg`` kernels are appended to
        ``mask_preds`` and ``proposal_feats``.
    """
    batch = len(img_metas)
    fpn_out = self.localization_fpn(img)
    two_branches = isinstance(fpn_out, list)
    refine = self.feat_downsample_stride > 1 and self.feat_refine

    # thing branch
    loc_feats = fpn_out[0] if two_branches else fpn_out
    for layer in self.loc_convs:
        loc_feats = layer(loc_feats)
    if refine:
        loc_feats = self.ins_downsample(loc_feats)
    # init kernel prediction
    mask_preds = self.init_kernels(loc_feats)

    # stuff branch
    semantic_feats = None
    seg_preds = None
    if self.semantic_fpn:
        semantic_feats = fpn_out[1] if two_branches else fpn_out
        for layer in self.seg_convs:
            semantic_feats = layer(semantic_feats)
        if refine:
            semantic_feats = self.seg_downsample(semantic_feats)
        if semantic_feats is not None:
            seg_preds = self.conv_seg(semantic_feats)

    if semantic_feats is None:
        x_feats = loc_feats
    else:
        x_feats = semantic_feats + loc_feats

    # Every image starts from the same learned kernels.
    kernels = self.init_kernels.weight.clone()
    proposal_feats = kernels[None].expand(batch, *kernels.size())

    if self.proposal_feats_with_obj:
        soft_masks = mask_preds.sigmoid()
        foreground = (soft_masks > 0.5).float()
        pooling = foreground if self.use_binary else foreground * soft_masks
        obj_feats = torch.einsum('bnhw, bchw->bnc', pooling, x_feats)
        # important use
        proposal_feats = proposal_feats + obj_feats.view(
            batch, self.num_proposals, self.out_channels, 1, 1)

    cls_scores = None

    if self.cat_stuff_mask and not self.training:
        stuff_masks = seg_preds[:, self.num_thing_classes:]
        mask_preds = torch.cat([mask_preds, stuff_masks], dim=1)
        stuff_kernels = self.conv_seg.weight[self.num_thing_classes:].clone()
        stuff_kernels = stuff_kernels[None].expand(batch,
                                                   *stuff_kernels.size())
        # (b, N_{st}+N_{th}, c)
        proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

    return proposal_feats, x_feats, mask_preds, cls_scores, seg_preds
def loss(self,
         mask_pred,
         cls_scores,
         seg_preds,
         proposal_feats,
         labels,
         label_weights,
         mask_targets,
         mask_weights,
         seg_targets,
         reduction_override=None,
         **kwargs):
    """Compute the losses of the initial kernel (RPN-like) predictions.

    Args:
        mask_pred (Tensor): Mask logits; the first two dims are treated as
            (batch, proposals) and the last two as (H, W).
        cls_scores (Tensor | None): Optional classification logits whose
            first two dims must match ``mask_pred``.
        seg_preds (Tensor | None): Optional dense segmentation logits;
            dim 1 is the class channel.
        proposal_feats: Unused here; kept for interface compatibility.
        labels (Tensor): Flattened labels, one per prediction. Values in
            ``[0, num_classes)`` are positives.
        label_weights (Tensor): Weights forwarded to ``self.loss_cls``.
        mask_targets (Tensor): Flattened mask targets, one per prediction.
        mask_weights: Unused here; kept for interface compatibility.
        seg_targets (Tensor): Dense segmentation targets.
        reduction_override (str | None): Forwarded to ``self.loss_cls``.

    Returns:
        dict: Loss terms. The set of keys depends only on the configuration
        (``cls_scores``/``seg_preds``/``self.loss_rank``), never on whether
        the batch happens to contain positive samples.
    """
    losses = dict()
    bg_class_ind = self.num_classes
    # note in sparse rcnn num_gt == num_pos
    pos_inds = (labels >= 0) & (labels < bg_class_ind)
    num_preds = mask_pred.shape[0] * mask_pred.shape[1]

    if cls_scores is not None:
        num_pos = pos_inds.sum().float()
        # Clamp so that a batch without positives does not divide by zero
        # (matches the handling in KernelUpdateHead.loss).
        avg_factor = reduce_mean(num_pos).clamp_(min=1.0)
        assert mask_pred.shape[0] == cls_scores.shape[0]
        assert mask_pred.shape[1] == cls_scores.shape[1]
        flat_cls_scores = cls_scores.view(num_preds, -1)
        losses['loss_rpn_cls'] = self.loss_cls(
            flat_cls_scores,
            labels,
            label_weights,
            avg_factor=avg_factor,
            reduction_override=reduction_override)
        losses['rpn_pos_acc'] = accuracy(flat_cls_scores[pos_inds],
                                         labels[pos_inds])

    bool_pos_inds = pos_inds.type(torch.bool)
    # 0~self.num_classes-1 are FG, self.num_classes is BG
    H, W = mask_pred.shape[-2:]
    if pos_inds.any():
        pos_mask_pred = mask_pred.reshape(num_preds, H, W)[bool_pos_inds]
        pos_mask_targets = mask_targets[bool_pos_inds]
        losses['loss_rpn_mask'] = self.loss_mask(pos_mask_pred,
                                                 pos_mask_targets)
        losses['loss_rpn_dice'] = self.loss_dice(pos_mask_pred,
                                                 pos_mask_targets)

        if self.loss_rank is not None:
            batch_size = mask_pred.size(0)
            # Per-pixel target holding the index of the positive proposal
            # covering that pixel; uncovered pixels keep ignore_label.
            rank_target = mask_targets.new_full((batch_size, H, W),
                                                self.ignore_label,
                                                dtype=torch.long)
            rank_inds = pos_inds.view(batch_size,
                                      -1).nonzero(as_tuple=False)
            batch_mask_targets = mask_targets.view(batch_size, -1, H,
                                                   W).bool()
            for i in range(batch_size):
                curr_inds = (rank_inds[:, 0] == i)
                curr_rank = rank_inds[:, 1][curr_inds]
                for j in curr_rank:
                    rank_target[i][batch_mask_targets[i][j]] = j
            losses['loss_rpn_rank'] = self.loss_rank(
                mask_pred, rank_target, ignore_index=self.ignore_label)
    else:
        # Zero-valued placeholders keep the graph connected and the key
        # set identical to the positive branch.
        losses['loss_rpn_mask'] = mask_pred.sum() * 0
        losses['loss_rpn_dice'] = mask_pred.sum() * 0
        if self.loss_rank is not None:
            # Same key as the positive branch ('loss_rpn_rank'); a
            # different key here would make the logged loss set vary
            # between iterations / ranks.
            losses['loss_rpn_rank'] = mask_pred.sum() * 0

    if seg_preds is not None:
        cls_channel = seg_preds.shape[1]
        # (B, C, H, W) -> (B * H * W, C)
        flatten_seg = seg_preds.view(-1, cls_channel, H * W).permute(
            0, 2, 1).reshape(-1, cls_channel)
        flatten_seg_target = seg_targets.view(-1)
        if self.loss_seg.use_sigmoid:
            # focal loss: normalise by the number of labelled pixels
            num_dense_pos = (flatten_seg_target >= 0) & (
                flatten_seg_target < bg_class_ind)
            num_dense_pos = num_dense_pos.sum().float().clamp(min=1.0)
            losses['loss_rpn_seg'] = self.loss_seg(
                flatten_seg, flatten_seg_target, avg_factor=num_dense_pos)
        else:
            # ce loss: background / unlabelled pixels are ignored
            losses['loss_rpn_seg'] = self.loss_seg(
                flatten_seg,
                flatten_seg_target,
                ignore_index=self.num_classes)
    return losses
def get_targets(self,
                sampling_results,
                gt_mask,
                rpn_train_cfg,
                concat=True,
                gt_sem_seg=None,
                gt_sem_cls=None):
    """Build per-image training targets and optionally merge them.

    Args:
        sampling_results (list): One sampling result per image; each must
            expose ``pos_inds``, ``neg_inds``, ``pos_masks``,
            ``neg_masks``, ``pos_gt_masks`` and ``pos_gt_labels``.
        gt_mask: Unused here; kept for interface compatibility.
        rpn_train_cfg: Config forwarded to ``_get_target_single`` as
            ``cfg``.
        concat (bool): If True, concatenate the per-image labels, weights
            and mask targets along dim 0 and stack the segmentation
            targets.
        gt_sem_seg (list | None): Per-image semantic masks, if any.
        gt_sem_cls (list | None): Per-image semantic class ids, if any.

    Returns:
        tuple: ``(labels, label_weights, mask_targets, mask_weights,
        seg_targets)``; tensors when ``concat`` is True, otherwise
        per-image lists.
    """
    num_imgs = len(sampling_results)
    pos_inds_list = [res.pos_inds for res in sampling_results]
    neg_inds_list = [res.neg_inds for res in sampling_results]
    pos_mask_list = [res.pos_masks for res in sampling_results]
    neg_mask_list = [res.neg_masks for res in sampling_results]
    pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
    pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]

    # multi_apply maps over its list arguments in lock-step, so the
    # placeholders must have one entry per image. A fixed length of 2
    # silently truncated the targets for larger batches.
    if gt_sem_seg is None:
        gt_sem_seg = [None] * num_imgs
        gt_sem_cls = [None] * num_imgs
    elif gt_sem_cls is None:
        gt_sem_cls = [None] * num_imgs

    results = multi_apply(
        self._get_target_single,
        pos_inds_list,
        neg_inds_list,
        pos_mask_list,
        neg_mask_list,
        pos_gt_mask_list,
        pos_gt_labels_list,
        gt_sem_seg,
        gt_sem_cls,
        cfg=rpn_train_cfg)
    (labels, label_weights, mask_targets, mask_weights,
     seg_targets) = results

    if concat:
        labels = torch.cat(labels, 0)
        label_weights = torch.cat(label_weights, 0)
        mask_targets = torch.cat(mask_targets, 0)
        mask_weights = torch.cat(mask_weights, 0)
        # seg_targets are per-image (H, W) maps, hence stack, not cat
        seg_targets = torch.stack(seg_targets, 0)
    return labels, label_weights, mask_targets, mask_weights, seg_targets
def init_mask_head(self, mask_roi_extractor, mask_head):
    """Build one mask head per stage and store them in ``self.mask_head``.

    Args:
        mask_roi_extractor (dict): Unused here; kept for interface
            compatibility.
        mask_head (dict | list[dict]): A single head config, reused for
            every stage, or one config per stage.
    """
    self.mask_head = nn.ModuleList()

    # Broadcast a single config to every stage.
    if isinstance(mask_head, list):
        stage_cfgs = mask_head
    else:
        stage_cfgs = [mask_head] * self.num_stages
    assert len(stage_cfgs) == self.num_stages

    for stage_cfg in stage_cfgs:
        self.mask_head.append(build_head(stage_cfg))

    if not self.recursive:
        return
    # Recursive mode: every stage points at the first head.
    for stage_idx in range(self.num_stages):
        self.mask_head[stage_idx] = self.mask_head[0]
def simple_test(self,
                x,
                proposal_feats,
                mask_preds,
                cls_score,
                img_metas,
                imgs_whwh=None,
                rescale=False):
    """Run every refinement stage and decode per-image results.

    The stages are applied in order through ``self._mask_forward``; the
    outputs of the last stage are turned into scores (sigmoid, or softmax
    with the last channel dropped) and then decoded either by
    ``self.get_panoptic`` or by the last head's ``get_seg_masks``.

    Returns:
        list: One decoded result per entry of ``img_metas``.
    """
    num_imgs = len(img_metas)

    # Iteratively refine kernels and masks.
    object_feats = proposal_feats
    for stage_idx in range(self.num_stages):
        stage_out = self._mask_forward(stage_idx, x, object_feats,
                                       mask_preds, img_metas)
        object_feats = stage_out['object_feats']
        cls_score = stage_out['cls_score']
        mask_preds = stage_out['mask_preds']
        scaled_mask_preds = stage_out['scaled_mask_preds']

    final_head = self.mask_head[-1]
    num_classes = final_head.num_classes

    if final_head.loss_cls.use_sigmoid:
        cls_score = cls_score.sigmoid()
    else:
        # softmax classifier: the trailing channel is dropped
        cls_score = cls_score.softmax(-1)[..., :-1]

    if self.do_panoptic:
        return [
            self.get_panoptic(cls_score[img_id], scaled_mask_preds[img_id],
                              self.test_cfg, img_metas[img_id])
            for img_id in range(num_imgs)
        ]

    results = []
    for img_id in range(num_imgs):
        # top-k over the flattened (proposal, class) score grid
        flat_scores = cls_score[img_id].flatten(0, 1)
        scores_per_img, topk_indices = flat_scores.topk(
            self.test_cfg.max_per_img, sorted=True)
        mask_indices = topk_indices // num_classes
        labels_per_img = topk_indices % num_classes
        masks_per_img = scaled_mask_preds[img_id][mask_indices]
        results.append(
            final_head.get_seg_masks(masks_per_img, labels_per_img,
                                     scores_per_img, self.test_cfg,
                                     img_metas[img_id]))
    return results
def get_panoptic(self, cls_scores, mask_preds, test_cfg, img_meta):
    """Decode the predictions of one image into a panoptic result.

    The first ``self.num_proposals`` rows of ``cls_scores`` /
    ``mask_preds`` are handled as thing proposals, the remaining rows as
    stuff kernels.

    Returns:
        tuple: ``(bbox_result, segm_result, panoptic_result)``.
    """
    final_head = self.mask_head[-1]
    num_things = self.num_thing_classes
    num_props = self.num_proposals

    # ---- things: top-k over the (proposal, thing-class) score grid ----
    # NOTE(review): top-k reads ``self.test_cfg`` while the thresholds
    # below read the ``test_cfg`` argument; kept as in the original.
    thing_scores, topk_indices = cls_scores[:num_props, :num_things].flatten(
        0, 1).topk(
            self.test_cfg.max_per_img, sorted=True)
    mask_indices = topk_indices // num_things
    thing_labels = topk_indices % num_things
    thing_masks = final_head.rescale_masks(
        mask_preds[:num_props][mask_indices], img_meta)
    if not self.merge_joint:
        thing_masks = thing_masks > test_cfg.mask_thr
    bbox_result, segm_result = final_head.segm2result(
        thing_masks, thing_labels, thing_scores)

    # ---- stuff: the score of kernel i is its own class score (diag) ----
    stuff_scores, stuff_inds = torch.sort(
        cls_scores[num_props:, num_things:].diag(), descending=True)
    stuff_masks = final_head.rescale_masks(
        mask_preds[num_props:][stuff_inds], img_meta)

    if self.merge_joint:
        # joint merging works on soft masks and 0-based global labels
        stuff_labels = stuff_inds + num_things
        panoptic_result = self.merge_stuff_thing_stuff_joint(
            thing_masks, thing_labels, thing_scores, stuff_masks,
            stuff_labels, stuff_scores, test_cfg.merge_stuff_thing)
    else:
        stuff_masks = stuff_masks > test_cfg.mask_thr
        stuff_labels = stuff_inds + 1
        panoptic_result = self.merge_stuff_thing(
            thing_masks, thing_labels, thing_scores, stuff_masks,
            stuff_labels, stuff_scores, test_cfg.merge_stuff_thing)

    return bbox_result, segm_result, panoptic_result
def merge_stuff_thing(self,
                      thing_masks,
                      thing_labels,
                      thing_scores,
                      stuff_masks,
                      stuff_labels,
                      stuff_scores,
                      merge_cfg=None):
    """Paste thing and stuff masks onto a single panoptic canvas.

    Things are pasted first in descending score order, then stuff classes
    fill whatever is still empty. ``merge_cfg`` must provide
    ``instance_score_thr``, ``iou_thr`` and ``stuff_max_area``.

    Returns:
        tuple: ``(panoptic_seg, segments_info)``. ``panoptic_seg`` is an
        int32 numpy array in which 0 means unassigned, and
        ``segments_info`` is a list with one dict per pasted segment.
    """
    height, width = thing_masks.shape[-2:]
    panoptic_seg = thing_masks.new_zeros((height, width), dtype=torch.int32)
    canvas_device = panoptic_seg.device
    thing_masks = thing_masks.to(dtype=torch.bool, device=canvas_device)
    stuff_masks = stuff_masks.to(dtype=torch.bool, device=canvas_device)

    segments_info = []
    current_segment_id = 0

    # ---- things, highest score first ----
    for inst_id in torch.argsort(-thing_scores):
        score = thing_scores[inst_id].item()
        if score < merge_cfg.instance_score_thr:
            # scores are sorted, so nothing after this one can pass
            break

        inst_mask = thing_masks[inst_id]  # H,W
        full_area = inst_mask.sum().item()
        if full_area == 0:
            continue

        # Skip instances that are mostly covered by earlier ones.
        overlap_area = (inst_mask & (panoptic_seg > 0)).sum().item()
        if overlap_area / full_area > merge_cfg.iou_thr:
            continue

        if overlap_area > 0:
            # keep only the part that is still free
            inst_mask = inst_mask & (panoptic_seg == 0)
            if inst_mask.sum().item() == 0:
                continue

        current_segment_id += 1
        panoptic_seg[inst_mask] = current_segment_id
        segments_info.append({
            'id': current_segment_id,
            'isthing': True,
            'score': score,
            'category_id': thing_labels[inst_id].item(),
            'instance_id': inst_id.item(),
        })

    # ---- stuff, one segment per class, following the score order ----
    handled_labels = set()
    for label_tensor in stuff_labels[torch.argsort(-stuff_scores)]:
        semantic_label = label_tensor.item()
        if semantic_label in handled_labels:
            continue
        handled_labels.add(semantic_label)

        # union of all masks predicted for this class
        class_mask = stuff_masks[stuff_labels == semantic_label].sum(
            0).bool()
        free_mask = class_mask & (panoptic_seg == 0)
        free_area = free_mask.sum().item()
        if free_area < merge_cfg.stuff_max_area:
            continue

        current_segment_id += 1
        panoptic_seg[free_mask] = current_segment_id
        segments_info.append({
            'id': current_segment_id,
            'isthing': False,
            'category_id': semantic_label,
            'area': free_area,
        })

    return panoptic_seg.cpu().numpy(), segments_info
def __init__(self,
             num_classes=80,
             num_ffn_fcs=2,
             num_heads=8,
             num_cls_fcs=1,
             num_mask_fcs=3,
             feedforward_channels=2048,
             in_channels=256,
             out_channels=256,
             dropout=0.0,
             mask_thr=0.5,
             act_cfg=dict(type='ReLU', inplace=True),
             ffn_act_cfg=dict(type='ReLU', inplace=True),
             conv_kernel_size=3,
             feat_transform_cfg=None,
             hard_mask_thr=0.5,
             kernel_init=False,
             with_ffn=True,
             mask_out_stride=4,
             relative_coors=False,
             relative_coors_off=False,
             feat_gather_stride=1,
             mask_transform_stride=1,
             mask_upsample_stride=1,
             num_thing_classes=80,
             num_stuff_classes=53,
             mask_assign_stride=4,
             ignore_label=255,
             thing_label_in_seg=0,
             kernel_updator_cfg=dict(
                 type='DynamicConv',
                 in_channels=256,
                 feat_channels=64,
                 out_channels=256,
                 input_feat_shape=1,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN')),
             loss_rank=None,
             loss_mask=dict(
                 type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
             loss_dice=dict(type='DiceLoss', loss_weight=3.0),
             loss_cls=dict(
                 type='FocalLoss',
                 use_sigmoid=True,
                 gamma=2.0,
                 alpha=0.25,
                 loss_weight=2.0)):
    """Build the losses, the kernel-update/attention/FFN modules and the
    classification and mask branches of one kernel update stage.

    Only arguments whose handling is not a plain attribute store are
    listed here.

    Args:
        feat_transform_cfg (dict | None): If given, a ``ConvModule`` is
            built as ``self.feat_transform``. An optional ``kernel_size``
            entry (default 1) selects its kernel size and the remaining
            entries are forwarded as keyword arguments. The dict passed in
            is not modified.
        kernel_updator_cfg (dict): Config passed to
            ``build_transformer_layer`` for ``self.kernel_update_conv``.
        loss_rank (dict | None): Optional extra loss config; stored as
            ``None`` when not given.
        loss_mask, loss_dice, loss_cls (dict): Loss configs passed to
            ``build_loss``. ``loss_cls.use_sigmoid`` decides whether the
            classifier has ``num_classes`` or ``num_classes + 1`` outputs.
    """
    super(KernelUpdateHead, self).__init__()
    self.num_classes = num_classes
    self.loss_cls = build_loss(loss_cls)
    self.loss_mask = build_loss(loss_mask)
    self.loss_dice = build_loss(loss_dice)
    if loss_rank is not None:
        self.loss_rank = build_loss(loss_rank)
    else:
        self.loss_rank = loss_rank

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.mask_thr = mask_thr
    self.fp16_enabled = False
    self.dropout = dropout

    self.num_heads = num_heads
    self.hard_mask_thr = hard_mask_thr
    self.kernel_init = kernel_init
    self.with_ffn = with_ffn
    self.mask_out_stride = mask_out_stride
    self.relative_coors = relative_coors
    self.relative_coors_off = relative_coors_off
    self.conv_kernel_size = conv_kernel_size
    self.feat_gather_stride = feat_gather_stride
    self.mask_transform_stride = mask_transform_stride
    self.mask_upsample_stride = mask_upsample_stride

    self.num_thing_classes = num_thing_classes
    self.num_stuff_classes = num_stuff_classes
    self.mask_assign_stride = mask_assign_stride
    self.ignore_label = ignore_label
    self.thing_label_in_seg = thing_label_in_seg

    # Attention runs on flattened kernels of size C * K * K.
    self.attention = MultiheadAttention(in_channels * conv_kernel_size**2,
                                        num_heads, dropout)
    self.attention_norm = build_norm_layer(
        dict(type='LN'), in_channels * conv_kernel_size**2)[1]

    self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg)

    if feat_transform_cfg is not None:
        # Work on a shallow copy: popping from the caller's dict would
        # strip 'kernel_size' from a config object that may be shared by
        # several stages, so later heads would silently fall back to 1.
        feat_transform_cfg = dict(feat_transform_cfg)
        kernel_size = feat_transform_cfg.pop('kernel_size', 1)
        self.feat_transform = ConvModule(
            in_channels,
            in_channels,
            kernel_size,
            stride=feat_gather_stride,
            padding=int(feat_gather_stride // 2),
            **feat_transform_cfg)
    else:
        self.feat_transform = None

    if self.with_ffn:
        self.ffn = FFN(
            in_channels,
            feedforward_channels,
            num_ffn_fcs,
            act_cfg=ffn_act_cfg,
            dropout=dropout)
        self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]

    # Classification branch: (Linear -> LN -> act) * num_cls_fcs + fc_cls
    self.cls_fcs = nn.ModuleList()
    for _ in range(num_cls_fcs):
        self.cls_fcs.append(
            nn.Linear(in_channels, in_channels, bias=False))
        self.cls_fcs.append(
            build_norm_layer(dict(type='LN'), in_channels)[1])
        self.cls_fcs.append(build_activation_layer(act_cfg))

    if self.loss_cls.use_sigmoid:
        self.fc_cls = nn.Linear(in_channels, self.num_classes)
    else:
        # softmax classifier needs one extra output
        self.fc_cls = nn.Linear(in_channels, self.num_classes + 1)

    # Mask branch: (Linear -> LN -> act) * num_mask_fcs + fc_mask
    self.mask_fcs = nn.ModuleList()
    for _ in range(num_mask_fcs):
        self.mask_fcs.append(
            nn.Linear(in_channels, in_channels, bias=False))
        self.mask_fcs.append(
            build_norm_layer(dict(type='LN'), in_channels)[1])
        self.mask_fcs.append(build_activation_layer(act_cfg))

    self.fc_mask = nn.Linear(in_channels, out_channels)
@force_fp32(apply_to=('cls_score', 'mask_pred'))
def loss(self,
         object_feats,
         cls_score,
         mask_pred,
         labels,
         label_weights,
         mask_targets,
         mask_weights,
         imgs_whwh=None,
         reduction_override=None,
         **kwargs):
    """Compute classification, mask, dice and (optional) rank losses.

    Args:
        object_feats: Unused here; kept for interface compatibility.
        cls_score (Tensor | None): Classification logits; the first two
            dims are treated as (batch, proposals).
        mask_pred (Tensor | None): Mask logits; the first two dims are
            treated as (batch, proposals) and the last two as (H, W).
        labels (Tensor): Flattened labels, one per prediction. Values in
            ``[0, num_classes)`` are positives.
        label_weights (Tensor): Weights forwarded to ``self.loss_cls``.
        mask_targets (Tensor): Flattened mask targets, one per prediction.
        mask_weights: Unused here; kept for interface compatibility.
        imgs_whwh: Unused here; kept for interface compatibility.
        reduction_override (str | None): Forwarded to ``self.loss_cls``.

    Returns:
        dict: The computed loss terms; branches whose prediction is
        ``None`` are skipped.
    """
    losses = dict()
    bg_class_ind = self.num_classes
    # note in sparse rcnn num_gt == num_pos
    pos_inds = (labels >= 0) & (labels < bg_class_ind)
    num_pos = pos_inds.sum().float()
    # Always executed (before any early exit) so that every rank takes
    # part in the reduction.
    avg_factor = reduce_mean(num_pos).clamp_(min=1.0)

    # The shape checks and num_preds used to dereference both predictions
    # unconditionally, which made the None guards below unreachable.
    shape_ref = mask_pred if mask_pred is not None else cls_score
    if shape_ref is None:
        return losses
    num_preds = shape_ref.shape[0] * shape_ref.shape[1]
    if cls_score is not None and mask_pred is not None:
        assert mask_pred.shape[0] == cls_score.shape[0]
        assert mask_pred.shape[1] == cls_score.shape[1]

    if cls_score is not None:
        if cls_score.numel() > 0:
            flat_cls_score = cls_score.view(num_preds, -1)
            losses['loss_cls'] = self.loss_cls(
                flat_cls_score,
                labels,
                label_weights,
                avg_factor=avg_factor,
                reduction_override=reduction_override)
            losses['pos_acc'] = accuracy(flat_cls_score[pos_inds],
                                         labels[pos_inds])

    if mask_pred is not None:
        bool_pos_inds = pos_inds.type(torch.bool)
        # 0~self.num_classes-1 are FG, self.num_classes is BG
        H, W = mask_pred.shape[-2:]
        if pos_inds.any():
            pos_mask_pred = mask_pred.reshape(num_preds, H,
                                              W)[bool_pos_inds]
            pos_mask_targets = mask_targets[bool_pos_inds]
            losses['loss_mask'] = self.loss_mask(pos_mask_pred,
                                                 pos_mask_targets)
            losses['loss_dice'] = self.loss_dice(pos_mask_pred,
                                                 pos_mask_targets)

            if self.loss_rank is not None:
                batch_size = mask_pred.size(0)
                # Per-pixel target holding the index of the positive
                # proposal covering that pixel; uncovered pixels keep
                # ignore_label.
                rank_target = mask_targets.new_full((batch_size, H, W),
                                                    self.ignore_label,
                                                    dtype=torch.long)
                rank_inds = pos_inds.view(batch_size,
                                          -1).nonzero(as_tuple=False)
                batch_mask_targets = mask_targets.view(
                    batch_size, -1, H, W).bool()
                for i in range(batch_size):
                    curr_inds = (rank_inds[:, 0] == i)
                    curr_rank = rank_inds[:, 1][curr_inds]
                    for j in curr_rank:
                        rank_target[i][batch_mask_targets[i][j]] = j
                losses['loss_rank'] = self.loss_rank(
                    mask_pred, rank_target, ignore_index=self.ignore_label)
        else:
            # Zero-valued placeholders keep the graph connected and the
            # key set identical to the positive branch.
            losses['loss_mask'] = mask_pred.sum() * 0
            losses['loss_dice'] = mask_pred.sum() * 0
            if self.loss_rank is not None:
                losses['loss_rank'] = mask_pred.sum() * 0

    return losses
def get_targets(self,
                sampling_results,
                gt_mask,
                gt_labels,
                rcnn_train_cfg,
                concat=True,
                gt_sem_seg=None,
                gt_sem_cls=None):
    """Build per-image training targets and optionally concatenate them.

    Args:
        sampling_results (list): One sampling result per image; each must
            expose ``pos_inds``, ``neg_inds``, ``pos_masks``,
            ``neg_masks``, ``pos_gt_masks`` and ``pos_gt_labels``.
        gt_mask: Unused here; kept for interface compatibility.
        gt_labels: Unused here; kept for interface compatibility.
        rcnn_train_cfg: Config forwarded to ``_get_target_single`` as
            ``cfg``.
        concat (bool): If True, concatenate every per-image list along
            dim 0.
        gt_sem_seg (list | None): Per-image semantic masks, if any.
        gt_sem_cls (list | None): Per-image semantic class ids, if any.

    Returns:
        tuple: ``(labels, label_weights, mask_targets, mask_weights)``;
        tensors when ``concat`` is True, otherwise per-image lists.
    """
    num_imgs = len(sampling_results)
    pos_inds_list = [res.pos_inds for res in sampling_results]
    neg_inds_list = [res.neg_inds for res in sampling_results]
    pos_mask_list = [res.pos_masks for res in sampling_results]
    neg_mask_list = [res.neg_masks for res in sampling_results]
    pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
    pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]

    # multi_apply maps over its list arguments in lock-step, so the
    # placeholders must have one entry per image. A fixed length of 2
    # silently truncated the targets for larger batches.
    if gt_sem_seg is None:
        gt_sem_seg = [None] * num_imgs
        gt_sem_cls = [None] * num_imgs
    elif gt_sem_cls is None:
        gt_sem_cls = [None] * num_imgs

    labels, label_weights, mask_targets, mask_weights = multi_apply(
        self._get_target_single,
        pos_inds_list,
        neg_inds_list,
        pos_mask_list,
        neg_mask_list,
        pos_gt_mask_list,
        pos_gt_labels_list,
        gt_sem_seg,
        gt_sem_cls,
        cfg=rcnn_train_cfg)

    if concat:
        labels = torch.cat(labels, 0)
        label_weights = torch.cat(label_weights, 0)
        mask_targets = torch.cat(mask_targets, 0)
        mask_weights = torch.cat(mask_weights, 0)
    return labels, label_weights, mask_targets, mask_weights
@DETECTORS.register_module()
class KNet(TwoStageDetector):
    """K-Net detector built on top of mmdet's ``TwoStageDetector``.

    The ``rpn_head`` produces initial proposal features / mask predictions
    and the ``roi_head`` refines them. External proposals are not supported
    (``with_rpn`` is asserted in ``__init__``).

    Args:
        num_thing_classes (int): Stored; used as ``label_shift`` when
            converting semantic maps (non-KITTI branches).
        num_stuff_classes (int): Stored on the instance.
        mask_assign_stride (int): Ground-truth masks are resized to
            ``batch_input_shape // mask_assign_stride`` for assignment.
        ignore_label (int): Value written into padded regions of the
            semantic ground truth.
        thing_label_in_seg: Forwarded to ``sem2ins_masks``.
        cityscapes (bool): Use ``sem2ins_masks_cityscapes``.
        kitti_step (bool): Use ``sem2ins_masks_kitti_step``; also changes
            the result format of ``simple_test``.
    """

    def __init__(self,
                 *args,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cityscapes=False,
                 kitti_step=False,
                 **kwargs):
        super(KNet, self).__init__(*args, **kwargs)
        assert self.with_rpn, 'KNet does not support external proposals'
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label
        self.cityscapes = cityscapes  # whether to train Cityscapes panoptic segmentation
        self.kitti_step = kitti_step

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None,
                      gt_semantic_seg=None,
                      **kwargs):
        """Forward function of K-Net in the training stage.

        Ground-truth masks are padded to ``batch_input_shape`` and resized
        to the assignment resolution; the semantic map (if given) is split
        into per-class masks. Both are then handed to the RPN head and the
        ROI head, whose losses are merged.

        Note:
            ``gt_semantic_seg`` is modified in place: its padded area is
            overwritten with ``self.ignore_label``.

        Args:
            img (Tensor): Input images.
            img_metas (list[dict]): Per-image meta info. This method reads
                ``'batch_input_shape'`` (from the first entry) and
                ``'img_shape'``.
            gt_bboxes: Forwarded unchanged to the ROI head.
            gt_labels (list[Tensor]): Per-image labels; the device of
                ``gt_labels[0]`` is used for all mask tensors.
            gt_bboxes_ignore: Forwarded unchanged to the ROI head.
            gt_masks (list): Per-image mask containers providing
                ``to_tensor``, ``width`` and ``height``. Required.
            proposals: Must be None; external proposals are unsupported.
            gt_semantic_seg (Tensor, optional): Batched semantic map,
                indexed here as ``[i, :, h, w]``.

        Returns:
            dict[str, Tensor]: ROI-head losses updated with the RPN losses.
        """
        super(TwoStageDetector, self).forward_train(img, img_metas)
        assert proposals is None, 'KNet does not support' \
            ' external proposals'
        assert gt_masks is not None

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks_tensor = []
        gt_sem_seg = []
        gt_sem_cls = []
        # batch_input_shape should be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        assign_H = pad_H // self.mask_assign_stride
        assign_W = pad_W // self.mask_assign_stride

        for i, gt_mask in enumerate(gt_masks):
            mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device)
            if gt_mask.width != pad_W or gt_mask.height != pad_H:
                # F.pad order for the last two dims: (left, right, top, bottom)
                pad_wh = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height)
                mask_tensor = F.pad(mask_tensor, pad_wh, value=0)

            if gt_semantic_seg is not None:
                # gt_semantic_seg is padded by zero when forming a batch;
                # overwrite the padded rows/columns with the ignore label
                gt_semantic_seg[
                    i, :, img_metas[i]['img_shape'][0]:, :] = self.ignore_label
                gt_semantic_seg[
                    i, :, :, img_metas[i]['img_shape'][1]:] = self.ignore_label
                if self.cityscapes:
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes)
                elif self.kitti_step:
                    # NOTE(review): label_shift and thing ids are hard-coded
                    # here instead of using the instance attributes --
                    # presumably the KITTI-STEP class layout; confirm.
                    sem_labels, sem_seg = sem2ins_masks_kitti_step(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=2,
                        thing_label_in_seg=(11, 13))
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)
                if sem_seg.shape[0] == 0:
                    # NOTE(review): the placeholder takes its first dim from
                    # mask_tensor (instance masks), not from sem_seg --
                    # confirm this is intended.
                    gt_sem_seg.append(
                        mask_tensor.new_zeros(
                            (mask_tensor.size(0), assign_H, assign_W)))
                else:
                    gt_sem_seg.append(
                        F.interpolate(
                            sem_seg[None], (assign_H, assign_W),
                            mode='bilinear',
                            align_corners=False)[0])
                gt_sem_cls.append(sem_labels)

            else:
                # no semantic supervision: downstream heads receive None
                gt_sem_seg = None
                gt_sem_cls = None
            if mask_tensor.shape[0] == 0:
                # F.interpolate is skipped for images without instances
                gt_masks_tensor.append(
                    mask_tensor.new_zeros(
                        (mask_tensor.size(0), assign_H, assign_W)))
            else:
                gt_masks_tensor.append(
                    F.interpolate(
                        mask_tensor[None], (assign_H, assign_W),
                        mode='bilinear',
                        align_corners=False)[0])

        gt_masks = gt_masks_tensor
        x = self.extract_feat(img)
        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)
        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        losses = self.roi_head.forward_train(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None)

        losses.update(rpn_losses)
        return losses

    def simple_test(self, img, img_metas, rescale=False):
        """Test function without test time augmentation.

        Args:
            img (Tensor): Input images, passed to ``extract_feat``.
            img_metas (list[dict]): List of image information.
            rescale (bool): Forwarded to ``roi_head.simple_test``.
                Defaults to False.

        Returns:
            list: Whatever ``roi_head.simple_test`` returns. When
            ``self.kitti_step`` is set, the first entry is replaced by a
            tuple extended with two trailing ``None`` values.
        """
        x = self.extract_feat(img)
        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results
        segm_results = self.roi_head.simple_test(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            imgs_whwh=None,
            rescale=rescale)
        if self.kitti_step:
            # only index 0 is padded -- NOTE(review): assumes batch size 1
            # at test time; confirm against the evaluation code.
            res = segm_results[0]
            segm_results[0] = (*res, None, None)
        return segm_results

    def forward_dummy(self, img):
        """Used for computing network flops.

        See `mmdetection/tools/get_flops.py`
        """
        # backbone
        x = self.extract_feat(img)
        # rpn
        num_imgs = len(img)
        # dummy metas only carry img_shape, built as (3, H, W) from the input
        dummy_img_metas = [
            dict(img_shape=(3, *img.shape[-2:])) for _ in range(num_imgs)
        ]
        rpn_results = self.rpn_head.simple_test_rpn(x, dummy_img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results
        # roi_head
        roi_outs = self.roi_head.simple_test_mask_preds(x_feats,
                                                        proposal_feats,
                                                        mask_preds, cls_scores,
                                                        dummy_img_metas)
        return roi_outs
def dice_loss(self, input, target, eps=1e-3):
    """Return the pairwise negative Dice coefficient used as matching cost.

    This is an ordinary instance method (it is invoked as
    ``self.dice_loss(...)``), so the receiver is named ``self``.

    Args:
        input (Tensor): Predictions; everything after dim 0 is flattened.
            First dim is ``n``.
        target (Tensor): Targets; flattened the same way and cast to
            float. First dim is ``m``.
        eps (float): Added to both squared-sum terms of the denominator.

    Returns:
        Tensor: Shape ``(n, m)`` with entries
        ``-2 * <input_i, target_j> / (|input_i|^2 + |target_j|^2 + 2 * eps)``.
    """
    input = input.reshape(input.size()[0], -1)
    target = target.reshape(target.size()[0], -1).float()
    # einsum computes the (n, m) inner products directly instead of
    # materializing the broadcast product input[:, None] * target[None]
    a = torch.einsum('nh,mh->nm', input, target)
    b = torch.sum(input * input, 1) + eps
    c = torch.sum(target * target, 1) + eps
    d = (2 * a) / (b[:, None] + c[None, ...])
    # the constant 1 of "1 - dice" does not affect the matching, so it is
    # omitted
    return -d
Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1., pred_act=False, act_mode='sigmoid'): self.weight = weight self.pred_act = pred_act self.act_mode = act_mode def __call__(self, cls_pred, target): """ Args: cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). Returns: torch.Tensor: cls_cost value with weight """ if self.pred_act and self.act_mode == 'sigmoid': cls_pred = cls_pred.sigmoid().clamp(min=0.01, max=1.0) elif self.pred_act: cls_pred = cls_pred.softmax(dim=0) num_proposals = cls_pred.shape[0] num_gts, H, W = target.shape # flatten_cls_pred = cls_pred.view(num_proposals, -1) # eingum is ~10 times faster than matmul pos_cost = torch.einsum('nhw,mhw->nm', cls_pred, target) neg_cost = torch.einsum('nhw,mhw->nm', 1 - cls_pred, 1 - target) # flatten_target = target.view(num_gts, -1).t() # pos_cost = flatten_cls_pred.matmul(flatten_target) # neg_cost = (1 - flatten_cls_pred).matmul(1 - flatten_target) cls_cost = -(pos_cost + neg_cost) / (H * W) return cls_cost * self.weight @BBOX_ASSIGNERS.register_module() class MaskHungarianAssigner(BaseAssigner): """Computes one-to-one matching between predictions and ground truth. This class computes an assignment between the targets and the predictions based on the costs. The costs are weighted sum of three components: classfication cost, regression L1 cost and regression iou cost. The targets don't include the no_object, so generally there are more predictions than targets. After the one-to-one matching, the un-matched are treated as backgrounds. Thus each query prediction will be assigned with `0` or a positive integer indicating the ground truth index: - 0: negative sample, no assigned gt - positive integer: positive sample, index (1-based) of assigned gt Args: cls_weight (int | float, optional): The scale factor for classification cost. Default 1.0. 
def assign(self,
           bbox_pred,
           cls_pred,
           gt_bboxes,
           gt_labels,
           img_meta=None,
           gt_bboxes_ignore=None,
           eps=1e-7):
    """Compute a Hungarian matching between predictions and ground truth.

    The parameter names follow the ``BaseAssigner`` interface. In this
    assigner ``bbox_pred`` / ``gt_bboxes`` are handed to the mask, dice and
    boundary costs, so they presumably carry mask tensors rather than
    boxes (verify against the caller).

    Steps:

    1. initialise every prediction to -1
    2. sum the weighted costs
    3. run ``linear_sum_assignment`` on CPU (``self.topk`` rounds; after
       each round the matched rows are blocked with a large cost)
    4. set everything to 0 (background), then write ``gt index + 1`` and
       the gt label for every matched prediction

    Args:
        bbox_pred (Tensor): Predictions; first dim is the number of
            queries.
        cls_pred (Tensor | None): Classification predictions; the
            classification cost is skipped when None.
        gt_bboxes (Tensor): Ground truth; first dim is the number of gts.
        gt_labels (Tensor): Label of each ground truth.
        img_meta (dict, optional): Unused.
        gt_bboxes_ignore (optional): Must be None.
        eps (float): Unused; kept for interface compatibility.

    Returns:
        :obj:`AssignResult`: The assigned result.

    Raises:
        ImportError: If scipy is not installed.
    """
    assert gt_bboxes_ignore is None, \
        'Only case when gt_bboxes_ignore is None is supported.'
    num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)

    # 1. assign -1 by default
    assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
                                          -1,
                                          dtype=torch.long)
    assigned_labels = bbox_pred.new_full((num_bboxes, ),
                                         -1,
                                         dtype=torch.long)
    if num_gts == 0 or num_bboxes == 0:
        # No ground truth or predictions, return empty assignment
        if num_gts == 0:
            # No ground truth, assign all to background
            assigned_gt_inds[:] = 0
        return AssignResult(
            num_gts, assigned_gt_inds, None, labels=assigned_labels)

    # 2. compute the weighted costs; a zero weight disables a term
    if self.cls_cost.weight != 0 and cls_pred is not None:
        cls_cost = self.cls_cost(cls_pred, gt_labels)
    else:
        cls_cost = 0
    if self.mask_cost.weight != 0:
        reg_cost = self.mask_cost(bbox_pred, gt_bboxes)
    else:
        reg_cost = 0
    if self.dice_cost.weight != 0:
        dice_cost = self.dice_cost(bbox_pred, gt_bboxes)
    else:
        dice_cost = 0
    if self.boundary_cost is not None and self.boundary_cost.weight != 0:
        b_cost = self.boundary_cost(bbox_pred, gt_bboxes)
    else:
        b_cost = 0
    cost = cls_cost + reg_cost + dice_cost + b_cost

    # 3. do Hungarian matching on CPU using linear_sum_assignment
    cost = cost.detach().cpu()
    if linear_sum_assignment is None:
        raise ImportError('Please run "pip install scipy" '
                          'to install scipy first.')
    if self.topk == 1:
        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
    else:
        topk_matched_row_inds = []
        topk_matched_col_inds = []
        for _ in range(self.topk):
            matched_row_inds, matched_col_inds = linear_sum_assignment(
                cost)
            topk_matched_row_inds.append(matched_row_inds)
            topk_matched_col_inds.append(matched_col_inds)
            # block already matched predictions for the next round
            cost[matched_row_inds] = 1e10
        matched_row_inds = np.concatenate(topk_matched_row_inds)
        matched_col_inds = np.concatenate(topk_matched_col_inds)

    matched_row_inds = torch.from_numpy(matched_row_inds).to(
        bbox_pred.device)
    matched_col_inds = torch.from_numpy(matched_col_inds).to(
        bbox_pred.device)

    # 4. assign backgrounds and foregrounds
    # assign all indices to backgrounds first
    assigned_gt_inds[:] = 0
    # assign foregrounds based on matching results
    assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
    assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
    return AssignResult(
        num_gts, assigned_gt_inds, None, labels=assigned_labels)
Example: >>> # xdoctest: +IGNORE_WANT >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA >>> self = SamplingResult.random(rng=10) >>> print(f'self = {self}') self = """ def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result, gt_flags): self.pos_inds = pos_inds self.neg_inds = neg_inds self.pos_masks = masks[pos_inds] self.neg_masks = masks[neg_inds] self.pos_is_gt = gt_flags[pos_inds] self.num_gts = gt_masks.shape[0] self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 if gt_masks.numel() == 0: # hack for index error case assert self.pos_assigned_gt_inds.numel() == 0 self.pos_gt_masks = torch.empty_like(gt_masks) else: self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :] if assign_result.labels is not None: self.pos_gt_labels = assign_result.labels[pos_inds] else: self.pos_gt_labels = None if "pids" in assign_result._extra_properties.keys(): self.pos_gt_pids = assign_result._extra_properties['pids'][pos_inds] else: self.pos_gt_pids = None @property def masks(self): """torch.Tensor: concatenated positive and negative boxes""" return torch.cat([self.pos_masks, self.neg_masks]) def __nice__(self): data = self.info.copy() data['pos_masks'] = data.pop('pos_masks').shape data['neg_masks'] = data.pop('neg_masks').shape parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] body = ' ' + ',\n '.join(parts) return '{\n' + body + '\n}' @property def info(self): """Returns a dictionary of info about the object.""" return { 'pos_inds': self.pos_inds, 'neg_inds': self.neg_inds, 'pos_masks': self.pos_masks, 'neg_masks': self.neg_masks, 'pos_is_gt': self.pos_is_gt, 'num_gts': self.num_gts, 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, } class MaskSamplingResultWithScore(SamplingResult): """Bbox sampling result. 
Example: >>> # xdoctest: +IGNORE_WANT >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA >>> self = SamplingResult.random(rng=10) >>> print(f'self = {self}') self = """ def __init__(self, pos_inds, neg_inds, masks, scores, gt_masks, assign_result, gt_flags): self.pos_inds = pos_inds self.neg_inds = neg_inds self.pos_masks = masks[pos_inds] self.neg_masks = masks[neg_inds] self.pos_scores = scores[pos_inds] self.neg_scores = scores[neg_inds] self.pos_is_gt = gt_flags[pos_inds] self.num_gts = gt_masks.shape[0] self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 if gt_masks.numel() == 0: # hack for index error case assert self.pos_assigned_gt_inds.numel() == 0 self.pos_gt_masks = torch.empty_like(gt_masks) else: self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :] if assign_result.labels is not None: self.pos_gt_labels = assign_result.labels[pos_inds] else: self.pos_gt_labels = None if "pids" in assign_result._extra_properties.keys(): self.pos_gt_pids = assign_result._extra_properties['pids'][pos_inds] else: self.pos_gt_pids = None @property def masks(self): """torch.Tensor: concatenated positive and negative boxes""" return torch.cat([self.pos_masks, self.neg_masks]) def __nice__(self): data = self.info.copy() data['pos_masks'] = data.pop('pos_masks').shape data['neg_masks'] = data.pop('neg_masks').shape parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] body = ' ' + ',\n '.join(parts) return '{\n' + body + '\n}' @property def info(self): """Returns a dictionary of info about the object.""" return { 'pos_inds': self.pos_inds, 'neg_inds': self.neg_inds, 'pos_masks': self.pos_masks, 'neg_masks': self.neg_masks, 'pos_is_gt': self.pos_is_gt, 'num_gts': self.num_gts, 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, } @BBOX_SAMPLERS.register_module() class MaskPseudoSampler(BaseSampler): """A pseudo sampler that does not do sampling actually.""" def __init__(self, **kwargs): pass def _sample_pos(self, **kwargs): """Sample positive 
def sample(self, assign_result, masks, gt_masks, **kwargs):
    """Wrap an assignment into a sampling result without sub-sampling.

    Args:
        assign_result (:obj:`AssignResult`): Assignment whose ``gt_inds``
            is > 0 for positives and == 0 for negatives.
        masks (torch.Tensor): Predicted masks, one per sample.
        gt_masks (torch.Tensor): Ground-truth masks.

    Returns:
        :obj:`MaskSamplingResult`: All positives and all negatives.
    """

    def _unique_indices(selector):
        # sorted, de-duplicated 1-D indices of the True entries
        return torch.nonzero(selector, as_tuple=False).squeeze(-1).unique()

    assigned = assign_result.gt_inds
    positives = _unique_indices(assigned > 0)
    negatives = _unique_indices(assigned == 0)
    # no sample originates from ground truth, so every flag is zero
    flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8)
    return MaskSamplingResult(positives, negatives, masks, gt_masks,
                              assign_result, flags)
Args: assign_result (:obj:`AssignResult`): Assigned results masks (torch.Tensor): Bounding boxes gt_masks (torch.Tensor): Ground truth boxes Returns: :obj:`SamplingResult`: sampler results """ pos_inds = torch.nonzero( assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() neg_inds = torch.nonzero( assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8) sampling_result = MaskSamplingResultWithScore(pos_inds, neg_inds, masks, score, gt_masks, assign_result, gt_flags) return sampling_result ================================================ FILE: knet/det/msdeformattn_decoder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import (Conv2d, ConvModule, caffe2_xavier_init, normal_init, xavier_init) from mmdet.models.builder import NECKS from mmcv.cnn.bricks.transformer import (build_positional_encoding, build_transformer_layer_sequence) from mmcv.runner import BaseModule, ModuleList from mmdet.core.anchor import MlvlPointGenerator from mmdet.models.utils.transformer import MultiScaleDeformableAttention @NECKS.register_module() class MSDeformAttnPixelDecoder(BaseModule): """Pixel decoder with multi-scale deformable attention. Args: in_channels (list[int] | tuple[int]): Number of channels in the input feature maps. strides (list[int] | tuple[int]): Output strides of feature from backbone. feat_channels (int): Number of channels for feature. out_channels (int): Number of channels for output. num_outs (int): Number of output scales. norm_cfg (:obj:`mmcv.ConfigDict` | dict): Config for normalization. Defaults to dict(type='GN', num_groups=32). act_cfg (:obj:`mmcv.ConfigDict` | dict): Config for activation. Defaults to dict(type='ReLU'). encoder (:obj:`mmcv.ConfigDict` | dict): Config for transformer encoder. Defaults to `DetrTransformerEncoder`. 
positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for transformer encoder position encoding. Defaults to dict(type='SinePositionalEncoding', num_feats=128, normalize=True). init_cfg (:obj:`mmcv.ConfigDict` | dict): Initialization config dict. """ def __init__(self, in_channels=[256, 512, 1024, 2048], strides=[4, 8, 16, 32], feat_channels=256, out_channels=256, num_outs=3, return_one_list=True, norm_cfg=dict(type='GN', num_groups=32), act_cfg=dict(type='ReLU'), encoder=dict( type='DetrTransformerEncoder', num_layers=6, transformerlayers=dict( type='BaseTransformerLayer', attn_cfgs=dict( type='MultiScaleDeformableAttention', embed_dims=256, num_heads=8, num_levels=3, num_points=4, im2col_step=64, dropout=0.0, batch_first=False, norm_cfg=None, init_cfg=None), feedforward_channels=1024, ffn_dropout=0.0, operation_order=('self_attn', 'norm', 'ffn', 'norm')), init_cfg=None), positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), init_cfg=None): super().__init__(init_cfg=init_cfg) self.strides = strides self.num_input_levels = len(in_channels) self.return_one_list = return_one_list self.num_encoder_levels = \ encoder.transformerlayers.attn_cfgs.num_levels assert self.num_encoder_levels >= 1, \ 'num_levels in attn_cfgs must be at least one' input_conv_list = [] # from top to down (low to high resolution) for i in range(self.num_input_levels - 1, self.num_input_levels - self.num_encoder_levels - 1, -1): input_conv = ConvModule( in_channels[i], feat_channels, kernel_size=1, norm_cfg=norm_cfg, act_cfg=None, bias=True) input_conv_list.append(input_conv) self.input_convs = ModuleList(input_conv_list) self.encoder = build_transformer_layer_sequence(encoder) self.postional_encoding = build_positional_encoding( positional_encoding) # high resolution to low resolution self.level_encoding = nn.Embedding(self.num_encoder_levels, feat_channels) # fpn-like structure self.lateral_convs = ModuleList() self.output_convs = ModuleList() 
def forward(self, feats):
    """Run the deformable-attention encoder and the top-down FPN path.

    The last ``self.num_encoder_levels`` entries of ``feats`` are
    projected, flattened and jointly encoded; the remaining levels are
    fused top-down through ``lateral_convs`` / ``output_convs``.

    Args:
        feats (list[Tensor]): Feature maps of each level. Each has
            shape of (batch_size, c, h, w).

    Returns:
        tuple[Tensor]: A flat tuple. The first element is the mask
        feature (``self.mask_feature`` applied to the last produced
        map); it is followed by the first ``self.num_outs`` produced
        maps in reversed order, so the encoder output of the last input
        level comes last.
    """
    # generate padding mask for each level, for each image
    batch_size = feats[0].shape[0]
    encoder_input_list = []
    padding_mask_list = []
    level_positional_encoding_list = []
    spatial_shapes = []
    reference_points_list = []
    for i in range(self.num_encoder_levels):
        # walk the input levels from the last one backwards
        level_idx = self.num_input_levels - i - 1
        feat = feats[level_idx]
        feat_projected = self.input_convs[i](feat)
        h, w = feat.shape[-2:]

        # no padding: the mask is all False
        padding_mask_resized = feat.new_zeros(
            (batch_size, ) + feat.shape[-2:], dtype=torch.bool)
        # attribute name is misspelled ("postional") where it is created
        pos_embed = self.postional_encoding(padding_mask_resized)
        level_embed = self.level_encoding.weight[i]
        level_pos_embed = level_embed.view(1, -1, 1, 1) + pos_embed
        # (h_i * w_i, 2)
        reference_points = self.point_generator.single_level_grid_priors(
            feat.shape[-2:], level_idx, device=feat.device)
        # normalize by (w, h) * stride of this level
        factor = feat.new_tensor([[w, h]]) * self.strides[level_idx]
        reference_points = reference_points / factor

        # shape (batch_size, c, h_i, w_i) -> (h_i * w_i, batch_size, c)
        feat_projected = feat_projected.flatten(2).permute(2, 0, 1)
        level_pos_embed = level_pos_embed.flatten(2).permute(2, 0, 1)
        padding_mask_resized = padding_mask_resized.flatten(1)

        encoder_input_list.append(feat_projected)
        padding_mask_list.append(padding_mask_resized)
        level_positional_encoding_list.append(level_pos_embed)
        spatial_shapes.append(feat.shape[-2:])
        reference_points_list.append(reference_points)
    # shape (batch_size, total_num_query),
    # total_num_query = sum of h_i * w_i over the encoded levels
    padding_masks = torch.cat(padding_mask_list, dim=1)
    # shape (total_num_query, batch_size, c)
    encoder_inputs = torch.cat(encoder_input_list, dim=0)
    level_positional_encodings = torch.cat(
        level_positional_encoding_list, dim=0)
    device = encoder_inputs.device
    # shape (num_encoder_levels, 2), in the order the levels were encoded
    spatial_shapes = torch.as_tensor(
        spatial_shapes, dtype=torch.long, device=device)
    # start offsets (0, h_0*w_0, h_0*w_0+h_1*w_1, ...)
    level_start_index = torch.cat((spatial_shapes.new_zeros(
        (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
    reference_points = torch.cat(reference_points_list, dim=0)
    reference_points = reference_points[None, :, None].repeat(
        batch_size, 1, self.num_encoder_levels, 1)
    # all ones, consistent with the all-False padding masks above;
    # "valid_radios" is the keyword name passed to the encoder
    valid_radios = reference_points.new_ones(
        (batch_size, self.num_encoder_levels, 2))
    # shape (num_total_query, batch_size, c)
    memory = self.encoder(
        query=encoder_inputs,
        key=None,
        value=None,
        query_pos=level_positional_encodings,
        key_pos=None,
        attn_masks=None,
        key_padding_mask=None,
        query_key_padding_mask=padding_masks,
        spatial_shapes=spatial_shapes,
        reference_points=reference_points,
        level_start_index=level_start_index,
        valid_radios=valid_radios)
    # (num_total_query, batch_size, c) -> (batch_size, c, num_total_query)
    memory = memory.permute(1, 2, 0)

    # split the flattened queries back into one map per encoded level
    num_query_per_level = [e[0] * e[1] for e in spatial_shapes]
    outs = torch.split(memory, num_query_per_level, dim=-1)
    outs = [
        x.reshape(batch_size, -1, spatial_shapes[i][0],
                  spatial_shapes[i][1]) for i, x in enumerate(outs)
    ]

    # NOTE(review): lateral_convs / output_convs are indexed by the level
    # index i here -- confirm this matches the order in which __init__
    # appended them when more than one level bypasses the encoder.
    for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1,
                   -1):
        x = feats[i]
        cur_feat = self.lateral_convs[i](x)
        # upsample the most recent map to this level's size and add it
        y = cur_feat + F.interpolate(
            outs[-1],
            size=cur_feat.shape[-2:],
            mode='bilinear',
            align_corners=False)
        y = self.output_convs[i](y)
        outs.append(y)
    multi_scale_features = outs[:self.num_outs]

    mask_feature = self.mask_feature(outs[-1])
    # appended last, then reversed, so mask_feature ends up first
    multi_scale_features.append(mask_feature)
    multi_scale_features.reverse()
    return tuple(multi_scale_features)
@NECKS.register_module()
class SemanticFPNWrapper(nn.Module):
    """Semantic FPN neck (as used in Panoptic FPN) that fuses several levels.

    Every used input level gets its own stack of 3x3 ``ConvModule`` layers,
    optionally interleaved with 2x bilinear up-sampling (levels > 0) or built
    from stride-2 convs (level 0). The per-level outputs are summed or
    concatenated and optionally passed through 1x1 prediction convs.

    Args:
        in_channels (int): Channels of every input feature map.
        feat_channels (int): Channels produced by the per-level conv stacks.
        out_channels (int): Channels of the prediction / auxiliary convs.
        start_level (int): Index of the first input level that is used.
        end_level (int): Index of the last input level used (inclusive).
        cat_coors (bool): Concatenate two coordinate channels in [-1, 1] to
            the input of level ``cat_coors_level``. Defaults to False.
        positional_encoding (dict, optional): Config handed to
            ``build_positional_encoding``; the encoding is added to the input
            of level ``cat_coors_level``. Defaults to None.
        cat_coors_level (int): Level that receives the positional encoding
            and/or coordinate channels. Defaults to 3.
        fuse_by_cat (bool): Concatenate instead of sum the per-level
            features. Defaults to False.
        return_list (bool): Wrap the single output in a list. Defaults to
            False.
        upsample_times (int): Number of 2x up-sampling steps applied to the
            last level; lower levels get correspondingly fewer. Defaults to 3.
        with_pred (bool): Apply the 1x1 prediction conv. Defaults to True.
        num_aux_convs (int): Number of extra 1x1 heads applied to the fused
            feature. Defaults to 0.
        act_cfg (dict): Activation config of the per-level convs.
        out_act_cfg (dict): Activation config of the prediction convs.
        conv_cfg (dict, optional): Conv config for every ``ConvModule``.
        norm_cfg (dict, optional): Norm config for every ``ConvModule``.
    """

    def __init__(self,
                 in_channels,
                 feat_channels,
                 out_channels,
                 start_level,
                 end_level,
                 cat_coors=False,
                 positional_encoding=None,
                 cat_coors_level=3,
                 fuse_by_cat=False,
                 return_list=False,
                 upsample_times=3,
                 with_pred=True,
                 num_aux_convs=0,
                 act_cfg=dict(type='ReLU', inplace=True),
                 out_act_cfg=dict(type='ReLU'),
                 conv_cfg=None,
                 norm_cfg=None):
        super(SemanticFPNWrapper, self).__init__()

        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.start_level = start_level
        self.end_level = end_level
        assert start_level >= 0 and end_level >= start_level
        self.out_channels = out_channels
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.cat_coors = cat_coors
        self.cat_coors_level = cat_coors_level
        self.fuse_by_cat = fuse_by_cat
        self.return_list = return_list
        self.upsample_times = upsample_times
        self.with_pred = with_pred
        if positional_encoding is not None:
            self.positional_encoding = build_positional_encoding(
                positional_encoding)
        else:
            self.positional_encoding = None

        self.convs_all_levels = nn.ModuleList()
        for i in range(self.start_level, self.end_level + 1):
            convs_per_level = nn.Sequential()
            # Only the first conv of a level sees the raw input, which may
            # carry the two extra coordinate channels.
            if i == self.cat_coors_level and self.cat_coors:
                chn = self.in_channels + 2
            else:
                chn = self.in_channels

            if i == 0:
                if upsample_times == self.end_level - i:
                    convs_per_level.add_module('conv' + str(i),
                                               self._build_conv(chn))
                else:
                    # Level 0 is too fine: bring it down with stride-2 convs.
                    # A dedicated loop variable is used (the original reused
                    # ``i``) and every conv after the first consumes
                    # ``feat_channels`` - the output of its predecessor.
                    for k in range(self.end_level - upsample_times):
                        conv_in = chn if k == 0 else self.feat_channels
                        convs_per_level.add_module(
                            'conv' + str(k),
                            self._build_conv(conv_in, stride=2))
                self.convs_all_levels.append(convs_per_level)
                continue

            for j in range(i):
                conv_in = chn if j == 0 else self.feat_channels
                convs_per_level.add_module('conv' + str(j),
                                           self._build_conv(conv_in))
                if j < upsample_times - (self.end_level - i):
                    one_upsample = nn.Upsample(
                        scale_factor=2, mode='bilinear', align_corners=False)
                    convs_per_level.add_module('upsample' + str(j),
                                               one_upsample)

            self.convs_all_levels.append(convs_per_level)

        if fuse_by_cat:
            in_channels = self.feat_channels * len(self.convs_all_levels)
        else:
            in_channels = self.feat_channels

        if self.with_pred:
            self.conv_pred = ConvModule(
                in_channels,
                self.out_channels,
                1,
                padding=0,
                conv_cfg=self.conv_cfg,
                act_cfg=out_act_cfg,
                norm_cfg=self.norm_cfg)

        self.num_aux_convs = num_aux_convs
        self.aux_convs = nn.ModuleList()
        for _ in range(num_aux_convs):
            self.aux_convs.append(
                ConvModule(
                    in_channels,
                    self.out_channels,
                    1,
                    padding=0,
                    conv_cfg=self.conv_cfg,
                    act_cfg=out_act_cfg,
                    norm_cfg=self.norm_cfg))

    def _build_conv(self, in_chn, stride=1):
        """Return one 3x3 ``ConvModule`` mapping ``in_chn`` -> feat_channels."""
        return ConvModule(
            in_chn,
            self.feat_channels,
            3,
            stride=stride,
            padding=1,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg,
            inplace=False)

    def init_weights(self):
        """Initialize every ``nn.Conv2d`` in the module with N(0, 0.01)."""
        logger = get_root_logger()
        logger.info('Use normal intialization for semantic FPN')
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                normal_init(m, std=0.01)

    def generate_coord(self, input_feat):
        """Build an (x, y) coordinate map in [-1, 1] matching ``input_feat``.

        Returns:
            Tensor: Shape ``(batch, 2, H, W)`` where channel 0 holds x and
            channel 1 holds y.
        """
        x_range = torch.linspace(
            -1, 1, input_feat.shape[-1], device=input_feat.device)
        y_range = torch.linspace(
            -1, 1, input_feat.shape[-2], device=input_feat.device)
        y, x = torch.meshgrid(y_range, x_range)
        y = y.expand([input_feat.shape[0], 1, -1, -1])
        x = x.expand([input_feat.shape[0], 1, -1, -1])
        coord_feat = torch.cat([x, y], 1)
        return coord_feat

    def forward(self, inputs):
        """Fuse ``inputs[start_level:end_level + 1]`` into one feature map.

        Args:
            inputs (Sequence[Tensor]): Multi-level features, indexed by level.

        Returns:
            Tensor | list[Tensor]: The fused (and optionally predicted)
            feature; a list ``[out, aux_0, ...]`` when auxiliary convs exist,
            or ``[out]`` when ``return_list`` is set.
        """
        mlvl_feats = []
        for i in range(self.start_level, self.end_level + 1):
            input_p = inputs[i]
            if i == self.cat_coors_level:
                if self.positional_encoding is not None:
                    # An all-False mask means "nothing is padding".
                    ignore_mask = input_p.new_zeros(
                        (input_p.shape[0], input_p.shape[-2],
                         input_p.shape[-1]),
                        dtype=torch.bool)
                    positional_encoding = self.positional_encoding(ignore_mask)
                    input_p = input_p + positional_encoding
                if self.cat_coors:
                    coord_feat = self.generate_coord(input_p)
                    input_p = torch.cat([input_p, coord_feat], 1)

            # ``convs_all_levels`` is filled starting at ``start_level``, so
            # the stack of level ``i`` lives at position ``i - start_level``.
            level_convs = self.convs_all_levels[i - self.start_level]
            mlvl_feats.append(level_convs(input_p))

        if self.fuse_by_cat:
            feature_add_all_level = torch.cat(mlvl_feats, dim=1)
        else:
            feature_add_all_level = sum(mlvl_feats)

        if self.with_pred:
            out = self.conv_pred(feature_add_all_level)
        else:
            out = feature_add_all_level

        if self.num_aux_convs > 0:
            outs = [out]
            for conv in self.aux_convs:
                outs.append(conv(feature_add_all_level))
            return outs

        if self.return_list:
            return [out]
        else:
            return out
class AlignedModule(nn.Module):
    """Warp a coarse feature map onto the grid of a finer one via a learned flow.

    Both inputs are reduced with 1x1 convs, the coarse one is bilinearly
    resized to the fine resolution, and a conv predicts a 2-channel offset
    field that is used to resample the *original* coarse feature.

    Args:
        inplane (int): Channels of both input feature maps.
        outplane (int): Channels after the 1x1 reduction convs.
        kernel_size (int): Kernel size of the flow-predicting conv (odd).
            Defaults to 3.
    """

    def __init__(self, inplane, outplane, kernel_size=3):
        super(AlignedModule, self).__init__()
        self.down_h = nn.Conv2d(inplane, outplane, 1, bias=False)
        self.down_l = nn.Conv2d(inplane, outplane, 1, bias=False)
        # "Same" padding: the flow must keep the spatial size of its input for
        # every odd kernel size, not only for the default 3x3 kernel.
        self.flow_make = nn.Conv2d(
            outplane * 2,
            2,
            kernel_size=kernel_size,
            padding=kernel_size // 2,
            bias=False)

    def forward(self, x):
        """Align the high-level feature to the low-level feature.

        Args:
            x (Sequence[Tensor]): ``[low_feature, h_feature]``; the output
                takes its spatial size from ``low_feature``.

        Returns:
            Tensor: ``h_feature`` resampled to the size of ``low_feature``.
        """
        low_feature, h_feature = x
        h_feature_orign = h_feature
        h, w = low_feature.size()[2:]
        size = (h, w)
        low_feature = self.down_l(low_feature)
        h_feature = self.down_h(h_feature)
        h_feature = F.interpolate(
            h_feature, size=size, mode="bilinear", align_corners=True)
        flow = self.flow_make(torch.cat([h_feature, low_feature], 1))
        h_feature = self.flow_warp(h_feature_orign, flow, size=size)
        return h_feature

    def flow_warp(self, input, flow, size):
        """Resample ``input`` at an identity grid shifted by ``flow``.

        Args:
            input (Tensor): Feature map to sample from, ``(n, c, h, w)``.
            flow (Tensor): Offsets of shape ``(n, 2, out_h, out_w)`` with
                channel 0 = x and channel 1 = y, divided by the output
                width / height before being added to the grid.
            size (tuple[int, int]): ``(out_h, out_w)`` of the output.

        Returns:
            Tensor: Warped feature of shape ``(n, c, out_h, out_w)``.
        """
        out_h, out_w = size
        norm = torch.tensor([[[[out_w, out_h]]]]).type_as(input).to(
            input.device)
        # Identity sampling grid in [-1, 1], created directly on the target
        # device; it broadcasts over the batch so no per-sample copy is made.
        ys = torch.linspace(-1.0, 1.0, out_h, device=input.device)
        xs = torch.linspace(-1.0, 1.0, out_w, device=input.device)
        ys = ys.view(-1, 1).expand(out_h, out_w)
        xs = xs.view(1, -1).expand(out_h, out_w)
        grid = torch.stack((xs, ys), dim=2).type_as(input)
        grid = grid + flow.permute(0, 2, 3, 1) / norm

        output = F.grid_sample(input, grid, align_corners=True)
        return output
@BACKBONES.register_module()
class STDCNet1446(nn.Module):
    """STDC backbone returning features at strides 4, 8, 16 and 32.

    The full classification network is built first so that an ImageNet
    checkpoint can be loaded, then the classification head is dropped and
    only the five feature stages (``x2`` ... ``x32``) are kept.

    Args:
        base (int): Base channel width. Defaults to 64.
        layers (Sequence[int]): Number of bottleneck blocks in the stride-8,
            stride-16 and stride-32 stages. Defaults to [4, 5, 3].
        block_num (int): Number of convs inside each bottleneck.
        type (str): ``"cat"`` for ``CatBottleneck`` or ``"add"`` for
            ``AddBottleneck``.
        num_classes (int): Size of the (temporary) classification layer.
        dropout (float): Dropout of the (temporary) classification head.
        pretrain_model (str): Checkpoint path; a falsy value selects random
            initialization instead.
        use_conv_last (bool): Apply ``conv_last`` to the stride-32 feature.
        norm_layer (type): Norm layer class handed to the bottlenecks.

    Raises:
        ValueError: If ``type`` is neither ``"cat"`` nor ``"add"``.
    """

    def __init__(self,
                 base=64,
                 layers=[4, 5, 3],
                 block_num=4,
                 type="cat",
                 num_classes=1000,
                 dropout=0.20,
                 pretrain_model='./pretrained_models/STDCNet1446_76.47.tar',
                 use_conv_last=False,
                 norm_layer=nn.SyncBatchNorm,
                 ):
        super(STDCNet1446, self).__init__()
        if type == "cat":
            block = CatBottleneck
        elif type == "add":
            block = AddBottleneck
        else:
            # Previously this fell through and crashed later with an
            # unbound-local error on ``block``.
            raise ValueError(
                'type must be "cat" or "add", got {!r}'.format(type))
        self.use_conv_last = use_conv_last
        self.features = self._make_layers(base, layers, block_num, block,
                                          norm_layer)
        self.conv_last = ConvX(base * 16, max(1024, base * 16), 1, 1)
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(
            max(1024, base * 16), max(1024, base * 16), bias=False)
        self.bn = nn.BatchNorm1d(max(1024, base * 16))
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(max(1024, base * 16), num_classes, bias=False)

        # ``features`` = two stem convs followed by ``layers[k]`` blocks per
        # stage, so the stage boundaries follow directly from ``layers``
        # (2:6, 6:11, 11: for the default [4, 5, 3]).
        stem_end = 2
        stage8_end = stem_end + layers[0]
        stage16_end = stage8_end + layers[1]
        self.x2 = nn.Sequential(self.features[:1])
        self.x4 = nn.Sequential(self.features[1:stem_end])
        self.x8 = nn.Sequential(self.features[stem_end:stage8_end])
        self.x16 = nn.Sequential(self.features[stage8_end:stage16_end])
        self.x32 = nn.Sequential(self.features[stage16_end:])

        if pretrain_model:
            print('use pretrain model {}'.format(pretrain_model))
            self.init_weight(pretrain_model)
        else:
            self.init_params()

        # The classification head only exists so the checkpoint loads; drop
        # it afterwards. ``conv_last`` is kept when ``forward`` needs it.
        self.features = None
        if not self.use_conv_last:
            self.conv_last = None
        self.gap = None
        self.fc = None
        self.bn = None
        self.relu = None
        self.dropout = None
        self.linear = None

    def init_weight(self, pretrain_model):
        """Load ``pretrain_model`` on top of the current state dict.

        Keys present in the checkpoint overwrite the model's own entries;
        all other entries keep their freshly initialized values.
        """
        # NOTE(review): ``torch.load`` unpickles the file - only point this at
        # trusted checkpoints.
        state_dict = torch.load(
            pretrain_model, map_location='cpu')["state_dict"]
        self_state_dict = self.state_dict()
        self_state_dict.update(state_dict)
        self.load_state_dict(self_state_dict)

    def init_params(self):
        """Randomly initialize conv, batch-norm and linear layers."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def _make_layers(self, base, layers, block_num, block, norm_layer):
        """Build the stem convs followed by all bottleneck stages."""
        features = [ConvX(3, base // 2, 3, 2), ConvX(base // 2, base, 3, 2)]

        for i, layer in enumerate(layers):
            stage_out = base * 2 ** (i + 2)
            for j in range(layer):
                if j > 0:
                    # Remaining blocks of a stage keep width and resolution.
                    stage_in, stride = stage_out, 1
                elif i == 0:
                    # The very first block consumes the stem output.
                    stage_in, stride = base, 2
                else:
                    stage_in, stride = base * 2 ** (i + 1), 2
                features.append(
                    block(stage_in, stage_out, block_num, stride,
                          norm_layer=norm_layer))

        return nn.Sequential(*features)

    def forward(self, x):
        """Return the stride-4, -8, -16 and -32 feature maps of ``x``."""
        feat2 = self.x2(x)
        feat4 = self.x4(feat2)
        feat8 = self.x8(feat4)
        feat16 = self.x16(feat8)
        feat32 = self.x32(feat16)
        if self.use_conv_last:
            feat32 = self.conv_last(feat32)

        return feat4, feat8, feat16, feat32
class AddBottleneck(nn.Module):
    """STDC bottleneck whose concatenated branch outputs are added to a skip.

    ``block_num`` ``ConvX`` layers are chained; the output of every layer is
    kept, concatenated along channels and added to the (optionally
    down-sampled) input.

    Args:
        in_planes (int): Input channels.
        out_planes (int): Output channels (sum of all branch widths).
        block_num (int): Number of chained convs, must be > 1.
        stride (int): 1 or 2; with 2 the block halves the resolution through
            a depth-wise stride-2 conv and a strided skip path.
        norm_layer (type): Norm layer class used in the down-sampling paths.
    """

    def __init__(self, in_planes, out_planes, block_num=3, stride=1,
                 norm_layer=nn.BatchNorm2d):
        super(AddBottleneck, self).__init__()
        # The message must be a string: the original passed ``print(...)``,
        # which printed unconditionally-on-failure and attached ``None``.
        assert block_num > 1, "block number should be larger than 1."
        self.conv_list = nn.ModuleList()
        self.stride = stride
        if stride == 2:
            self.avd_layer = nn.Sequential(
                nn.Conv2d(out_planes // 2, out_planes // 2, kernel_size=3,
                          stride=2, padding=1, groups=out_planes // 2,
                          bias=False),
                norm_layer(out_planes // 2),
            )
            self.skip = nn.Sequential(
                nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=2,
                          padding=1, groups=in_planes, bias=False),
                norm_layer(in_planes),
                nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False),
                norm_layer(out_planes),
            )
            # Down-sampling is done by ``avd_layer``; the convs stay stride 1.
            stride = 1

        for idx in range(block_num):
            if idx == 0:
                self.conv_list.append(
                    ConvX(in_planes, out_planes // 2, kernel=1))
            elif idx == 1 and block_num == 2:
                self.conv_list.append(
                    ConvX(out_planes // 2, out_planes // 2, stride=stride))
            elif idx == 1 and block_num > 2:
                self.conv_list.append(
                    ConvX(out_planes // 2, out_planes // 4, stride=stride))
            elif idx < block_num - 1:
                # Each further conv halves the width of its predecessor ...
                self.conv_list.append(
                    ConvX(out_planes // 2 ** idx, out_planes // 2 ** (idx + 1)))
            else:
                # ... except the last one, which keeps it so that the branch
                # widths sum up to ``out_planes``.
                self.conv_list.append(
                    ConvX(out_planes // 2 ** idx, out_planes // 2 ** idx))

    def forward(self, x):
        """Return ``cat(branch outputs) + skip(x)``."""
        out_list = []
        out = x

        for idx, conv in enumerate(self.conv_list):
            if idx == 0 and self.stride == 2:
                out = self.avd_layer(conv(out))
            else:
                out = conv(out)
            out_list.append(out)

        if self.stride == 2:
            x = self.skip(x)

        return torch.cat(out_list, dim=1) + x
def _as_label_tuple(labels):
    """Normalize a single label or a collection of labels to a tuple."""
    if isinstance(labels, torch.Tensor):
        return tuple(labels.reshape(-1))
    if isinstance(labels, (list, tuple, set, frozenset, range)):
        return tuple(labels)
    return (labels, )


def _sem2ins(gt_sem_seg, ignore_label, thing_labels, shift, compact=False):
    """Split a semantic map into one binary mask per remaining (stuff) class.

    Args:
        gt_sem_seg (Tensor): Semantic label map; masks are concatenated along
            dim 0, so a leading axis of size 1 is expected.
        ignore_label (int): Label that is dropped.
        thing_labels (tuple): Labels that are dropped because they are
            handled as things.
        shift (int): Constant added to every kept label.
        compact (bool): If True, each kept label is first reduced by the
            number of thing labels smaller than it, closing the gaps the
            removed thing labels leave behind.

    Returns:
        tuple[Tensor, Tensor]: ``(labels, masks)`` as ``long`` / ``float``.
    """
    kept_labels = []
    kept_masks = []
    for cls_id in torch.unique(gt_sem_seg):
        if cls_id == ignore_label or any(cls_id == t for t in thing_labels):
            continue
        if compact:
            offset = -sum(int(cls_id > t) for t in thing_labels)
            kept_labels.append(cls_id + offset)
        else:
            kept_labels.append(cls_id)
        kept_masks.append(gt_sem_seg == cls_id)

    if kept_labels:
        ins_labels = torch.stack(kept_labels) + shift
        ins_masks = torch.cat(kept_masks)
    else:
        ins_labels = gt_sem_seg.new_zeros(size=[0])
        ins_masks = gt_sem_seg.new_zeros(
            size=[0, gt_sem_seg.shape[-2], gt_sem_seg.shape[-1]])
    return ins_labels.long(), ins_masks.float()


def sem2ins_masks(gt_sem_seg,
                  ignore_label=255,
                  label_shift=80,
                  thing_label_in_seg=0):
    """Turn the stuff classes of a semantic map into labels and masks.

    ``thing_label_in_seg`` (0 by default) is the single "special thing
    class" of the semantic map. Because it occupies index 0, stuff labels are
    shifted by ``label_shift - 1``: with the defaults, stuff class 1 becomes
    label 80, directly after the thing labels 0-79.

    Returns:
        tuple[Tensor, Tensor]: ``(ins_labels, ins_masks)``.
    """
    return _sem2ins(gt_sem_seg, ignore_label,
                    _as_label_tuple(thing_label_in_seg), label_shift - 1)


def sem2ins_masks_cityscapes(gt_sem_seg,
                             ignore_label=255,
                             label_shift=8,
                             thing_label_in_seg=list(range(11, 19))):
    """Shift the cityscapes semantic labels to instance labels and masks.

    Labels listed in ``thing_label_in_seg`` (11-18 by default) and
    ``ignore_label`` are dropped; every remaining stuff label is shifted by
    ``label_shift`` so that it follows the thing labels.

    Returns:
        tuple[Tensor, Tensor]: ``(ins_labels, ins_masks)``.
    """
    return _sem2ins(gt_sem_seg, ignore_label,
                    _as_label_tuple(thing_label_in_seg), label_shift)


def sem2ins_masks_kitti_step(gt_sem_seg,
                             ignore_label=255,
                             label_shift=2,
                             thing_label_in_seg=(11, 13)):
    """Shift the KITTI-STEP semantic labels to instance labels and masks.

    The thing labels (11 and 13 by default) are not contiguous, so each kept
    stuff label is first decreased by the number of thing labels below it,
    which makes the stuff labels contiguous, and then shifted by
    ``label_shift`` (the number of thing classes) to follow the thing labels.

    Returns:
        tuple[Tensor, Tensor]: ``(ins_labels, ins_masks)``.
    """
    return _sem2ins(gt_sem_seg, ignore_label,
                    _as_label_tuple(thing_label_in_seg), label_shift,
                    compact=True)
@TRANSFORMER_LAYER.register_module()
class KernelUpdator(nn.Module):
    """Gated fusion of a per-proposal update feature with input features.

    The update feature is projected to a pair of ``feat_channels``-wide
    parameter vectors, the input feature to a matching pair. Their product
    drives two gates which blend the normalized "out" halves, followed by a
    linear layer, a norm and an activation.

    Args:
        in_channels (int): Channels of the update feature. Defaults to 256.
        feat_channels (int): Width of the internal projections.
        out_channels (int, optional): Output width; falls back to
            ``in_channels`` when falsy.
        input_feat_shape (int | Sequence[int]): Stored on the module (an int
            is expanded to a 2-element list).
        gate_sigmoid (bool): Squash both gates with a sigmoid.
        gate_norm_act (bool): Normalize + activate the gate features first.
        activate_out (bool): Activate the "out" halves before blending.
        act_cfg (dict): Activation config.
        norm_cfg (dict): Norm config used for every norm layer.
    """

    def __init__(self,
                 in_channels=256,
                 feat_channels=64,
                 out_channels=None,
                 input_feat_shape=3,
                 gate_sigmoid=True,
                 gate_norm_act=False,
                 activate_out=False,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN')):
        super().__init__()
        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.out_channels_raw = out_channels
        self.gate_sigmoid = gate_sigmoid
        self.gate_norm_act = gate_norm_act
        self.activate_out = activate_out
        self.input_feat_shape = ([input_feat_shape] * 2 if isinstance(
            input_feat_shape, int) else input_feat_shape)
        self.act_cfg = act_cfg
        self.norm_cfg = norm_cfg
        self.out_channels = out_channels if out_channels else in_channels

        self.num_params_in = self.feat_channels
        self.num_params_out = self.feat_channels
        joint_width = self.num_params_in + self.num_params_out

        def make_norm(width):
            # build_norm_layer returns (name, layer); only the layer is used
            return build_norm_layer(norm_cfg, width)[1]

        # Layers are created in a fixed order so parameter initialization
        # consumes the RNG exactly as before.
        self.dynamic_layer = nn.Linear(self.in_channels, joint_width)
        self.input_layer = nn.Linear(self.in_channels, joint_width, 1)
        self.input_gate = nn.Linear(self.in_channels, self.feat_channels, 1)
        self.update_gate = nn.Linear(self.in_channels, self.feat_channels, 1)
        if self.gate_norm_act:
            self.gate_norm = make_norm(self.feat_channels)

        self.norm_in = make_norm(self.feat_channels)
        self.norm_out = make_norm(self.feat_channels)
        self.input_norm_in = make_norm(self.feat_channels)
        self.input_norm_out = make_norm(self.feat_channels)

        self.activation = build_activation_layer(act_cfg)

        self.fc_layer = nn.Linear(self.feat_channels, self.out_channels, 1)
        self.fc_norm = make_norm(self.out_channels)

    def forward(self, update_feature, input_feature):
        """Blend ``update_feature`` into ``input_feature`` through two gates.

        Args:
            update_feature (Tensor): Reshaped to ``(-1, in_channels)``; its
                first dimension defines the number of proposals.
            input_feature (Tensor): Reshaped to
                ``(num_proposals, -1, feat_channels)``.

        Returns:
            Tensor: Updated features whose last dimension is
            ``out_channels``.
        """
        flat_update = update_feature.reshape(-1, self.in_channels)
        num_proposals = flat_update.size(0)

        dyn_params = self.dynamic_layer(flat_update)
        dyn_in = dyn_params[:, :self.num_params_in].view(
            -1, self.feat_channels)
        dyn_out = dyn_params[:, -self.num_params_out:].view(
            -1, self.feat_channels)

        projected = self.input_layer(
            input_feature.reshape(num_proposals, -1, self.feat_channels))
        proj_in = projected[..., :self.num_params_in]
        proj_out = projected[..., -self.num_params_out:]

        # One parameter vector per proposal, broadcast over the middle axis.
        gate_feats = proj_in * dyn_in.unsqueeze(-2)
        if self.gate_norm_act:
            gate_feats = self.activation(self.gate_norm(gate_feats))

        input_gate = self.input_norm_in(self.input_gate(gate_feats))
        update_gate = self.norm_in(self.update_gate(gate_feats))
        if self.gate_sigmoid:
            input_gate = input_gate.sigmoid()
            update_gate = update_gate.sigmoid()

        dyn_out = self.norm_out(dyn_out)
        proj_out = self.input_norm_out(proj_out)
        if self.activate_out:
            dyn_out = self.activation(dyn_out)
            proj_out = self.activation(proj_out)

        # dyn_out is (num_proposals, feat_channels); unsqueeze lets it
        # broadcast against the per-position gates.
        blended = update_gate * dyn_out.unsqueeze(-2) + input_gate * proj_out

        return self.activation(self.fc_norm(self.fc_layer(blended)))
loss_weight=1.0): # super(DiceLoss, self).__init__() # self.eps = eps # self.reduction = reduction # self.loss_weight = loss_weight # self.use_sigmoid = use_sigmoid # self.numerator_eps = numerator_eps # # def forward(self, # pred, # target, # weight=None, # avg_factor=None, # reduction_override=None, # **kwargs): # if weight is not None and not torch.any(weight > 0): # return (pred * weight).sum() # 0 # assert reduction_override in (None, 'none', 'mean', 'sum') # reduction = ( # reduction_override if reduction_override else self.reduction) # pred = pred.sigmoid() # loss = self.loss_weight * dice_loss( # pred, # target, # weight, # eps=self.eps, # numerator_eps=self.numerator_eps, # reduction=reduction, # avg_factor=avg_factor, # **kwargs) # return loss ================================================ FILE: knet/video/kernel_head.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import (ConvModule, bias_init_with_prob, normal_init) from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean from mmdet.models.builder import HEADS, build_loss, build_neck from mmdet.models.losses import accuracy from mmdet.utils import get_root_logger @HEADS.register_module() class VideoConvKernelHead(nn.Module): """ This head for init mask and kernel prediction """ def __init__(self, num_proposals=100, in_channels=256, out_channels=256, num_heads=8, num_cls_fcs=1, num_seg_convs=1, num_loc_convs=1, att_dropout=False, localization_fpn=None, conv_kernel_size=1, norm_cfg=dict(type='GN', num_groups=32), semantic_fpn=True, train_cfg=None, num_classes=80, xavier_init_kernel=False, kernel_init_std=0.01, use_binary=False, proposal_feats_with_obj=False, loss_mask=None, loss_seg=None, loss_cls=None, loss_dice=None, loss_rank=None, feat_downsample_stride=1, feat_refine_stride=1, feat_refine=True, with_embed=False, feat_embed_only=False, conv_normal_init=False, mask_out_stride=4, 
hard_target=False, num_thing_classes=80, num_stuff_classes=53, mask_assign_stride=4, ignore_label=255, thing_label_in_seg=0, cat_stuff_mask=False, link_previous=False, **kwargs): super(VideoConvKernelHead, self).__init__() self.num_proposals = num_proposals self.num_cls_fcs = num_cls_fcs self.train_cfg = train_cfg self.in_channels = in_channels self.out_channels = out_channels self.num_classes = num_classes self.proposal_feats_with_obj = proposal_feats_with_obj self.sampling = False self.localization_fpn = build_neck(localization_fpn) self.semantic_fpn = semantic_fpn self.norm_cfg = norm_cfg self.num_heads = num_heads self.att_dropout = att_dropout self.mask_out_stride = mask_out_stride self.hard_target = hard_target self.conv_kernel_size = conv_kernel_size self.xavier_init_kernel = xavier_init_kernel self.kernel_init_std = kernel_init_std self.feat_downsample_stride = feat_downsample_stride self.feat_refine_stride = feat_refine_stride self.conv_normal_init = conv_normal_init self.feat_refine = feat_refine self.with_embed = with_embed self.feat_embed_only = feat_embed_only self.num_loc_convs = num_loc_convs self.num_seg_convs = num_seg_convs self.use_binary = use_binary self.num_thing_classes = num_thing_classes self.num_stuff_classes = num_stuff_classes self.mask_assign_stride = mask_assign_stride self.ignore_label = ignore_label self.thing_label_in_seg = thing_label_in_seg self.cat_stuff_mask = cat_stuff_mask self.link_previous = link_previous if loss_mask is not None: self.loss_mask = build_loss(loss_mask) else: self.loss_mask = loss_mask if loss_dice is not None: self.loss_dice = build_loss(loss_dice) else: self.loss_dice = loss_dice if loss_seg is not None: self.loss_seg = build_loss(loss_seg) else: self.loss_seg = loss_seg if loss_cls is not None: self.loss_cls = build_loss(loss_cls) else: self.loss_cls = loss_cls if loss_rank is not None: self.loss_rank = build_loss(loss_rank) else: self.loss_rank = loss_rank if self.train_cfg: self.assigner = 
def _init_layers(self):
    """Create the convolutional layers owned by this head.

    Layers are built in a fixed order: ``init_kernels``, the optional
    ``conv_seg`` classifier, the optional pair of strided refinement
    convs, and finally the ``loc_convs`` / ``seg_convs`` stacks.
    """
    kernel_size = self.conv_kernel_size
    # One output channel per proposal; the weights of this conv are also
    # cloned as initial proposal features in ``_decode_init_proposals``.
    self.init_kernels = nn.Conv2d(
        self.out_channels,
        self.num_proposals,
        kernel_size,
        padding=int(kernel_size // 2),
        bias=False)

    if self.semantic_fpn:
        self.conv_seg = nn.Conv2d(self.out_channels, self.num_classes, 1)

    def make_refine_conv():
        # 3x3 conv with stride ``feat_refine_stride``.
        return ConvModule(
            self.in_channels,
            self.out_channels,
            3,
            stride=self.feat_refine_stride,
            padding=1,
            norm_cfg=self.norm_cfg)

    def make_pointwise_conv():
        # 1x1 conv used by both the localization and the semantic stack.
        return ConvModule(
            self.in_channels,
            self.out_channels,
            1,
            norm_cfg=self.norm_cfg)

    if self.feat_downsample_stride > 1 and self.feat_refine:
        self.ins_downsample = make_refine_conv()
        self.seg_downsample = make_refine_conv()

    self.loc_convs = nn.ModuleList(
        [make_pointwise_conv() for _ in range(self.num_loc_convs)])
    self.seg_convs = nn.ModuleList(
        [make_pointwise_conv() for _ in range(self.num_seg_convs)])


def init_weights(self):
    """Initialize the localization FPN, the conv stacks and the kernels.

    The conv stacks are re-initialized with a normal distribution only
    when ``feat_downsample_stride > 1`` and ``conv_normal_init`` is set.
    ``init_kernels`` uses Xavier-uniform or a normal distribution with
    ``kernel_init_std`` depending on ``xavier_init_kernel``.
    """
    self.localization_fpn.init_weights()
    logger = get_root_logger()

    if self.feat_downsample_stride > 1 and self.conv_normal_init:
        logger.info('Initialize convs in KPN head by normal std 0.01')
        for conv_stack in (self.loc_convs, self.seg_convs):
            for module in conv_stack.modules():
                if isinstance(module, nn.Conv2d):
                    normal_init(module, std=0.01)

    if self.semantic_fpn:
        bias_seg = bias_init_with_prob(0.01)
        if self.loss_seg.use_sigmoid:
            # Sigmoid-based segmentation loss also gets a prior bias.
            normal_init(self.conv_seg, std=0.01, bias=bias_seg)
        else:
            normal_init(self.conv_seg, mean=0, std=0.01)

    if self.xavier_init_kernel:
        logger.info('Initialize kernels by xavier uniform')
        nn.init.xavier_uniform_(self.init_kernels.weight)
        return

    logger.info(
        f'Initialize kernels by normal std: {self.kernel_init_std}')
    normal_init(self.init_kernels, mean=0, std=self.kernel_init_std)
def forward_train(self,
                  img,
                  img_metas,
                  gt_masks,
                  gt_labels,
                  gt_sem_seg=None,
                  gt_sem_cls=None,
                  previous_obj_feats=None,
                  previous_mask_preds=None,
                  previous_x_feats=None):
    """Forward function in training stage.

    Decodes the initial proposals, assigns them to the ground truth one
    image at a time, and computes the losses of this head.

    Args:
        img: Input forwarded to ``_decode_init_proposals``.
        img_metas (list): One entry per image; its length is the batch
            size used here.
        gt_masks (list): Per-image ground-truth masks. Binarized with
            ``.bool().float()`` when ``self.hard_target`` is set.
        gt_labels (list): Per-image ground-truth labels.
        gt_sem_seg: Optional semantic masks forwarded to ``get_targets``.
        gt_sem_cls: Optional semantic classes forwarded to
            ``get_targets``.
        previous_obj_feats, previous_mask_preds, previous_x_feats:
            Forwarded unchanged to ``_decode_init_proposals``.

    Returns:
        tuple: ``(losses, proposal_feats, x_feats, mask_preds,
        cls_scores)``. When ``cat_stuff_mask`` is set and the module is
        in training mode, the stuff channels of ``seg_preds`` and the
        matching ``conv_seg`` kernels are appended to ``mask_preds`` and
        ``proposal_feats``.
    """
    num_imgs = len(img_metas)
    results = self._decode_init_proposals(img, img_metas,
                                          previous_obj_feats,
                                          previous_mask_preds,
                                          previous_x_feats)
    (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = results

    if self.feat_downsample_stride > 1:
        scaled_mask_preds = F.interpolate(
            mask_preds,
            scale_factor=self.feat_downsample_stride,
            mode='bilinear',
            align_corners=False)
        # Always bind ``scaled_seg_preds``: without a semantic branch
        # ``seg_preds`` is None and the name was previously left
        # undefined, which crashed in the ``self.loss`` call below.
        scaled_seg_preds = None
        if seg_preds is not None:
            scaled_seg_preds = F.interpolate(
                seg_preds,
                scale_factor=self.feat_downsample_stride,
                mode='bilinear',
                align_corners=False)
    else:
        scaled_mask_preds = mask_preds  # thing
        scaled_seg_preds = seg_preds  # stuff

    if self.hard_target:
        gt_masks = [x.bool().float() for x in gt_masks]

    if cls_scores is None:
        detached_cls_scores = [None] * num_imgs
    else:
        detached_cls_scores = cls_scores.detach()

    sampling_results = []
    for i in range(num_imgs):
        # Assignment runs on detached predictions; sampling receives the
        # non-detached masks.
        assign_result = self.assigner.assign(scaled_mask_preds[i].detach(),
                                             detached_cls_scores[i],
                                             gt_masks[i],
                                             gt_labels[i],
                                             img_meta=img_metas[i])
        sampling_result = self.sampler.sample(assign_result,
                                              scaled_mask_preds[i],
                                              gt_masks[i])
        sampling_results.append(sampling_result)

    mask_targets = self.get_targets(
        sampling_results,
        gt_masks,
        self.train_cfg,
        True,
        gt_sem_seg=gt_sem_seg,
        gt_sem_cls=gt_sem_cls)

    losses = self.loss(scaled_mask_preds, cls_scores, scaled_seg_preds,
                       proposal_feats, *mask_targets)

    if self.cat_stuff_mask and self.training:
        mask_preds = torch.cat(
            [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1)
        stuff_kernels = self.conv_seg.weight[self.
                                             num_thing_classes:].clone()
        stuff_kernels = stuff_kernels[None].expand(num_imgs,
                                                   *stuff_kernels.size())
        proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

    return losses, proposal_feats, x_feats, mask_preds, cls_scores


def loss(self,
         mask_pred,
         cls_scores,
         seg_preds,
         proposal_feats,
         labels,
         label_weights,
         mask_targets,
         mask_weights,
         seg_targets,
         reduction_override=None,
         **kwargs):
    """Compute the losses of the kernel head.

    Args:
        mask_pred (Tensor): Mask predictions; the first two dims are
            flattened together to index proposals and the last two are
            the spatial size.
        cls_scores (Tensor | None): Classification scores. The
            classification loss and accuracy are skipped when None.
        seg_preds (Tensor | None): Semantic predictions. The
            segmentation loss is skipped when None.
        proposal_feats: Unused here.
        labels (Tensor): Flattened per-proposal labels; values in
            ``[0, num_classes)`` are treated as positives.
        label_weights (Tensor): Weights passed to ``loss_cls``.
        mask_targets (Tensor): Flattened per-proposal mask targets.
        mask_weights: Unused here.
        seg_targets (Tensor): Semantic targets, flattened before use.
        reduction_override: Forwarded to ``loss_cls``.

    Returns:
        dict: Loss name to loss tensor. The same keys are produced
        whether or not the batch contains positive samples.
    """
    losses = dict()
    bg_class_ind = self.num_classes
    # note in spare rcnn num_gt == num_pos
    pos_inds = (labels >= 0) & (labels < bg_class_ind)
    num_preds = mask_pred.shape[0] * mask_pred.shape[1]

    if cls_scores is not None:
        num_pos = pos_inds.sum().float()
        avg_factor = reduce_mean(num_pos)
        assert mask_pred.shape[0] == cls_scores.shape[0]
        assert mask_pred.shape[1] == cls_scores.shape[1]
        flat_cls_scores = cls_scores.view(num_preds, -1)
        losses['loss_rpn_cls'] = self.loss_cls(
            flat_cls_scores,
            labels,
            label_weights,
            avg_factor=avg_factor,
            reduction_override=reduction_override)
        losses['rpn_pos_acc'] = accuracy(flat_cls_scores[pos_inds],
                                         labels[pos_inds])

    bool_pos_inds = pos_inds.type(torch.bool)
    # 0~self.num_classes-1 are FG, self.num_classes is BG
    H, W = mask_pred.shape[-2:]
    if pos_inds.any():
        pos_mask_pred = mask_pred.reshape(num_preds, H, W)[bool_pos_inds]
        pos_mask_targets = mask_targets[bool_pos_inds]
        losses['loss_rpn_mask'] = self.loss_mask(pos_mask_pred,
                                                 pos_mask_targets)
        losses['loss_rpn_dice'] = self.loss_dice(pos_mask_pred,
                                                 pos_mask_targets)

        if self.loss_rank is not None:
            batch_size = mask_pred.size(0)
            # Per-pixel target holding the index of the positive
            # proposal whose GT mask covers the pixel; untouched pixels
            # keep ``ignore_label``.
            rank_target = mask_targets.new_full((batch_size, H, W),
                                                self.ignore_label,
                                                dtype=torch.long)
            rank_inds = pos_inds.view(batch_size,
                                      -1).nonzero(as_tuple=False)
            batch_mask_targets = mask_targets.view(batch_size, -1, H,
                                                   W).bool()
            for i in range(batch_size):
                curr_inds = (rank_inds[:, 0] == i)
                curr_rank = rank_inds[:, 1][curr_inds]
                for j in curr_rank:
                    rank_target[i][batch_mask_targets[i][j]] = j
            losses['loss_rpn_rank'] = self.loss_rank(
                mask_pred, rank_target, ignore_index=self.ignore_label)
    else:
        # Zero-valued losses that stay attached to ``mask_pred``.
        losses['loss_rpn_mask'] = mask_pred.sum() * 0
        losses['loss_rpn_dice'] = mask_pred.sum() * 0
        if self.loss_rank is not None:
            # Same key as the positive branch, so the loss dict has an
            # identical key set on every iteration and every rank.
            losses['loss_rpn_rank'] = mask_pred.sum() * 0

    if seg_preds is not None:
        cls_channel = seg_preds.shape[1]
        flatten_seg = seg_preds.view(-1, cls_channel, H * W).permute(
            0, 2, 1).reshape(-1, cls_channel)
        flatten_seg_target = seg_targets.view(-1)
        if self.loss_seg.use_sigmoid:
            num_dense_pos = (flatten_seg_target >= 0) & (
                flatten_seg_target < bg_class_ind)
            num_dense_pos = num_dense_pos.sum().float().clamp(min=1.0)
            losses['loss_rpn_seg'] = self.loss_seg(
                flatten_seg, flatten_seg_target, avg_factor=num_dense_pos)
        else:
            losses['loss_rpn_seg'] = self.loss_seg(
                flatten_seg,
                flatten_seg_target,
                ignore_index=self.num_classes)

    return losses
def get_targets(self,
                sampling_results,
                gt_mask,
                rpn_train_cfg,
                concat=True,
                gt_sem_seg=None,
                gt_sem_cls=None):
    """Build the training targets for a batch of sampling results.

    Args:
        sampling_results (list): One sampling result per image; each
            must expose ``pos_inds``, ``neg_inds``, ``pos_masks``,
            ``neg_masks``, ``pos_gt_masks`` and ``pos_gt_labels``.
        gt_mask: Unused here.
        rpn_train_cfg: Passed to ``_get_target_single`` as ``cfg``.
        concat (bool): If True, per-image targets are concatenated
            along dim 0 (``seg_targets`` is stacked instead).
        gt_sem_seg (list | None): Per-image semantic masks.
        gt_sem_cls (list | None): Per-image semantic classes.

    Returns:
        tuple: ``(labels, label_weights, mask_targets, mask_weights,
        seg_targets)``; tensors when ``concat`` is True, otherwise
        per-image lists.
    """
    num_imgs = len(sampling_results)
    pos_inds_list = [res.pos_inds for res in sampling_results]
    neg_inds_list = [res.neg_inds for res in sampling_results]
    pos_mask_list = [res.pos_masks for res in sampling_results]
    neg_mask_list = [res.neg_masks for res in sampling_results]
    pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
    pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]

    # ``multi_apply`` walks its arguments in lockstep, so the placeholder
    # lists must have one entry per image. A fixed length of 2 silently
    # dropped the targets of every image after the second.
    if gt_sem_seg is None:
        gt_sem_seg = [None] * num_imgs
    if gt_sem_cls is None:
        gt_sem_cls = [None] * num_imgs

    results = multi_apply(
        self._get_target_single,
        pos_inds_list,
        neg_inds_list,
        pos_mask_list,
        neg_mask_list,
        pos_gt_mask_list,
        pos_gt_labels_list,
        gt_sem_seg,
        gt_sem_cls,
        cfg=rpn_train_cfg)
    (labels, label_weights, mask_targets, mask_weights,
     seg_targets) = results

    if concat:
        labels = torch.cat(labels, 0)
        label_weights = torch.cat(label_weights, 0)
        mask_targets = torch.cat(mask_targets, 0)
        mask_weights = torch.cat(mask_weights, 0)
        seg_targets = torch.stack(seg_targets, 0)
    return labels, label_weights, mask_targets, mask_weights, seg_targets
merge_cls_scores self.recursive = recursive self.post_assign = post_assign self.mask_out_stride = mask_out_stride self.hard_target = hard_target self.merge_joint = merge_joint self.assign_stages = assign_stages self.do_panoptic = do_panoptic self.num_thing_classes = num_thing_classes self.num_stuff_classes = num_stuff_classes self.mask_assign_stride = mask_assign_stride self.thing_label_in_seg = thing_label_in_seg self.num_proposals = num_proposals self.ignore_label = ignore_label self.with_track = with_track super(VideoKernelIterHead, self).__init__( mask_head=mask_head, train_cfg=train_cfg, test_cfg=test_cfg, **kwargs) # train_cfg would be None when run the test.py if train_cfg is not None: for stage in range(num_stages): assert isinstance( self.mask_sampler[stage], MaskPseudoSampler), \ 'Sparse Mask only support `MaskPseudoSampler`' def init_bbox_head(self, mask_roi_extractor, mask_head): """Initialize box head and box roi extractor. Args: mask_roi_extractor (dict): Config of box roi extractor. mask_head (dict): Config of box in box head. """ pass def init_assigner_sampler(self): """Initialize assigner and sampler for each stage.""" self.mask_assigner = [] self.mask_sampler = [] if self.train_cfg is not None: for idx, rcnn_train_cfg in enumerate(self.train_cfg): self.mask_assigner.append( build_assigner(rcnn_train_cfg.assigner)) self.current_stage = idx self.mask_sampler.append( build_sampler(rcnn_train_cfg.sampler, context=self)) def init_weights(self): for i in range(self.num_stages): self.mask_head[i].init_weights() def init_mask_head(self, mask_roi_extractor, mask_head): """Initialize mask head and mask roi extractor. Args: mask_roi_extractor (dict): Config of mask roi extractor. mask_head (dict): Config of mask in mask head. 
""" self.mask_head = nn.ModuleList() if not isinstance(mask_head, list): mask_head = [mask_head for _ in range(self.num_stages)] assert len(mask_head) == self.num_stages for head in mask_head: self.mask_head.append(build_head(head)) if self.recursive: for i in range(self.num_stages): self.mask_head[i] = self.mask_head[0] def _mask_forward(self, stage, x, object_feats, mask_preds, img_metas, previous_obj_feats=None, previous_mask_preds=None, previous_x_feats=None ): mask_head = self.mask_head[stage] cls_score, mask_preds, object_feats, x_feats, object_feats_track = mask_head( x, object_feats, mask_preds, img_metas=img_metas, previous_obj_feats=previous_obj_feats, previous_mask_preds=previous_mask_preds, previous_x_feats=previous_x_feats ) if mask_head.mask_upsample_stride > 1 and (stage == self.num_stages - 1 or self.training): scaled_mask_preds = F.interpolate( mask_preds, scale_factor=mask_head.mask_upsample_stride, align_corners=False, mode='bilinear') else: scaled_mask_preds = mask_preds mask_results = dict( cls_score=cls_score, mask_preds=mask_preds, scaled_mask_preds=scaled_mask_preds, object_feats=object_feats, object_feats_track=object_feats_track, x_feats=x_feats, ) return mask_results def forward_train(self, x, proposal_feats, mask_preds, cls_score, img_metas, gt_masks, gt_labels, gt_pids=None, gt_bboxes_ignore=None, imgs_whwh=None, gt_bboxes=None, gt_sem_seg=None, gt_sem_cls=None): num_imgs = len(img_metas) if self.mask_head[0].mask_upsample_stride > 1: prev_mask_preds = F.interpolate( mask_preds.detach(), scale_factor=self.mask_head[0].mask_upsample_stride, mode='bilinear', align_corners=False) else: prev_mask_preds = mask_preds.detach() if cls_score is not None: prev_cls_score = cls_score.detach() else: prev_cls_score = [None] * num_imgs if self.hard_target: gt_masks = [x.bool().float() for x in gt_masks] else: gt_masks = gt_masks object_feats = proposal_feats all_stage_loss = {} all_stage_mask_results = [] assign_results = [] final_sample_results = [] 
def forward_train_with_previous(self,
                  x,
                  proposal_feats,
                  mask_preds,
                  cls_score,
                  img_metas,
                  gt_masks,
                  gt_labels,
                  gt_pids=None,
                  gt_bboxes_ignore=None,
                  imgs_whwh=None,
                  gt_bboxes=None,
                  gt_sem_seg=None,
                  gt_sem_cls=None,
                  previous_obj_feats=None,
                  previous_mask_preds=None,
                  previous_x_feats=None,
                  ):
    """Run all stages in training mode, linking previous-frame inputs.

    Same stage loop as ``forward_train``, except that the
    ``previous_*`` arguments are handed to ``_mask_forward`` on the
    final stage only; every earlier stage receives ``None`` for them.

    Args:
        x: Feature input passed to every stage's mask head.
        proposal_feats: Initial object features for stage 0.
        mask_preds: Initial mask predictions for stage 0.
        cls_score: Initial class scores, or None.
        img_metas (list): One entry per image; its length is the batch
            size used here.
        gt_masks (list): Per-image ground-truth masks; binarized when
            ``self.hard_target`` is set.
        gt_labels (list): Per-image ground-truth labels.
        gt_pids, gt_bboxes_ignore, gt_bboxes: Not used in this method.
        imgs_whwh: Forwarded to each stage's ``loss``.
        gt_sem_seg, gt_sem_cls: Forwarded to each stage's
            ``get_targets``.
        previous_obj_feats, previous_mask_preds, previous_x_feats:
            Inputs from a previous frame (presumably the reference frame
            of a clip -- confirm against the caller).

    Returns:
        dict | tuple: The dict of stage losses, keyed
        ``s{stage}_{name}`` and scaled by ``stage_loss_weights``. When
        ``self.with_track`` is set, a tuple ``(losses, object_feats,
        cls_score, mask_preds, scaled_mask_preds, object_feats_track)``
        holding the final stage's outputs.
    """
    num_imgs = len(img_metas)
    # Predictions used for assignment in the first stage are brought to
    # the same resolution the stage heads upsample their outputs to.
    if self.mask_head[0].mask_upsample_stride > 1:
        prev_mask_preds = F.interpolate(
            mask_preds.detach(),
            scale_factor=self.mask_head[0].mask_upsample_stride,
            mode='bilinear',
            align_corners=False)
    else:
        prev_mask_preds = mask_preds.detach()

    if cls_score is not None:
        prev_cls_score = cls_score.detach()
    else:
        # Indexable placeholder so ``prev_cls_score[i]`` works below.
        prev_cls_score = [None] * num_imgs

    if self.hard_target:
        gt_masks = [x.bool().float() for x in gt_masks]
    else:
        gt_masks = gt_masks

    object_feats = proposal_feats
    all_stage_loss = {}
    all_stage_mask_results = []
    assign_results = []
    final_sample_results = []
    for stage in range(self.num_stages):

        # only link the last stage
        previous_obj_feats_cur = previous_obj_feats if stage == self.num_stages - 1 else None
        previous_mask_preds_cur = previous_mask_preds if stage == self.num_stages - 1 else None
        previous_x_feats_cur = previous_x_feats if stage == self.num_stages - 1 else None

        # only link the first stage
        # previous_obj_feats_cur = previous_obj_feats if stage == 0 else None
        # previous_mask_preds_cur = previous_mask_preds if stage == 0 else None
        # previous_x_feats_cur = previous_x_feats if stage == 0 else None

        mask_results = self._mask_forward(stage, x, object_feats,
                                          mask_preds, img_metas,
                                          previous_obj_feats=previous_obj_feats_cur,
                                          previous_mask_preds=previous_mask_preds_cur,
                                          previous_x_feats=previous_x_feats_cur)
        all_stage_mask_results.append(mask_results)
        mask_preds = mask_results['mask_preds']
        scaled_mask_preds = mask_results['scaled_mask_preds']
        cls_score = mask_results['cls_score']
        object_feats = mask_results['object_feats']
        object_feats_track = mask_results['object_feats_track']

        # With ``post_assign`` the assignment below uses this stage's own
        # (detached) outputs instead of the previous stage's.
        if self.post_assign:
            prev_mask_preds = scaled_mask_preds.detach()
            prev_cls_score = cls_score.detach()

        sampling_results = []
        # Stages at or beyond ``assign_stages`` keep the ``assign_results``
        # computed by the last assigning stage.
        if stage < self.assign_stages:
            assign_results = []
        for i in range(num_imgs):
            if stage < self.assign_stages:
                # Only the first ``num_proposals`` masks and the first
                # ``num_thing_classes`` score columns take part in
                # the assignment.
                mask_for_assign = prev_mask_preds[i][:self.num_proposals]
                if prev_cls_score[i] is not None:
                    cls_for_assign = prev_cls_score[
                        i][:self.num_proposals, :self.num_thing_classes]
                else:
                    cls_for_assign = None
                assign_result = self.mask_assigner[stage].assign(
                    mask_for_assign,
                    cls_for_assign,
                    gt_masks[i],
                    gt_labels[i],
                    img_meta=img_metas[i])
                assign_results.append(assign_result)
            sampling_result = self.mask_sampler[stage].sample(
                assign_results[i], scaled_mask_preds[i], gt_masks[i])
            sampling_results.append(sampling_result)
        mask_targets = self.mask_head[stage].get_targets(
            sampling_results,
            gt_masks,
            gt_labels,
            self.train_cfg[stage],
            True,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls)

        single_stage_loss = self.mask_head[stage].loss(
            object_feats,
            cls_score,
            scaled_mask_preds,
            *mask_targets,
            imgs_whwh=imgs_whwh)
        for key, value in single_stage_loss.items():
            all_stage_loss[f's{stage}_{key}'] = value * \
                                    self.stage_loss_weights[stage]

        # Without ``post_assign`` the next stage assigns with this stage's
        # (detached) outputs.
        if not self.post_assign:
            prev_mask_preds = scaled_mask_preds.detach()
            prev_cls_score = cls_score.detach()

        if stage == self.num_stages - 1:
            final_sample_results.extend(sampling_results)

    if self.with_track:
        return all_stage_loss, object_feats, cls_score, mask_preds, scaled_mask_preds, object_feats_track
    else:
        return all_stage_loss
def simple_test_with_previous(self,
                              x,
                              proposal_feats,
                              mask_preds,
                              cls_score,
                              img_metas,
                              previous_obj_feats=None,
                              previous_mask_preds=None,
                              previous_x_feats=None,
                              is_first=False,
                              ):
    """Test-time forward that links previous-frame inputs.

    The ``previous_*`` arguments reach ``_mask_forward`` on the final
    stage only. With ``is_first`` set, the final object features are
    used in place of the tracking features handed to ``get_panoptic``.

    Returns:
        list | tuple: Per-image results, or ``(results, object_feats,
        cls_score, mask_preds, scaled_mask_preds)`` when
        ``self.with_track`` is set.
    """
    num_imgs = len(img_metas)
    last_stage = self.num_stages - 1

    object_feats = proposal_feats
    for stage in range(self.num_stages):
        # only link the last stage inputs
        linked = stage == last_stage
        stage_out = self._mask_forward(
            stage, x, object_feats, mask_preds, img_metas,
            previous_obj_feats=previous_obj_feats if linked else None,
            previous_mask_preds=previous_mask_preds if linked else None,
            previous_x_feats=previous_x_feats if linked else None
        )
        object_feats = stage_out['object_feats']
        cls_score = stage_out['cls_score']
        mask_preds = stage_out['mask_preds']
        scaled_mask_preds = stage_out['scaled_mask_preds']
        object_feats_track = stage_out['object_feats_track']

    final_head = self.mask_head[-1]
    num_classes = final_head.num_classes

    if final_head.loss_cls.use_sigmoid:
        cls_score = cls_score.sigmoid()
    else:
        # Softmax variant: the last column is dropped.
        cls_score = cls_score.softmax(-1)[..., :-1]

    if is_first:
        object_feats_track = object_feats

    results = []
    if self.do_panoptic:
        for img_id in range(num_imgs):
            results.append(
                self.get_panoptic(cls_score[img_id],
                                  scaled_mask_preds[img_id],
                                  self.test_cfg,
                                  img_metas[img_id],
                                  object_feats_track[img_id]))
    else:
        for img_id in range(num_imgs):
            # Top-k over the flattened (proposal, class) score grid.
            flat_scores = cls_score[img_id].flatten(0, 1)
            scores_per_img, topk_indices = flat_scores.topk(
                self.test_cfg.max_per_img, sorted=True)
            mask_indices = topk_indices // num_classes
            labels_per_img = topk_indices % num_classes
            masks_per_img = scaled_mask_preds[img_id][mask_indices]
            results.append(
                final_head.get_seg_masks(masks_per_img, labels_per_img,
                                         scores_per_img, self.test_cfg,
                                         img_metas[img_id]))

    if not self.with_track:
        return results
    return results, object_feats, cls_score, mask_preds, scaled_mask_preds


def simple_test_mask_preds(self,
                           x,
                           proposal_feats,
                           mask_preds,
                           cls_score,
                           img_metas):
    """Run every stage and return the raw final predictions.

    Returns:
        tuple: ``(object_feats, cls_score, mask_preds,
        scaled_mask_preds)`` of the last stage, with ``cls_score``
        passed through sigmoid, or through softmax with the last column
        dropped, according to the last head's ``loss_cls.use_sigmoid``.
    """
    object_feats = proposal_feats
    for stage in range(self.num_stages):
        stage_out = self._mask_forward(stage, x, object_feats, mask_preds,
                                       img_metas)
        object_feats = stage_out['object_feats']
        cls_score = stage_out['cls_score']
        mask_preds = stage_out['mask_preds']
        scaled_mask_preds = stage_out['scaled_mask_preds']

    use_sigmoid = self.mask_head[-1].loss_cls.use_sigmoid
    cls_score = (cls_score.sigmoid()
                 if use_sigmoid else cls_score.softmax(-1)[..., :-1])

    return object_feats, cls_score, mask_preds, scaled_mask_preds
def get_masked_feature(self, x, mask_pred):
    """Sum the features of ``x`` inside each thresholded mask.

    ``mask_pred`` is passed through a sigmoid and binarized at 0.5; the
    binary masks are contracted with ``x`` over the two spatial dims
    (``'bnhw,bchw->bnc'``).
    """
    hard_masks = (mask_pred.sigmoid() > 0.5).float()
    return torch.einsum('bnhw,bchw->bnc', hard_masks, x)


def aug_test(self, features, proposal_list, img_metas, rescale=False):
    """Test-time augmentation is not available for this head.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError('SparseMask does not support `aug_test`')


def forward_dummy(self, x, proposal_boxes, proposal_feats, img_metas):
    """Dummy forward function when do the flops computing.

    Initial masks are the batched product of ``proposal_feats`` with
    the spatially flattened ``x``. Each stage is run on those same
    initial masks and features; stage outputs are collected, not
    chained.
    """
    batch = len(img_metas)
    n_proposals = proposal_feats.size(1)
    C, H, W = x.shape[-3:]
    flat_x = x.view(batch, C, -1)
    mask_preds = proposal_feats.bmm(flat_x).view(batch, n_proposals, H, W)
    return [
        self._mask_forward(stage, x, proposal_feats, mask_preds, img_metas)
        for stage in range(self.num_stages)
    ]
num_thing_classes] thing_mask_preds = mask_preds[:self.num_proposals] thing_scores, topk_indices = thing_scores.flatten(0, 1).topk( self.test_cfg.max_per_img, sorted=True) mask_indices = topk_indices // self.num_thing_classes thing_labels = topk_indices % self.num_thing_classes masks_per_img = thing_mask_preds[mask_indices] thing_masks = self.mask_head[-1].rescale_masks(masks_per_img, img_meta) # thing obj_feat thing_obj_feat = obj_feat[:self.num_proposals] thing_obj_feat = thing_obj_feat[mask_indices] if not self.merge_joint: thing_masks = thing_masks > test_cfg.mask_thr bbox_result, segm_result, thing_mask_preds = self.mask_head[-1].segm2result( thing_masks, thing_labels, thing_scores) stuff_scores = cls_scores[ self.num_proposals:][:, self.num_thing_classes:].diag() stuff_scores, stuff_inds = torch.sort(stuff_scores, descending=True) stuff_masks = mask_preds[self.num_proposals:][stuff_inds] stuff_masks = self.mask_head[-1].rescale_masks(stuff_masks, img_meta) # stuff obj_feat stuff_obj_feat = obj_feat[self.num_proposals:][stuff_inds] if not self.merge_joint: stuff_masks = stuff_masks > test_cfg.mask_thr if self.merge_joint: stuff_labels = stuff_inds + self.num_thing_classes panoptic_result, thing_obj_feat = self.merge_stuff_thing_stuff_joint(thing_masks, thing_labels, thing_scores, stuff_masks, stuff_labels, stuff_scores, test_cfg.merge_stuff_thing, thing_obj_feat, stuff_obj_feat ) else: stuff_labels = stuff_inds + 1 panoptic_result, thing_obj_feat = self.merge_stuff_thing_thing_first(thing_masks, thing_labels, thing_scores, stuff_masks, stuff_labels, stuff_scores, test_cfg.merge_stuff_thing, thing_obj_feat, stuff_obj_feat) return bbox_result, segm_result, thing_mask_preds, panoptic_result, thing_obj_feat def split_thing_stuff(self, mask_preds, det_labels, cls_scores): thing_scores = cls_scores[:self.num_proposals] thing_masks = mask_preds[:self.num_proposals] thing_labels = det_labels[:self.num_proposals] stuff_labels = det_labels[self.num_proposals:] 
def merge_stuff_thing_thing_first(self,
                                  thing_masks,
                                  thing_labels,
                                  thing_scores,
                                  stuff_masks,
                                  stuff_labels,
                                  stuff_scores,
                                  merge_cfg=None,
                                  thing_obj_feat=None,
                                  stuff_obj_feat=None):
    """Merge thing and stuff masks into a panoptic map, things first.

    Instances are pasted in descending score order; a stuff class then
    fills whatever pixels are still empty.

    Args:
        thing_masks (Tensor): Instance masks; the last two dims give the
            output size. Converted to bool.
        thing_labels (Tensor): Category of each instance.
        thing_scores (Tensor): Score of each instance.
        stuff_masks (Tensor): Stuff masks; converted to bool.
        stuff_labels (Tensor): Category of each stuff mask.
        stuff_scores (Tensor): Score of each stuff mask.
        merge_cfg: Must provide ``instance_score_thr``, ``iou_thr`` and
            ``stuff_max_area``.
        thing_obj_feat (Tensor | None): Per-instance features, indexed
            like ``thing_masks``.
        stuff_obj_feat: Unused here.

    Returns:
        tuple: ``((panoptic_seg, segments_info), kept_feats)`` where
        ``panoptic_seg`` is a numpy int32 map of segment ids,
        ``segments_info`` is a list of dicts, and ``kept_feats`` holds
        the rows of ``thing_obj_feat`` for the kept instances, in the
        order they appear in ``segments_info`` (None when
        ``thing_obj_feat`` is None).
    """
    H, W = thing_masks.shape[-2:]
    panoptic_seg = thing_masks.new_zeros((H, W), dtype=torch.int32)
    thing_masks = thing_masks.to(
        dtype=torch.bool, device=panoptic_seg.device)
    stuff_masks = stuff_masks.to(
        dtype=torch.bool, device=panoptic_seg.device)

    # sort instance outputs by scores
    sorted_inds = torch.argsort(-thing_scores)
    current_segment_id = 0
    segments_info = []
    # Indices into the ORIGINAL ``thing_*`` tensors. The features are
    # deliberately not reordered: reordering them by ``sorted_inds`` and
    # then indexing with these ids would apply the permutation twice.
    instance_ids = []

    # Add instances one-by-one, check for overlaps with existing ones
    for inst_id in sorted_inds:
        score = thing_scores[inst_id].item()
        if score < merge_cfg.instance_score_thr:
            # Scores are visited in descending order; nothing later can
            # pass the threshold.
            break
        mask = thing_masks[inst_id]  # H,W
        mask_area = mask.sum().item()
        if mask_area == 0:
            continue

        intersect = mask & (panoptic_seg > 0)
        intersect_area = intersect.sum().item()
        if intersect_area * 1.0 / mask_area > merge_cfg.iou_thr:
            continue

        if intersect_area > 0:
            # Keep only the part not claimed by a higher-scored segment.
            mask = mask & (panoptic_seg == 0)
        mask_area = mask.sum().item()
        if mask_area == 0:
            continue

        current_segment_id += 1
        panoptic_seg[mask] = current_segment_id
        segments_info.append({
            'id': current_segment_id,
            'isthing': True,
            'score': score,
            'category_id': thing_labels[inst_id].item(),
            'instance_id': inst_id.item(),
        })
        instance_ids.append(inst_id.item())

    # Add semantic results to remaining empty areas
    sorted_inds = torch.argsort(-stuff_scores)
    sorted_stuff_labels = stuff_labels[sorted_inds]
    # paste semantic masks following the order of scores
    processed_labels = set()
    for semantic_label in sorted_stuff_labels:
        semantic_label = semantic_label.item()
        if semantic_label in processed_labels:
            continue
        processed_labels.add(semantic_label)
        # All masks of one class are merged into a single segment.
        sem_inds = stuff_labels == semantic_label
        sem_masks = stuff_masks[sem_inds].sum(0).bool()
        mask = sem_masks & (panoptic_seg == 0)
        mask_area = mask.sum().item()
        if mask_area < merge_cfg.stuff_max_area:
            continue
        current_segment_id += 1
        panoptic_seg[mask] = current_segment_id
        segments_info.append({
            'id': current_segment_id,
            'isthing': False,
            'category_id': semantic_label,
            'area': mask_area,
        })

    kept_feats = None
    if thing_obj_feat is not None:
        kept_feats = thing_obj_feat[instance_ids]
    return (panoptic_seg.cpu().numpy(), segments_info), kept_feats
in sorted_inds: score = thing_scores[inst_id].item() if score < merge_cfg.instance_score_thr: break mask = thing_masks[inst_id] # H,W mask_area = mask.sum().item() if mask_area == 0: continue intersect = (mask > 0) & (panoptic_seg > 0) intersect_area = intersect.sum().item() if intersect_area * 1.0 / mask_area > merge_cfg.iou_thr: continue if intersect_area > 0: mask = mask & (panoptic_seg == 0) mask_area = mask.sum().item() if mask_area == 0: continue current_segment_id += 1 panoptic_seg[mask.bool()] = current_segment_id segments_info.append({ 'id': current_segment_id, 'isthing': True, 'score': score, 'category_id': thing_labels[inst_id].item(), 'instance_id': inst_id.item(), }) instance_ids.append(inst_id.item()) return (panoptic_seg.cpu().numpy(), segments_info), thing_obj_feat[instance_ids] def merge_stuff_thing_stuff_joint(self, thing_masks, thing_labels, thing_scores, stuff_masks, stuff_labels, stuff_scores, merge_cfg=None, thing_obj=None, stuff_obj=None ): H, W = thing_masks.shape[-2:] panoptic_seg = thing_masks.new_zeros((H, W), dtype=torch.int32) total_masks = torch.cat([thing_masks, stuff_masks], dim=0) total_scores = torch.cat([thing_scores, stuff_scores], dim=0) total_labels = torch.cat([thing_labels, stuff_labels], dim=0) obj_fea = torch.cat([thing_obj, stuff_obj], dim=0) cur_prob_masks = total_scores.view(-1, 1, 1) * total_masks segments_info = [] cur_mask_ids = cur_prob_masks.argmax(0) # sort instance outputs by scores sorted_inds = torch.argsort(-total_scores) current_segment_id = 0 sort_obj_fea = obj_fea things_ids = [] for k in sorted_inds: pred_class = total_labels[k].item() isthing = pred_class < self.num_thing_classes if isthing and total_scores[k] < merge_cfg.instance_score_thr: continue mask = cur_mask_ids == k mask_area = mask.sum().item() original_area = (total_masks[k] >= 0.5).sum().item() if mask_area > 0 and original_area > 0: if mask_area / original_area < merge_cfg.overlap_thr: continue current_segment_id += 1 panoptic_seg[mask] = 
current_segment_id if isthing: segments_info.append({ 'id': current_segment_id, 'isthing': isthing, 'score': total_scores[k].item(), 'category_id': pred_class, # 0, num_thing - 1 'instance_id': k.item(), }) things_ids.append(k.item()) else: segments_info.append({ 'id': current_segment_id, 'isthing': isthing, 'category_id': pred_class - self.num_thing_classes + 1, # 1, num_stuff 'area': mask_area, }) return (panoptic_seg.cpu().numpy(), segments_info), sort_obj_fea[things_ids] ================================================ FILE: knet/video/kernel_update_head.py ================================================ import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import (ConvModule, bias_init_with_prob, build_activation_layer, build_norm_layer) from mmcv.runner import force_fp32 from mmdet.core import multi_apply from mmdet.models.builder import HEADS, build_loss from mmdet.models.dense_heads.atss_head import reduce_mean from mmdet.models.losses import accuracy from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention, build_transformer_layer from mmdet.utils import get_root_logger from unitrack.mask import mask2box, tensor_mask2box @HEADS.register_module() class VideoKernelUpdateHead(nn.Module): def __init__(self, num_classes=80, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_mask_fcs=3, feedforward_channels=2048, in_channels=256, out_channels=256, dropout=0.0, mask_thr=0.5, act_cfg=dict(type='ReLU', inplace=True), ffn_act_cfg=dict(type='ReLU', inplace=True), conv_kernel_size=3, feat_transform_cfg=None, hard_mask_thr=0.5, kernel_init=False, with_ffn=True, mask_out_stride=4, relative_coors=False, relative_coors_off=False, feat_gather_stride=1, mask_transform_stride=1, mask_upsample_stride=1, num_thing_classes=80, num_stuff_classes=53, mask_assign_stride=4, ignore_label=255, thing_label_in_seg=0, previous=None, previous_x_feat=None, previous_link=None, # seg/cls embeddings previous_type=None, # tracking embeddings 
previous_detach=False, previous_detach_link=False, # whether detach linl query previous_link_detach=False, kernel_updator_cfg=dict( type='DynamicConv', in_channels=256, feat_channels=64, out_channels=256, input_feat_shape=1, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_rank=None, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0), loss_dice=dict(type='DiceLoss', loss_weight=3.0), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0)): super(VideoKernelUpdateHead, self).__init__() self.num_classes = num_classes self.loss_cls = build_loss(loss_cls) self.loss_mask = build_loss(loss_mask) self.loss_dice = build_loss(loss_dice) if loss_rank is not None: self.loss_rank = build_loss(loss_rank) else: self.loss_rank = loss_rank self.in_channels = in_channels self.out_channels = out_channels self.mask_thr = mask_thr self.fp16_enabled = False self.dropout = dropout self.num_heads = num_heads self.hard_mask_thr = hard_mask_thr self.kernel_init = kernel_init self.with_ffn = with_ffn self.mask_out_stride = mask_out_stride self.relative_coors = relative_coors self.relative_coors_off = relative_coors_off self.conv_kernel_size = conv_kernel_size self.feat_gather_stride = feat_gather_stride self.mask_transform_stride = mask_transform_stride self.mask_upsample_stride = mask_upsample_stride self.num_thing_classes = num_thing_classes self.num_stuff_classes = num_stuff_classes self.mask_assign_stride = mask_assign_stride self.ignore_label = ignore_label self.thing_label_in_seg = thing_label_in_seg self.attention = MultiheadAttention( in_channels * conv_kernel_size ** 2, num_heads, dropout) self.attention_norm = build_norm_layer( dict(type='LN'), in_channels * conv_kernel_size ** 2)[1] self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg) if feat_transform_cfg is not None: kernel_size = feat_transform_cfg.pop('kernel_size', 1) self.feat_transform = ConvModule( in_channels, in_channels, 
kernel_size, stride=feat_gather_stride, padding=int(feat_gather_stride // 2), **feat_transform_cfg) else: self.feat_transform = None if self.with_ffn: self.ffn = FFN( in_channels, feedforward_channels, num_ffn_fcs, act_cfg=ffn_act_cfg, dropout=dropout) self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1] self.cls_fcs = nn.ModuleList() for _ in range(num_cls_fcs): self.cls_fcs.append( nn.Linear(in_channels, in_channels, bias=False)) self.cls_fcs.append( build_norm_layer(dict(type='LN'), in_channels)[1]) self.cls_fcs.append(build_activation_layer(act_cfg)) if self.loss_cls.use_sigmoid: self.fc_cls = nn.Linear(in_channels, self.num_classes) else: self.fc_cls = nn.Linear(in_channels, self.num_classes + 1) self.mask_fcs = nn.ModuleList() for _ in range(num_mask_fcs): self.mask_fcs.append( nn.Linear(in_channels, in_channels, bias=False)) self.mask_fcs.append( build_norm_layer(dict(type='LN'), in_channels)[1]) self.mask_fcs.append(build_activation_layer(act_cfg)) self.fc_mask = nn.Linear(in_channels, out_channels) self.previous = previous self.previous_type = previous_type self.previous_link = previous_link self.previous_x_feat = previous_x_feat self.previous_detach = previous_detach self.previous_detach_link = previous_detach_link self.previous_link_detach = previous_link_detach if self.previous is not None: _in_channels = self.in_channels _conv_kernel_size = self.conv_kernel_size _num_head = 8 _dropout = 0. 
# tracking embedding if self.previous_type == "ffn": self.attention_previous = MultiheadAttention( _in_channels * _conv_kernel_size ** 2, _num_head, _dropout, ) _, self.attention_previous_norm = build_norm_layer( dict(type='LN'), _in_channels * _conv_kernel_size ** 2 ) # add link ffn self.link_ffn = FFN( in_channels, feedforward_channels, num_ffn_fcs, act_cfg=ffn_act_cfg, dropout=dropout) self.link_ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1] elif self.previous_type == "update" or self.previous_type == "update_obj": self.attention_previous_update_track = build_transformer_layer(kernel_updator_cfg) self.attention_previous_track = MultiheadAttention( _in_channels * _conv_kernel_size ** 2, _num_head, _dropout, ) _, self.attention_previous_norm_track = build_norm_layer( dict(type='LN'), _in_channels * _conv_kernel_size ** 2 ) # add link ffn self.link_ffn_track = FFN( in_channels, feedforward_channels, num_ffn_fcs, act_cfg=ffn_act_cfg, dropout=dropout) self.link_ffn_norm_track = build_norm_layer(dict(type='LN'), in_channels)[1] # seg and cls embedding Link if self.previous_link == "update_dynamic_cov": _in_channels = self.in_channels _conv_kernel_size = self.conv_kernel_size _num_head = 8 _dropout = 0. self.attention_previous_update_link = build_transformer_layer(kernel_updator_cfg) self.attention_previous_link = MultiheadAttention( _in_channels * _conv_kernel_size ** 2, _num_head, _dropout, ) _, self.attention_previous_norm_link = build_norm_layer( dict(type='LN'), _in_channels * _conv_kernel_size ** 2 ) # add link ffn self.link_ffn_link = FFN( in_channels, feedforward_channels, num_ffn_fcs, act_cfg=ffn_act_cfg, dropout=dropout) self.link_ffn_norm_link = build_norm_layer(dict(type='LN'), in_channels)[1] elif self.previous_link == "link_atten": _in_channels = self.in_channels _conv_kernel_size = self.conv_kernel_size _num_head = 8 _dropout = 0. 
def init_weights(self):
    """Use xavier initialization for all weight parameters and set the
    classification head bias to a specific value when focal loss is used."""
    for p in self.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
        # Parameters with dim <= 1 (e.g. layer-norm weight and bias) keep
        # their default initialization.
    if self.loss_cls.use_sigmoid:
        bias_init = bias_init_with_prob(0.01)
        nn.init.constant_(self.fc_cls.bias, bias_init)
    if self.kernel_init:
        logger = get_root_logger()
        logger.info(
            'mask kernel in mask head is normal initialized by std 0.01')
        nn.init.normal_(self.fc_mask.weight, mean=0, std=0.01)


def forward(self,
            x,
            proposal_feat,
            mask_preds,
            prev_cls_score=None,
            mask_shape=None,
            img_metas=None,
            previous_obj_feats=None,
            previous_mask_preds=None,
            previous_x_feats=None):
    """Update the kernels once and predict new class scores and masks.

    Args:
        x (Tensor): Feature map, used as ``[B, C, H, W]``.
        proposal_feat (Tensor): Current kernels, reshaped here from
            ``[B, N, C, K, K]`` to ``[B, N, K*K, C]``.
        mask_preds (Tensor): Mask logits ``[B, N, h, w]``; resized to the
            feature resolution when the sizes differ.
        prev_cls_score: Not used by this method.
        mask_shape (tuple, optional): When given and its height differs from
            the predicted mask height, the new masks are resized to it.
        img_metas: Not used by this method.
        previous_obj_feats (Tensor, optional): Kernels of the previous frame,
            laid out like ``proposal_feat``. Enables the link / tracking
            branches selected by ``self.previous_link`` and
            ``self.previous_type``.
        previous_mask_preds: Not used by this method.
        previous_x_feats (Tensor, optional): Only passed through
            ``self.feat_transform`` when that module exists.

    Returns:
        tuple: ``(cls_score, new_mask_preds, obj_feat, x_feat, track_feat)``.
        ``obj_feat`` and ``track_feat`` are reshaped to
        ``[B, N, C, K, K]``; ``track_feat`` is ``None`` unless previous
        kernels were given and ``self.previous_type`` selected a tracking
        branch.
    """
    N, num_proposals = proposal_feat.shape[:2]
    if self.feat_transform is not None:
        x = self.feat_transform(x)
        if previous_x_feats is not None:
            previous_x_feats = self.feat_transform(previous_x_feats)

    C, H, W = x.shape[-3:]
    kernel_size = self.conv_kernel_size
    flat_dim = self.in_channels * kernel_size ** 2

    mask_h, mask_w = mask_preds.shape[-2:]
    if mask_h != H or mask_w != W:
        gather_mask = F.interpolate(
            mask_preds, (H, W), align_corners=False, mode='bilinear')
    else:
        gather_mask = mask_preds

    # Hard (binarized) masks are used to pool the features of each proposal.
    sigmoid_masks = (gather_mask.sigmoid() > self.hard_mask_thr).float()

    # einsum is faster than bmm by 30%
    x_feat = torch.einsum('bnhw,bchw->bnc', sigmoid_masks, x)

    # obj_feat in shape [B, N, C, K, K] -> [B, N, C, K*K] -> [B, N, K*K, C]
    proposal_feat = proposal_feat.reshape(
        N, num_proposals, self.in_channels, -1).permute(0, 1, 3, 2)

    # Optionally detach the previous outputs. The None check matters: the
    # head is also called without previous kernels.
    if (self.training and self.previous_detach
            and previous_obj_feats is not None):
        previous_obj_feats = previous_obj_feats.detach()

    # Stays None unless one of the tracking branches below fills it in.
    previous_obj_feat_track = None

    # Update the current kernels with the (dynamically updated) previous
    # kernels: seg / cls embedding link.
    if previous_obj_feats is not None and \
            self.previous_link == "update_dynamic_cov":
        previous_obj_feats_link = previous_obj_feats.reshape(
            N, num_proposals, self.in_channels, -1).permute(0, 1, 3, 2)
        if self.training and self.previous_detach_link:
            previous_obj_feats_link = previous_obj_feats_link.detach()
        previous_obj_feats_update = self.attention_previous_update_link(
            x_feat, previous_obj_feats_link)
        previous_obj_feats_update = previous_obj_feats_update.reshape(
            N, num_proposals, -1).permute(1, 0, 2)
        cur_obj_feat = proposal_feat.reshape(
            N, num_proposals, flat_dim).permute(1, 0, 2)
        cur_obj_feat = self.attention_previous_norm_link(
            self.attention_previous_link(
                query=cur_obj_feat,
                key=previous_obj_feats_update,
                value=previous_obj_feats_update,
                identity=cur_obj_feat),
        )
        cur_obj_feat = cur_obj_feat.permute(1, 0, 2)
        # [B, N, K*K*C] -> [B, N, K*K, C]
        cur_obj_feat = cur_obj_feat.reshape(
            N, num_proposals, -1, self.in_channels)
        proposal_feat = self.link_ffn_norm_link(
            self.link_ffn_link(cur_obj_feat))

    # Same link, but attending to the previous kernels directly.
    if previous_obj_feats is not None and self.previous_link == "link_atten":
        previous_obj_feats_link = previous_obj_feats.reshape(
            N, num_proposals, self.in_channels, -1).permute(0, 1, 3, 2)
        previous_obj_feats_update = previous_obj_feats_link.reshape(
            N, num_proposals, -1).permute(1, 0, 2)
        cur_obj_feat = proposal_feat.reshape(
            N, num_proposals, flat_dim).permute(1, 0, 2)
        cur_obj_feat = self.attention_previous_norm_link(
            self.attention_previous_link(
                query=cur_obj_feat,
                key=previous_obj_feats_update,
                value=previous_obj_feats_update,
                identity=cur_obj_feat),
        )
        cur_obj_feat = cur_obj_feat.permute(1, 0, 2)
        # [B, N, K*K*C] -> [B, N, K*K, C]
        cur_obj_feat = cur_obj_feat.reshape(
            N, num_proposals, -1, self.in_channels)
        proposal_feat = self.link_ffn_norm_link(
            self.link_ffn_link(cur_obj_feat))

    # Update the current kernels.
    obj_feat = self.kernel_update_conv(x_feat, proposal_feat)

    # [B, N, K*K, C] -> [B, N, K*K*C] -> [N, B, K*K*C]
    obj_feat = obj_feat.reshape(N, num_proposals, -1).permute(1, 0, 2)
    obj_feat = self.attention_norm(self.attention(obj_feat))
    # [N, B, K*K*C] -> [B, N, K*K*C]
    obj_feat = obj_feat.permute(1, 0, 2)

    # obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
    obj_feat = obj_feat.reshape(N, num_proposals, -1, self.in_channels)

    # FFN
    if self.with_ffn:
        obj_feat = self.ffn_norm(self.ffn(obj_feat))

    # Tracking part: link previous and current kernels when previous kernels
    # are available.
    if previous_obj_feats is not None:
        # previous_obj_feats (b, n, c, k, k) -> (b, n, c, k*k)
        # -> (b, n, k*k, c)
        if self.previous_type == "ffn":
            previous_obj_feats = previous_obj_feats.reshape(
                N, num_proposals, self.in_channels, -1).permute(0, 1, 3, 2)
            cur_obj_feat = obj_feat.reshape(
                N, num_proposals, flat_dim).permute(1, 0, 2)
            previous_obj_feats = previous_obj_feats.reshape(
                N, num_proposals, flat_dim).permute(1, 0, 2)
            previous_obj_feat = self.attention_previous_norm(
                self.attention_previous(
                    query=cur_obj_feat,
                    key=previous_obj_feats,
                    value=previous_obj_feats,
                    identity=cur_obj_feat),
            )
            previous_obj_feat = previous_obj_feat.permute(1, 0, 2)
            # [B, N, K*K*C] -> [B, N, K*K, C]
            previous_obj_feat_track = previous_obj_feat.reshape(
                N, num_proposals, -1, self.in_channels)
            previous_obj_feat_track = self.link_ffn_norm(
                self.link_ffn(previous_obj_feat_track))
        elif self.previous_type in ("update", "update_obj"):
            # Both variants were marked "not work" by the original authors.
            previous_obj_feats = previous_obj_feats.reshape(
                N, num_proposals, self.in_channels, -1).permute(0, 1, 3, 2)
            if self.previous_type == "update":
                update_input = x_feat
            else:
                update_input = obj_feat.squeeze(2)
            previous_obj_feats_track = self.attention_previous_update_track(
                update_input, previous_obj_feats)
            previous_obj_feats_track = previous_obj_feats_track.reshape(
                N, num_proposals, self.in_channels, -1).permute(0, 1, 3, 2)
            cur_obj_feat = obj_feat.reshape(
                N, num_proposals, flat_dim).permute(1, 0, 2)
            previous_obj_feats_track = previous_obj_feats_track.reshape(
                N, num_proposals, flat_dim).permute(1, 0, 2)
            previous_obj_feats_track = self.attention_previous_norm_track(
                self.attention_previous_track(
                    query=cur_obj_feat,
                    key=previous_obj_feats_track,
                    value=previous_obj_feats_track,
                    identity=cur_obj_feat),
            )
            previous_obj_feats_track = previous_obj_feats_track.permute(
                1, 0, 2)
            # [B, N, K*K*C] -> [B, N, K*K, C]
            previous_obj_feats_track = previous_obj_feats_track.reshape(
                N, num_proposals, -1, self.in_channels)
            previous_obj_feat_track = self.link_ffn_norm_track(
                self.link_ffn_track(previous_obj_feats_track))

    cls_feat = obj_feat.sum(-2)
    mask_feat = obj_feat

    for cls_layer in self.cls_fcs:
        cls_feat = cls_layer(cls_feat)
    for reg_layer in self.mask_fcs:
        mask_feat = reg_layer(mask_feat)

    cls_score = self.fc_cls(cls_feat).view(N, num_proposals, -1)
    # [B, N, K*K, C] -> [B, N, C, K*K]
    mask_feat = self.fc_mask(mask_feat).permute(0, 1, 3, 2)

    if (self.mask_transform_stride == 2 and self.feat_gather_stride == 1):
        mask_x = F.interpolate(
            x, scale_factor=0.5, mode='bilinear', align_corners=False)
        H, W = mask_x.shape[-2:]
    else:
        mask_x = x

    # The original authors measured a per-image conv to be faster and lighter
    # than the unfold + einsum formulation, hence the loop below.
    # [B, N, C, K*K] -> [B, N, C, K, K]
    mask_feat = mask_feat.reshape(N, num_proposals, C, kernel_size,
                                  kernel_size)
    new_mask_preds = torch.cat([
        F.conv2d(
            mask_x[i:i + 1], mask_feat[i], padding=int(kernel_size // 2))
        for i in range(N)
    ], dim=0)
    new_mask_preds = new_mask_preds.reshape(N, num_proposals, H, W)
    if self.mask_transform_stride == 2:
        new_mask_preds = F.interpolate(
            new_mask_preds,
            scale_factor=2,
            mode='bilinear',
            align_corners=False)

    if mask_shape is not None and mask_shape[0] != H:
        new_mask_preds = F.interpolate(
            new_mask_preds,
            mask_shape,
            align_corners=False,
            mode='bilinear')

    # [B, N, K*K, C] -> [B, N, C, K, K]
    obj_feat_out = obj_feat.permute(0, 1, 3, 2).reshape(
        N, num_proposals, self.in_channels, kernel_size, kernel_size)
    track_out = None
    if previous_obj_feats is not None and previous_obj_feat_track is not None:
        track_out = previous_obj_feat_track.permute(0, 1, 3, 2).reshape(
            N, num_proposals, self.in_channels, kernel_size, kernel_size)
    return cls_score, new_mask_preds, obj_feat_out, x_feat, track_out
def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                       pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                       cfg):
    """Build classification and mask targets for the samples of one image.

    Args:
        pos_inds (Tensor): Indices of the positive samples.
        neg_inds (Tensor): Indices of the negative samples.
        pos_mask (Tensor): Masks of the positive samples; only its size,
            dtype and device are used here.
        neg_mask (Tensor): Masks of the negative samples; only its first
            dimension is used here.
        pos_gt_mask (Tensor): Ground-truth masks assigned to the positives.
        pos_gt_labels (Tensor): Ground-truth labels assigned to the positives.
        gt_sem_seg (Tensor | None): Ground-truth stuff masks of the image.
        gt_sem_cls (Tensor | None): Labels of ``gt_sem_seg``; they are
            shifted by ``self.num_thing_classes`` to index the stuff slots.
        cfg: Config object providing ``pos_weight``.

    Returns:
        tuple[Tensor]: ``(labels, label_weights, mask_targets, mask_weights)``.
        When semantic ground truth is given, ``self.num_stuff_classes`` extra
        rows are appended to each of them.
    """
    num_pos = pos_mask.size(0)
    num_neg = neg_mask.size(0)
    num_samples = num_pos + num_neg
    H, W = pos_mask.shape[-2:]

    # Background uses cat_id == num_classes, foreground cat_ids are in
    # [0, num_classes - 1], so labels start out filled with num_classes.
    labels = pos_mask.new_full((num_samples, ),
                               self.num_classes,
                               dtype=torch.long)
    label_weights = pos_mask.new_zeros((num_samples, self.num_classes))
    mask_targets = pos_mask.new_zeros(num_samples, H, W)
    mask_weights = pos_mask.new_zeros(num_samples, H, W)

    if num_pos > 0:
        labels[pos_inds] = pos_gt_labels
        pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
        label_weights[pos_inds] = pos_weight
        mask_targets[pos_inds, ...] = pos_gt_mask
        mask_weights[pos_inds, ...] = 1

    if num_neg > 0:
        label_weights[neg_inds] = 1.0

    if gt_sem_cls is not None and gt_sem_seg is not None:
        sem_labels = pos_mask.new_full((self.num_stuff_classes, ),
                                       self.num_classes,
                                       dtype=torch.long)
        sem_targets = pos_mask.new_zeros(self.num_stuff_classes, H, W)
        sem_weights = pos_mask.new_zeros(self.num_stuff_classes, H, W)
        # Each stuff slot is only supervised on its own stuff class.
        sem_stuff_weights = torch.eye(
            self.num_stuff_classes, device=pos_mask.device)
        sem_thing_weights = pos_mask.new_zeros(
            (self.num_stuff_classes, self.num_thing_classes))
        sem_label_weights = torch.cat(
            [sem_thing_weights, sem_stuff_weights], dim=-1)
        if len(gt_sem_cls) > 0:
            sem_inds = (gt_sem_cls - self.num_thing_classes).long()
            sem_labels[sem_inds] = gt_sem_cls.long()
            sem_targets[sem_inds] = gt_sem_seg
            sem_weights[sem_inds] = 1

        # Thing proposals are not supervised on the stuff classes.
        label_weights[:, self.num_thing_classes:] = 0
        labels = torch.cat([labels, sem_labels])
        label_weights = torch.cat([label_weights, sem_label_weights])
        mask_targets = torch.cat([mask_targets, sem_targets])
        mask_weights = torch.cat([mask_weights, sem_weights])

    return labels, label_weights, mask_targets, mask_weights


def get_targets(self,
                sampling_results,
                gt_mask,
                gt_labels,
                rcnn_train_cfg,
                concat=True,
                gt_sem_seg=None,
                gt_sem_cls=None):
    """Compute the training targets of every image in the batch.

    Args:
        sampling_results (list): One sampling result per image, each exposing
            ``pos_inds``, ``neg_inds``, ``pos_masks``, ``neg_masks``,
            ``pos_gt_masks`` and ``pos_gt_labels``.
        gt_mask: Not used by this method.
        gt_labels: Not used by this method.
        rcnn_train_cfg: Config forwarded to ``_get_target_single`` as ``cfg``.
        concat (bool): Whether to concatenate the per-image targets along
            dim 0. Defaults to True.
        gt_sem_seg (list | None): Per-image semantic masks.
        gt_sem_cls (list | None): Per-image semantic labels.

    Returns:
        tuple: ``(labels, label_weights, mask_targets, mask_weights)``; each
        entry is a Tensor when ``concat`` is True, otherwise a list with one
        Tensor per image.
    """
    num_imgs = len(sampling_results)
    # One placeholder per image; a fixed-length list would silently drop
    # images because the per-image arguments are zipped together.
    if gt_sem_seg is None:
        gt_sem_seg = [None] * num_imgs
        gt_sem_cls = [None] * num_imgs
    elif gt_sem_cls is None:
        gt_sem_cls = [None] * num_imgs

    per_image_args = zip(
        (res.pos_inds for res in sampling_results),
        (res.neg_inds for res in sampling_results),
        (res.pos_masks for res in sampling_results),
        (res.neg_masks for res in sampling_results),
        (res.pos_gt_masks for res in sampling_results),
        (res.pos_gt_labels for res in sampling_results),
        gt_sem_seg,
        gt_sem_cls)
    per_image_targets = [
        self._get_target_single(*args, cfg=rcnn_train_cfg)
        for args in per_image_args
    ]
    # Transpose [(l, lw, mt, mw), ...] into four per-image lists.
    labels, label_weights, mask_targets, mask_weights = (
        list(column) for column in zip(*per_image_targets))

    if concat:
        labels = torch.cat(labels, 0)
        label_weights = torch.cat(label_weights, 0)
        mask_targets = torch.cat(mask_targets, 0)
        mask_weights = torch.cat(mask_weights, 0)
    return labels, label_weights, mask_targets, mask_weights


def rescale_masks(self, masks_per_img, img_meta):
    """Resize the mask logits of one image back to its original size.

    The logits are passed through a sigmoid, upsampled to
    ``img_meta['batch_input_shape']``, cropped to ``img_meta['img_shape']``
    and finally resized to ``img_meta['ori_shape']``.

    Args:
        masks_per_img (Tensor): Mask logits of one image.
        img_meta (dict): Must contain ``img_shape``, ``batch_input_shape``
            and ``ori_shape``.

    Returns:
        Tensor: Mask probabilities at the original image resolution.
    """
    h, w, _ = img_meta['img_shape']
    probs = masks_per_img.unsqueeze(0).sigmoid()
    probs = F.interpolate(
        probs,
        size=img_meta['batch_input_shape'],
        mode='bilinear',
        align_corners=False)
    # Drop the padded border before going back to the original size.
    probs = probs[:, :, :h, :w]
    ori_shape = img_meta['ori_shape']
    seg_masks = F.interpolate(
        probs, size=ori_shape[:2], mode='bilinear', align_corners=False)
    return seg_masks.squeeze(0)


def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img,
                  test_cfg, img_meta):
    """Rescale and binarize the masks of one image, then format the results.

    Args:
        masks_per_img (Tensor): Mask logits of one image.
        labels_per_img (Tensor): Predicted label of every mask.
        scores_per_img (Tensor): Predicted score of every mask.
        test_cfg: Config object providing ``mask_thr``.
        img_meta (dict): Image meta passed on to ``rescale_masks``.

    Returns:
        tuple: ``(bbox_result, segm_result, mask_preds)`` as produced by
        ``self.segm2result``.
    """
    # Resize the mask predictions back and threshold them.
    seg_masks = self.rescale_masks(masks_per_img, img_meta)
    seg_masks = seg_masks > test_cfg.mask_thr
    return self.segm2result(seg_masks, labels_per_img, scores_per_img)
segm_result[det_labels[idx]].append(mask_preds[idx]) return bboxes, segm_result, mask_preds ================================================ FILE: knet/video/knet.py ================================================ import torch import torch.nn.functional as F from mmdet.models.builder import DETECTORS from mmdet.models.detectors import TwoStageDetector, BaseDetector from mmdet.models.builder import build_head from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes @DETECTORS.register_module() class VideoKNet(TwoStageDetector): def __init__(self, *args, num_thing_classes=80, num_stuff_classes=53, mask_assign_stride=4, ignore_label=255, thing_label_in_seg=0, cityscapes=False, **kwargs): super(VideoKNet, self).__init__(*args, **kwargs) assert self.with_rpn, 'KNet does not support external proposals' self.num_thing_classes = num_thing_classes self.num_stuff_classes = num_stuff_classes self.mask_assign_stride = mask_assign_stride self.thing_label_in_seg = thing_label_in_seg self.ignore_label = ignore_label self.cityscapes = cityscapes # whether to train the cityscape panoptic segmentation def forward_train(self, img, img_metas, gt_bboxes=None, gt_labels=None, gt_bboxes_ignore=None, gt_masks=None, gt_semantic_seg=None, ref_img=None, ref_img_metas=None, ref_gt_bboxes_ignore=None, ref_gt_labels=None, ref_gt_masks=None, ref_gt_semantic_seg=None, proposals=None, **kwargs): """Forward function of SparseR-CNN-like network in train stage. Args: img (Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_metas (list[dict]): list of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see :class:`mmdet.datasets.pipelines.Collect`. gt_bboxes (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. 
gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor): specify which bounding boxes can be ignored when computing the loss. gt_masks (List[Tensor], optional) : Segmentation masks for each box. But we don't support it in this architecture. proposals (List[Tensor], optional): override rpn proposals with custom proposals. Use when `with_rpn` is False. # This is for video only: ref_img (Tensor): of shape (N, 2, C, H, W) encoding input images. Typically these should be mean centered and std scaled. 2 denotes there is two reference images for each input image. ref_img_metas (list[list[dict]]): The first list only has one element. The second list contains reference image information dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. ref_gt_bboxes (list[Tensor]): The list only has one Tensor. The Tensor contains ground truth bboxes for each reference image with shape (num_all_ref_gts, 5) in [ref_img_id, tl_x, tl_y, br_x, br_y] format. The ref_img_id start from 0, and denotes the id of reference image for each key image. ref_gt_labels (list[Tensor]): The list only has one Tensor. The Tensor contains class indices corresponding to each reference box with shape (num_all_ref_gts, 2) in [ref_img_id, class_indice]. 
Returns: dict[str, Tensor]: a dictionary of loss components """ super(TwoStageDetector, self).forward_train(img, img_metas) assert proposals is None, 'KNet does not support' \ ' external proposals' assert gt_masks is not None # gt_masks and gt_semantic_seg are not padded when forming batch gt_masks_tensor = [] gt_sem_seg = [] gt_sem_cls = [] # batch_input_shape shoud be the same across images pad_H, pad_W = img_metas[0]['batch_input_shape'] assign_H = pad_H // self.mask_assign_stride assign_W = pad_W // self.mask_assign_stride for i, gt_mask in enumerate(gt_masks): mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device) if gt_mask.width != pad_W or gt_mask.height != pad_H: pad_wh = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height) mask_tensor = F.pad(mask_tensor, pad_wh, value=0) if gt_semantic_seg is not None: # gt_semantic seg is padded by zero when forming a batch # need to convert them from 0 to ignore gt_semantic_seg[ i, :, img_metas[i]['img_shape'][0]:, :] = self.ignore_label gt_semantic_seg[ i, :, :, img_metas[i]['img_shape'][1]:] = self.ignore_label if self.cityscapes: sem_labels, sem_seg = sem2ins_masks_cityscapes( gt_semantic_seg[i], ignore_label=self.ignore_label, label_shift=self.num_thing_classes) else: sem_labels, sem_seg = sem2ins_masks( gt_semantic_seg[i], ignore_label=self.ignore_label, label_shift=self.num_thing_classes, thing_label_in_seg=self.thing_label_in_seg) if sem_seg.shape[0] == 0: gt_sem_seg.append( mask_tensor.new_zeros( (mask_tensor.size(0), assign_H, assign_W))) else: gt_sem_seg.append( F.interpolate( sem_seg[None], (assign_H, assign_W), mode='bilinear', align_corners=False)[0]) gt_sem_cls.append(sem_labels) else: gt_sem_seg = None gt_sem_cls = None if mask_tensor.shape[0] == 0: gt_masks_tensor.append( mask_tensor.new_zeros( (mask_tensor.size(0), assign_H, assign_W))) else: gt_masks_tensor.append( F.interpolate( mask_tensor[None], (assign_H, assign_W), # downsample to 1/4 resolution mode='bilinear', 
align_corners=False)[0]) gt_masks = gt_masks_tensor x = self.extract_feat(img) rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks, gt_labels, gt_sem_seg, gt_sem_cls) (rpn_losses, proposal_feats, x_feats, mask_preds, cls_scores) = rpn_results losses = self.roi_head.forward_train( x_feats, proposal_feats, mask_preds, cls_scores, img_metas, gt_masks, gt_labels, gt_bboxes_ignore=gt_bboxes_ignore, gt_bboxes=gt_bboxes, gt_sem_seg=gt_sem_seg, gt_sem_cls=gt_sem_cls, imgs_whwh=None) losses.update(rpn_losses) return losses def simple_test(self, img, img_metas, rescale=False): """Test function without test time augmentation. Args: imgs (list[torch.Tensor]): List of multiple images img_metas (list[dict]): List of image information. rescale (bool): Whether to rescale the results. Defaults to False. Returns: list[list[np.ndarray]]: BBox results of each image and classes. The outer list corresponds to each image. The inner list corresponds to each class. """ x = self.extract_feat(img) rpn_results = self.rpn_head.simple_test_rpn(x, img_metas) (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = rpn_results segm_results = self.roi_head.simple_test( x_feats, proposal_feats, mask_preds, cls_scores, img_metas, imgs_whwh=None, rescale=rescale) return segm_results def forward_dummy(self, img): """Used for computing network flops. 
def extract_feat(self, img):
    """Compute features with the backbone and, when present, the neck."""
    feats = self.backbone(img)
    # The neck is only touched when the detector reports having one.
    return self.neck(feats) if self.with_neck else feats
def __init__(self,
             backbone,
             neck=None,
             rpn_head=None,
             roi_head=None,
             track_head=None,
             extra_neck=None,
             track_localization_fpn=None,
             tracker=None,
             train_cfg=None,
             test_cfg=None,
             track_train_cfg=None,
             pretrained=None,
             init_cfg=None,
             num_thing_classes=80,
             num_stuff_classes=53,
             mask_assign_stride=4,
             ignore_label=255,
             thing_label_in_seg=0,
             cityscapes=False,
             kitti_step=False,
             fix_knet=False,
             freeze_detector=False,
             semantic_filter=False,
             **kwargs):
    """Build the detector sub-modules from their config dicts.

    Args:
        backbone: Config passed to ``build_backbone``.
        neck, extra_neck, track_localization_fpn: Optional configs
            passed to ``build_neck``; the attribute is only created
            when the config is given.
        rpn_head, roi_head, track_head: Optional configs passed to
            ``build_head``. ``rpn_head`` and ``roi_head`` receive the
            ``rpn`` / ``rcnn`` parts of ``train_cfg`` and ``test_cfg``.
        tracker: Optional tracker config, stored and only built by
            :meth:`init_tracker`.
        train_cfg, test_cfg: Training / testing configs (may be None).
        track_train_cfg: Config providing ``assigner`` and ``sampler``
            entries for the tracking branch.
        pretrained: Deprecated; emits a warning and is written into
            the backbone (and RoI head) config.
        init_cfg: Forwarded to the base class constructor.
        num_thing_classes, num_stuff_classes, mask_assign_stride,
        ignore_label, thing_label_in_seg, cityscapes, kitti_step,
        semantic_filter: Stored as attributes of the same name.
        fix_knet (bool): Disable gradients of backbone, neck, RPN head
            and RoI head and switch them to eval mode.
        freeze_detector (bool): Call :meth:`_freeze_detector`.
        **kwargs: Accepted and ignored.
    """
    super(VideoKNetQuansiTrack, self).__init__(init_cfg)
    if pretrained:
        warnings.warn('DeprecationWarning: pretrained is deprecated, '
                      'please use "init_cfg" instead')
        backbone.pretrained = pretrained
    self.backbone = build_backbone(backbone)

    if neck is not None:
        self.neck = build_neck(neck)

    if extra_neck is not None:
        self.extra_neck = build_neck(extra_neck)

    if rpn_head is not None:
        rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
        # Guard test_cfg the same way train_cfg is guarded, so a missing
        # test config does not raise AttributeError on ``None.rpn``.
        rpn_test_cfg = test_cfg.rpn if test_cfg is not None else None
        rpn_head_ = rpn_head.copy()
        rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=rpn_test_cfg)
        self.rpn_head = build_head(rpn_head_)

    if roi_head is not None:
        # update train and test cfg here for now
        # TODO: refactor assigner & sampler
        rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
        rcnn_test_cfg = test_cfg.rcnn if test_cfg is not None else None
        # Work on a copy (as done for rpn_head) so the caller's config
        # is not modified as a side effect of building the model.
        roi_head_ = roi_head.copy()
        roi_head_.update(train_cfg=rcnn_train_cfg)
        roi_head_.update(test_cfg=rcnn_test_cfg)
        roi_head_.pretrained = pretrained
        self.roi_head = build_head(roi_head_)

    if track_head is not None:
        self.track_train_cfg = track_train_cfg
        self.track_head = build_head(track_head)
        self.init_track_assigner_sampler()

    if track_localization_fpn is not None:
        self.track_localization_fpn = build_neck(track_localization_fpn)

    if tracker is not None:
        self.tracker_cfg = tracker

    if freeze_detector:
        self._freeze_detector()

    if fix_knet:
        # Freeze every detection component that was actually built;
        # optional parts (e.g. the neck) may be absent.
        for name in ('backbone', 'neck', 'rpn_head', 'roi_head'):
            module = getattr(self, name, None)
            if module is None:
                continue
            for p in module.parameters():
                p.requires_grad_(False)
            module.eval()

    self.train_cfg = train_cfg
    self.test_cfg = test_cfg

    self.num_proposals = self.rpn_head.num_proposals
    self.num_thing_classes = num_thing_classes
    self.num_stuff_classes = num_stuff_classes
    self.mask_assign_stride = mask_assign_stride
    self.thing_label_in_seg = thing_label_in_seg
    self.ignore_label = ignore_label
    self.cityscapes = cityscapes  # whether to train the cityscape panoptic segmentation
    self.kitti_step = kitti_step
    self.semantic_filter = semantic_filter


def init_tracker(self):
    """(Re)build ``self.tracker`` from the stored tracker config."""
    self.tracker = build_tracker(self.tracker_cfg)


def _freeze_detector(self):
    """Put the RPN and RoI heads in eval mode and disable their grads."""
    self.detector = [self.rpn_head, self.roi_head]
    for model in self.detector:
        model.eval()
        for param in model.parameters():
            param.requires_grad = False


def init_track_assigner_sampler(self):
    """Initialize assigner and sampler of the tracking branch."""
    self.track_roi_assigner = build_assigner(
        self.track_train_cfg.assigner)
    self.track_share_assigner = False

    self.track_roi_sampler = build_sampler(
        self.track_train_cfg.sampler, context=self)
    self.track_share_sampler = False
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes=None,
                  gt_labels=None,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  gt_semantic_seg=None,
                  gt_instance_ids=None,
                  ref_img=None,
                  ref_img_metas=None,
                  ref_gt_bboxes_ignore=None,
                  ref_gt_labels=None,
                  ref_gt_bboxes=None,
                  ref_gt_masks=None,
                  ref_gt_semantic_seg=None,
                  ref_gt_instance_ids=None,
                  proposals=None,
                  **kwargs):
    """Compute segmentation and tracking losses for a key/reference pair.

    Args:
        img (Tensor): Key-frame images; the last two dims of ``img[0]``
            are recorded as ``batch_input_shape`` in every meta dict.
        img_metas (list[dict]): Key-frame meta dicts (modified in place
            to add ``batch_input_shape``).
        gt_bboxes, gt_bboxes_ignore: Forwarded to the RoI head.
        gt_labels (list[Tensor]): Class indices per key-frame instance.
        gt_masks: Key-frame instance masks; required.
        gt_semantic_seg: Key-frame semantic map (may be None).
        gt_instance_ids (list[Tensor]): Instance ids per key-frame
            instance; required.
        ref_img (Tensor): Reference images with a singleton dim 1 that
            is squeezed away.
        ref_img_metas (list[list[dict]]): Only element 0 of each inner
            list is used.
        ref_gt_labels (list[Tensor]): Column 1 holds the class index.
        ref_gt_masks (list): Only element 0 of each entry is used.
        ref_gt_semantic_seg (Tensor): Squeezed on dim 1.
        ref_gt_instance_ids (list[Tensor]): Column 1 holds the id.
        ref_gt_bboxes, ref_gt_bboxes_ignore: Unused.
        proposals: Must be None.

    Returns:
        dict[str, Tensor]: RoI-head losses updated with the tracking
        losses and the RPN losses.
    """
    batch_input_shape = tuple(img[0].size()[-2:])
    for img_meta in img_metas:
        img_meta['batch_input_shape'] = batch_input_shape

    assert proposals is None, 'KNet does not support' \
                              ' external proposals'
    assert gt_masks is not None
    assert gt_instance_ids is not None

    # preprocess the reference images
    ref_img = ref_img.squeeze(1)  # (b,3,h,w)

    ref_masks_gt = [ref_gt_mask[0] for ref_gt_mask in ref_gt_masks]
    ref_gt_labels = [
        ref_gt_label[:, 1].long() for ref_gt_label in ref_gt_labels
    ]
    ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)
    ref_gt_instance_id_list = [
        ref_gt_instance_id[:, 1].long()
        for ref_gt_instance_id in ref_gt_instance_ids
    ]

    ref_img_metas_new = []
    for ref_img_meta in ref_img_metas:
        ref_img_meta[0]['batch_input_shape'] = batch_input_shape
        ref_img_metas_new.append(ref_img_meta[0])

    # prepare the gt_match_indices: for every key-frame instance, the
    # position of the same instance id in the reference frame, or -1.
    gt_match_indices = []
    for i, ref_id_tensor in enumerate(ref_gt_instance_id_list):
        ref_ids = ref_id_tensor.cpu().data.numpy().tolist()
        gt_ids = gt_instance_ids[i].cpu().data.numpy().tolist()
        gt_pids = [
            ref_ids.index(gt_id) if gt_id in ref_ids else -1
            for gt_id in gt_ids
        ]
        gt_match_indices.append(
            torch.tensor(gt_pids, dtype=torch.long, device=img.device))

    # gt_masks and gt_semantic_seg are not padded when forming batch
    gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(
        img_metas, gt_masks, gt_labels, gt_semantic_seg)
    ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = \
        self.preprocess_gt_masks(ref_img_metas_new, ref_masks_gt,
                                 ref_gt_labels, ref_semantic_seg_gt)

    x = self.extract_feat(img)

    # Reference features are extracted without gradients and with the
    # backbone in eval mode. Restore whatever mode the backbone was in
    # before (instead of unconditionally forcing train mode), also when
    # feature extraction raises.
    backbone_was_training = self.backbone.training
    self.backbone.eval()
    try:
        with torch.no_grad():
            x_ref = self.extract_feat(ref_img)
    finally:
        self.backbone.train(backbone_was_training)

    rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                              gt_labels, gt_sem_seg,
                                              gt_sem_cls)
    # simple forward to get the reference results
    ref_rpn_results = self.rpn_head.simple_test_rpn(x_ref,
                                                    ref_img_metas_new)

    (rpn_losses, proposal_feats, x_feats, mask_preds,
     cls_scores) = rpn_results

    (ref_proposal_feats, ref_x_feats, ref_mask_preds, ref_cls_scores,
     ref_seg_preds) = ref_rpn_results

    # forward to get the current results
    (losses, object_feats, cls_scores, mask_preds,
     scaled_mask_preds) = self.roi_head.forward_train(
        x_feats,
        proposal_feats,
        mask_preds,
        cls_scores,
        img_metas,
        gt_masks,
        gt_labels,
        gt_bboxes_ignore=gt_bboxes_ignore,
        gt_bboxes=gt_bboxes,
        gt_sem_seg=gt_sem_seg,
        gt_sem_cls=gt_sem_cls,
        imgs_whwh=None)

    # simple forward to get the reference results
    (_, ref_cls_scores, ref_mask_preds,
     ref_scaled_mask_preds) = self.roi_head.simple_test_mask_preds(
        ref_x_feats,
        ref_proposal_feats,
        ref_mask_preds,
        ref_cls_scores,
        ref_img_metas_new,
    )

    # ===== Tracking Part -==== #
    # assign both key frame and reference frame tracking targets
    key_sampling_results, ref_sampling_results = [], []
    num_imgs = len(img_metas)

    for i in range(num_imgs):
        # Only the first ``num_proposals`` predictions and the thing
        # class scores take part in the tracking assignment.
        assign_result = self.track_roi_assigner.assign(
            scaled_mask_preds[i][:self.num_proposals].detach(),
            cls_scores[i][:self.num_proposals,
                          :self.num_thing_classes].detach(),
            gt_masks[i], gt_labels[i], img_meta=img_metas[i])
        sampling_result = self.track_roi_sampler.sample(
            assign_result,
            mask_preds[i][:self.num_proposals].detach(),
            gt_masks[i])
        key_sampling_results.append(sampling_result)

        ref_assign_result = self.track_roi_assigner.assign(
            ref_scaled_mask_preds[i][:self.num_proposals].detach(),
            ref_cls_scores[i][:self.num_proposals,
                              :self.num_thing_classes].detach(),
            ref_gt_masks[i], ref_gt_labels[i],
            img_meta=ref_img_metas_new[i])
        ref_sampling_result = self.track_roi_sampler.sample(
            ref_assign_result,
            ref_mask_preds[i][:self.num_proposals].detach(),
            ref_gt_masks[i])
        ref_sampling_results.append(ref_sampling_result)

    # mask feature embeddings, pooled with the assigned GT masks
    key_masks = [res.pos_gt_masks for res in key_sampling_results]
    key_feats = self._track_forward(x_feats, key_masks)
    ref_masks = [res.pos_gt_masks for res in ref_sampling_results]
    ref_feats = self._track_forward(ref_x_feats, ref_masks)

    match_feats = self.track_head.match(key_feats, ref_feats,
                                        key_sampling_results,
                                        ref_sampling_results)

    asso_targets = self.track_head.get_track_targets(
        gt_match_indices, key_sampling_results, ref_sampling_results)
    loss_track = self.track_head.loss(*match_feats, *asso_targets)

    losses.update(loss_track)
    losses.update(rpn_losses)

    return losses
if self.semantic_filter: seg_preds = torch.nn.functional.interpolate(seg_preds, panoptic_seg.shape, mode='bilinear', align_corners=False) seg_preds = seg_preds.sigmoid() seg_out = seg_preds.argmax(1) semantic_thing = (seg_out < self.num_thing_classes).to(dtype=torch.float32) else: semantic_thing = 1. # get sorted tracking thing ids, labels, masks, score for tracking things_index_for_tracking, things_labels_for_tracking, thing_masks_for_tracking, things_score_for_tracking = \ self.get_things_id_for_tracking(panoptic_seg, segments_info) things_labels_for_tracking = torch.Tensor(things_labels_for_tracking).to(cls_scores.device).long() if len(things_labels_for_tracking) > 0: thing_masks_for_tracking_final = [] for mask in thing_masks_for_tracking: thing_masks_for_tracking_final.append(torch.Tensor(mask).unsqueeze(0).to( x_feats.device).float()) thing_masks_for_tracking_final = torch.cat(thing_masks_for_tracking_final, 0) thing_masks_for_tracking = thing_masks_for_tracking_final thing_masks_for_tracking_scaled = F.interpolate(thing_masks_for_tracking.unsqueeze(0), size=x_track_fea.size()[2:], mode="bilinear", align_corners=False) things_bbox_for_tracking = torch.zeros((len(things_score_for_tracking), 5), dtype=torch.float, device=x_feats.device) things_bbox_for_tracking[:, 4] = torch.tensor(things_score_for_tracking, device=things_bbox_for_tracking.device) thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_scaled * semantic_thing if len(things_labels_for_tracking) == 0: track_feats = None else: # tracking embedding features track_feats = self._track_forward(x_track_fea, thing_masks_for_tracking_with_semantic_filter) if track_feats is not None: # assert len(things_id_for_tracking) == len(things_labels_for_tracking) things_bbox_for_tracking[:, :4] = torch.tensor( tensor_mask2box(thing_masks_for_tracking_with_semantic_filter), device=things_bbox_for_tracking.device) bboxes, labels, ids = self.tracker.match( bboxes=things_bbox_for_tracking, 
labels=things_labels_for_tracking, track_feats=track_feats, frame_id=fid) ids = ids + 1 # hack for unmatched into background ids[ids == -1] = 0 else: ids = [] track_maps = self.generate_track_id_maps(ids, thing_masks_for_tracking, panoptic_seg) return self.get_semantic_seg(panoptic_seg, segments_info), track_maps, None, None, None def _track_forward(self, x, mask_pred): """Track head forward function used in both training and testing. We use mask pooling to get the fine grain features""" track_feats_list = [] for i, masks in enumerate(mask_pred): masks = masks.sigmoid() > 0.5 masks = masks.float().detach() size = x.size()[2:] masks = F.interpolate(masks.unsqueeze(0), size=size, mode="bilinear", align_corners=True).squeeze(0) track_feats = torch.einsum('nhw,chw->nc', masks, x[i]) track_feats = track_feats / (masks.sum(-1).sum(-1) + 1).unsqueeze(-1) track_feats_list.append(track_feats) track_feats = torch.cat(track_feats_list, 0) track_feats = self.track_head(track_feats) return track_feats def forward_dummy(self, img): """Used for computing network flops. 
def extract_feat(self, img):
    """Run the backbone and, if the detector has one, the neck."""
    feats = self.backbone(img)
    if not self.with_neck:
        return feats
    return self.neck(feats)


@property
def with_rpn(self):
    """bool: True when an ``rpn_head`` attribute exists and is not None."""
    return getattr(self, 'rpn_head', None) is not None


@property
def with_roi_head(self):
    """bool: True when a ``roi_head`` attribute exists and is not None."""
    return getattr(self, 'roi_head', None) is not None
def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
    """Collect the thing segments of a panoptic result.

    Args:
        panoptic_seg: Panoptic id map; compared element-wise against
            each segment ``id`` to obtain its mask.
        seg_infos (list[dict]): Segment descriptions. Thing entries
            must provide ``id``, ``instance_id``, ``category_id`` and
            ``score``.

    Returns:
        tuple[list, list, list, list]: instance ids, category ids,
        boolean masks and scores of the thing segments, in input order.
    """
    idxs, labels, masks, score = [], [], [], []
    for segment in seg_infos:
        if not segment['isthing']:
            continue
        masks.append(panoptic_seg == segment["id"])
        idxs.append(segment["instance_id"])
        labels.append(segment['category_id'])
        score.append(segment['score'])
    return idxs, labels, masks, score


def pack_things_object(self, object_feats, ref_object_feats):
    """Drop the two trailing singleton dims and keep the thing queries.

    Both inputs are split along dim 1 into
    ``[roi_head.num_proposals, num_stuff_classes]`` and only the first
    part is returned, so dim 1 must have exactly that total size.
    """
    split_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
    object_feats = object_feats.squeeze(-1).squeeze(-1)
    ref_object_feats = ref_object_feats.squeeze(-1).squeeze(-1)
    thing_object_feats = torch.split(object_feats, split_sizes, dim=1)[0]
    ref_thing_object_feats = torch.split(
        ref_object_feats, split_sizes, dim=1)[0]
    return thing_object_feats, ref_thing_object_feats


def pack_things_masks(self, mask_pred, ref_mask_pred):
    """Keep the thing part of key and reference mask predictions.

    Both inputs are split along dim 1 into
    ``[roi_head.num_proposals, num_stuff_classes]`` and only the first
    part is returned.
    """
    split_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
    thing_mask_pred = torch.split(mask_pred, split_sizes, dim=1)[0]
    ref_thing_mask_pred = torch.split(ref_mask_pred, split_sizes, dim=1)[0]
    return thing_mask_pred, ref_thing_mask_pred


def get_semantic_seg(self, panoptic_seg, segments_info):
    """Convert a panoptic result into a semantic label map.

    Label assignment per segment:

    * thing, ``self.kitti_step``: ``[11, 13][category_id]``
    * thing, otherwise: ``category_id + 11``
    * stuff, ``self.kitti_step``: ``category_id - 1`` shifted upwards so
      that the labels 11 and 13 (used by things) are skipped
    * stuff, otherwise: ``category_id - 1``

    Args:
        panoptic_seg: Panoptic id map.
        segments_info (list[dict]): Entries with ``id``, ``isthing``
            and ``category_id``.

    Returns:
        np.ndarray: Float array with the shape of ``panoptic_seg``;
        pixels not covered by any segment stay 0.
    """
    kitti_thing_labels = [11, 13]
    semantic_seg = np.zeros(panoptic_seg.shape)
    for segment in segments_info:
        category = segment["category_id"]
        if segment['isthing']:
            if self.kitti_step:
                label = kitti_thing_labels[category]
            else:
                label = category + 11
        else:
            # for stuff (0- n-1)
            label = category - 1
            if self.kitti_step:
                # Make room for the thing labels inside the stuff range.
                offset = 0
                for thing_label in kitti_thing_labels:
                    if label + offset >= thing_label:
                        offset += 1
                label += offset
        semantic_seg[panoptic_seg == segment["id"]] = label
    return semantic_seg
def log_masks_for_inference(masks_preds, names, output_dirs="work_dirs/vps/vps_output/thing_masks"):
    """Dump thresholded mask predictions as PNG files for inspection.

    Each entry of ``masks_preds`` is binarised with ``sigmoid() > 0.5``
    and written as ``<output_dirs>/<names>_<index>.png`` with foreground
    pixels set to 255.

    Args:
        masks_preds: Sequence of mask logit tensors; every entry is
            expected to have the shape of the first one.
        names (str): File name prefix.
        output_dirs (str): Target directory; created when missing.
    """
    import os

    # cv2.imwrite does not create missing directories, so make sure the
    # target exists instead of silently writing nothing.
    os.makedirs(output_dirs, exist_ok=True)
    for i, masks in enumerate(masks_preds):
        out_masks = np.zeros(masks_preds[0].shape).astype(np.int16)
        masks = masks.sigmoid() > 0.5
        masks = masks.cpu().numpy()
        out_masks[masks == 1] = 255
        file_name = osp.join(output_dirs, names + "_" + str(i) + ".png")
        cv2.imwrite(file_name, out_masks)
def __init__(self,
             backbone,
             neck=None,
             rpn_head=None,
             roi_head=None,
             track_head=None,
             extra_neck=None,
             track_mhsa=False,
             tracker=None,
             train_cfg=None,
             test_cfg=None,
             track_train_cfg=None,
             pretrained=None,
             init_cfg=None,
             num_thing_classes=80,
             num_stuff_classes=53,
             mask_assign_stride=4,
             ignore_label=255,
             thing_label_in_seg=0,
             detach_mask_emd=False,
             cityscapes=False,
             kitti_step=False,
             cityscapes_short=False,
             freeze_detector=False,
             semantic_filter=True,
             # linking parameters
             link_previous=False,
             bbox_roi_extractor=None,
             **kwargs):
    """Build the detector sub-modules and the query embedding layers.

    Args:
        backbone: Config passed to ``build_backbone``.
        neck, extra_neck: Optional configs passed to ``build_neck``.
        rpn_head, roi_head, track_head: Optional configs passed to
            ``build_head``. ``rpn_head`` and ``roi_head`` receive the
            ``rpn`` / ``rcnn`` parts of ``train_cfg`` and ``test_cfg``.
        bbox_roi_extractor: Optional config passed to
            ``build_roi_extractor`` (stored as ``track_roi_extractor``).
        tracker: Optional tracker config, stored and only built by
            :meth:`init_tracker`.
        train_cfg, test_cfg: Training / testing configs (may be None).
        track_train_cfg: Config providing ``assigner`` and ``sampler``
            entries for the tracking branch.
        pretrained: Deprecated; emits a warning and is written into
            the backbone (and RoI head) config.
        init_cfg: Forwarded to the base class constructor.
        freeze_detector (bool): Call :meth:`_freeze_detector`.
        track_mhsa, num_thing_classes, num_stuff_classes,
        mask_assign_stride, ignore_label, thing_label_in_seg,
        detach_mask_emd, cityscapes, kitti_step, cityscapes_short,
        semantic_filter, link_previous: Stored as attributes of the
            same name.
        **kwargs: Accepted and ignored.
    """
    super(VideoKNetQuansiEmbedFC, self).__init__(init_cfg)
    if pretrained:
        warnings.warn('DeprecationWarning: pretrained is deprecated, '
                      'please use "init_cfg" instead')
        backbone.pretrained = pretrained
    self.backbone = build_backbone(backbone)

    if neck is not None:
        self.neck = build_neck(neck)

    if extra_neck is not None:
        self.extra_neck = build_neck(extra_neck)

    if rpn_head is not None:
        rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
        # Guard test_cfg the same way train_cfg is guarded, so a missing
        # test config does not raise AttributeError on ``None.rpn``.
        rpn_test_cfg = test_cfg.rpn if test_cfg is not None else None
        rpn_head_ = rpn_head.copy()
        rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=rpn_test_cfg)
        self.rpn_head = build_head(rpn_head_)

    if roi_head is not None:
        # update train and test cfg here for now
        # TODO: refactor assigner & sampler
        rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
        rcnn_test_cfg = test_cfg.rcnn if test_cfg is not None else None
        # Work on a copy (as done for rpn_head) so the caller's config
        # is not modified as a side effect of building the model.
        roi_head_ = roi_head.copy()
        roi_head_.update(train_cfg=rcnn_train_cfg)
        roi_head_.update(test_cfg=rcnn_test_cfg)
        roi_head_.pretrained = pretrained
        self.roi_head = build_head(roi_head_)

    if track_head is not None:
        self.track_train_cfg = track_train_cfg
        self.track_head = build_head(track_head)
        self.init_track_assigner_sampler()

    if bbox_roi_extractor is not None:
        self.track_roi_extractor = build_roi_extractor(
            bbox_roi_extractor)

    if tracker is not None:
        self.tracker_cfg = tracker

    if freeze_detector:
        self._freeze_detector()

    self.train_cfg = train_cfg
    self.test_cfg = test_cfg

    self.num_proposals = self.rpn_head.num_proposals
    self.num_thing_classes = num_thing_classes
    self.num_stuff_classes = num_stuff_classes
    self.mask_assign_stride = mask_assign_stride
    self.thing_label_in_seg = thing_label_in_seg
    self.ignore_label = ignore_label
    self.cityscapes = cityscapes  # whether to train the cityscape panoptic segmentation
    self.kitti_step = kitti_step  # whether to train the kitti step panoptic segmentation
    self.cityscapes_short = cityscapes_short  # whether to test with short clips (300)
    self.semantic_filter = semantic_filter
    self.link_previous = link_previous
    self.detach_mask_emd = detach_mask_emd
    self.track_mhsa = track_mhsa

    # add embedding fcs for the final stage queries:
    # (Linear -> LayerNorm -> ReLU) x num_emb_fcs, then a final Linear.
    num_emb_fcs = 1
    act_cfg = dict(type='ReLU', inplace=True)
    in_channels = 256
    out_channels = 256
    self.embed_fcs = nn.ModuleList()
    for _ in range(num_emb_fcs):
        self.embed_fcs.append(
            nn.Linear(in_channels, in_channels, bias=False))
        self.embed_fcs.append(
            build_norm_layer(dict(type='LN'), in_channels)[1])
        self.embed_fcs.append(build_activation_layer(act_cfg))

    self.fc_embed = nn.Linear(in_channels, out_channels)


def init_tracker(self):
    """(Re)build ``self.tracker`` from the stored tracker config."""
    self.tracker = build_tracker(self.tracker_cfg)


def _freeze_detector(self):
    """Put the RPN and RoI heads in eval mode and disable their grads."""
    self.detector = [self.rpn_head, self.roi_head]
    for model in self.detector:
        model.eval()
        for param in model.parameters():
            param.requires_grad = False


def init_track_assigner_sampler(self):
    """Initialize assigner and sampler of the tracking branch."""
    self.track_roi_assigner = build_assigner(
        self.track_train_cfg.assigner)
    self.track_share_assigner = False

    self.track_roi_sampler = build_sampler(
        self.track_train_cfg.sampler, context=self)
    self.track_share_sampler = False
gt_mask.width != pad_W or gt_mask.height != pad_H: pad_wh = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height) mask_tensor = F.pad(mask_tensor, pad_wh, value=0) if gt_semantic_seg is not None: # gt_semantic seg is padded by zero when forming a batch # need to convert them from 0 to ignore gt_semantic_seg[ i, :, img_metas[i]['img_shape'][0]:, :] = self.ignore_label gt_semantic_seg[ i, :, :, img_metas[i]['img_shape'][1]:] = self.ignore_label if self.cityscapes: sem_labels, sem_seg = sem2ins_masks_cityscapes( gt_semantic_seg[i], ignore_label=self.ignore_label, label_shift=self.num_thing_classes, thing_label_in_seg=list(range(self.num_stuff_classes, self.num_thing_classes + self.num_stuff_classes)) ) elif self.kitti_step: sem_labels, sem_seg = sem2ins_masks_kitti_step( gt_semantic_seg[i], ignore_label=self.ignore_label, label_shift=2, thing_label_in_seg=(11, 13)) else: sem_labels, sem_seg = sem2ins_masks( gt_semantic_seg[i], ignore_label=self.ignore_label, label_shift=self.num_thing_classes, thing_label_in_seg=self.thing_label_in_seg) if sem_seg.shape[0] == 0: gt_sem_seg.append( mask_tensor.new_zeros( (mask_tensor.size(0), assign_H, assign_W))) else: gt_sem_seg.append( F.interpolate( sem_seg[None], (assign_H, assign_W), mode='bilinear', align_corners=False)[0]) gt_sem_cls.append(sem_labels) else: gt_sem_seg = None gt_sem_cls = None if mask_tensor.shape[0] == 0: gt_masks_tensor.append( mask_tensor.new_zeros( (mask_tensor.size(0), assign_H, assign_W))) else: gt_masks_tensor.append( F.interpolate( mask_tensor[None], (assign_H, assign_W), # downsample to 1/4 resolution mode='bilinear', align_corners=False)[0]) return gt_masks_tensor, gt_sem_cls, gt_sem_seg def forward_train(self, img, img_metas, gt_bboxes=None, gt_labels=None, gt_bboxes_ignore=None, gt_masks=None, gt_semantic_seg=None, gt_instance_ids=None, ref_img=None, ref_img_metas=None, ref_gt_bboxes_ignore=None, ref_gt_labels=None, ref_gt_bboxes=None, ref_gt_masks=None, ref_gt_semantic_seg=None, 
ref_gt_instance_ids=None, proposals=None, **kwargs): """Forward function of SparseR-CNN-like network in train stage. Args: img (Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_metas (list[dict]): list of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see :class:`mmdet.datasets.pipelines.Collect`. gt_bboxes (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor): specify which bounding boxes can be ignored when computing the loss. gt_masks (List[Tensor], optional) : Segmentation masks for each box. But we don't support it in this architecture. proposals (List[Tensor], optional): override rpn proposals with custom proposals. Use when `with_rpn` is False. # This is for video only: ref_img (Tensor): of shape (N, 2, C, H, W) encoding input images. Typically these should be mean centered and std scaled. 2 denotes there is two reference images for each input image. ref_img_metas (list[list[dict]]): The first list only has one element. The second list contains reference image information dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. ref_gt_bboxes (list[Tensor]): The list only has one Tensor. The Tensor contains ground truth bboxes for each reference image with shape (num_all_ref_gts, 5) in [ref_img_id, tl_x, tl_y, br_x, br_y] format. The ref_img_id start from 0, and denotes the id of reference image for each key image. ref_gt_labels (list[Tensor]): The list only has one Tensor. 
The Tensor contains class indices corresponding to each reference box with shape (num_all_ref_gts, 2) in [ref_img_id, class_indice]. Returns: dict[str, Tensor]: a dictionary of loss components """ batch_input_shape = tuple(img[0].size()[-2:]) for img_meta in img_metas: img_meta['batch_input_shape'] = batch_input_shape assert proposals is None, 'KNet does not support' \ ' external proposals' assert gt_masks is not None assert gt_instance_ids is not None # preprocess the reference images ref_img = ref_img.squeeze(1) # (b,3,h,w) img_h, img_w = batch_input_shape ref_masks_gt = [] for ref_gt_mask in ref_gt_masks: ref_masks_gt.append(ref_gt_mask[0]) ref_labels_gt = [] for ref_gt_label in ref_gt_labels: ref_labels_gt.append(ref_gt_label[:, 1].long()) ref_gt_labels = ref_labels_gt ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1) ref_gt_instance_id_list = [] for ref_gt_instance_id in ref_gt_instance_ids: ref_gt_instance_id_list.append(ref_gt_instance_id[:, 1].long()) ref_img_metas_new = [] for ref_img_meta in ref_img_metas: ref_img_meta[0]['batch_input_shape'] = batch_input_shape ref_img_metas_new.append(ref_img_meta[0]) # prepare the gt_match_indices gt_pids_list = [] for i in range(len(ref_gt_instance_id_list)): ref_ids = ref_gt_instance_id_list[i].cpu().data.numpy().tolist() gt_ids = gt_instance_ids[i].cpu().data.numpy().tolist() gt_pids = [ref_ids.index(i) if i in ref_ids else -1 for i in gt_ids] gt_pids_list.append(torch.LongTensor([gt_pids]).to(img.device)[0]) gt_match_indices = gt_pids_list # gt_masks and gt_semantic_seg are not padded when forming batch gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(img_metas, gt_masks, gt_labels, gt_semantic_seg) ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(ref_img_metas_new, ref_masks_gt, ref_gt_labels, ref_semantic_seg_gt) x = self.extract_feat(img) x_ref = self.extract_feat(ref_img) rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks, gt_labels, gt_sem_seg, gt_sem_cls) # 
simple forward to get the reference results self.rpn_head.eval() ref_rpn_results = self.rpn_head.simple_test_rpn(x_ref, ref_img_metas_new) self.rpn_head.train() (rpn_losses, proposal_feats, x_feats, mask_preds, cls_scores) = rpn_results (ref_proposal_feats, ref_x_feats, ref_mask_preds, ref_cls_scores, ref_seg_preds) = ref_rpn_results ref_obj_feats, ref_cls_scores, ref_mask_preds, ref_scaled_mask_preds = self.roi_head.simple_test_mask_preds( ref_x_feats, ref_proposal_feats, ref_mask_preds, ref_cls_scores, ref_img_metas_new, ) if self.link_previous: losses, object_feats, cls_scores, mask_preds, scaled_mask_preds, object_feats_track = self.roi_head.forward_train_with_previous( x_feats, proposal_feats, mask_preds, cls_scores, img_metas, gt_masks, gt_labels, gt_bboxes_ignore=gt_bboxes_ignore, gt_bboxes=gt_bboxes, gt_sem_seg=gt_sem_seg, gt_sem_cls=gt_sem_cls, imgs_whwh=None, previous_obj_feats=ref_obj_feats, previous_mask_preds=ref_scaled_mask_preds, previous_x_feats=ref_x_feats, ) else: # forward to get the current results losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train( x_feats, proposal_feats, mask_preds, cls_scores, img_metas, gt_masks, gt_labels, gt_bboxes_ignore=gt_bboxes_ignore, gt_bboxes=gt_bboxes, gt_sem_seg=gt_sem_seg, gt_sem_cls=gt_sem_cls, imgs_whwh=None) # ===== Tracking Part -==== # # assign both key frame and reference frame tracking targets key_sampling_results, ref_sampling_results = [], [] num_imgs = len(img_metas) for i in range(num_imgs): assign_result = self.track_roi_assigner.assign( scaled_mask_preds[i][:self.num_proposals].detach(), cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(), gt_masks[i], gt_labels[i], img_meta=img_metas[i]) sampling_result = self.track_roi_sampler.sample( assign_result, mask_preds[i][:self.num_proposals].detach(), gt_masks[i]) key_sampling_results.append(sampling_result) ref_assign_result = self.track_roi_assigner.assign( 
def simple_test(self, img, img_metas, rescale=False, ref_img=None, **kwargs):
    """Run single-frame video panoptic inference and online tracking.

    The frame index is derived from the image id (``id % 10000``). On the
    first frame of a clip the tracker and the cross-frame memories are
    reset. The current frame is then segmented, the "thing" segments are
    embedded and matched against the tracker, and per-pixel semantic and
    track-id maps are produced together with their visualizations.

    Args:
        img (Tensor): input image batch passed to ``extract_feat``.
        img_metas (list[dict]): image meta information. ``img_metas[0]['iid']``
            is read for the plain Cityscapes setting.
        rescale (bool): accepted for interface compatibility; not used here.
        ref_img: accepted for interface compatibility; not used here.
        **kwargs: must contain ``img_id`` (indexable, first element exposing
            ``.item()``) for every setting other than plain Cityscapes.

    Returns:
        tuple: ``(semantic_map, track_maps, None, vis_sem, vis_tracker)``
        where the first two are arrays built by ``get_semantic_seg`` and
        ``generate_track_id_maps`` and the last two are their renderings
        from ``scripts.visualizer``.
    """
    # Determine the frame index inside the clip and whether a new clip starts.
    if self.cityscapes and not self.kitti_step and not self.cityscapes_short:
        iid = img_metas[0]['iid']
        fid = iid % 10000
        is_first = (fid == 1)
    else:
        iid = kwargs['img_id'][0].item()
        fid = iid % 10000
        is_first = (fid == 0)

    if is_first:
        # New clip: drop every piece of state carried over from the last one.
        self.init_tracker()
        self.obj_feats_memory = None
        self.x_feats_memory = None
        self.mask_preds_memory = None

    # Current-frame features and initial kernel predictions.
    x = self.extract_feat(img)
    rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
    (proposal_feats, x_feats, mask_preds, cls_scores,
     seg_preds) = rpn_results

    if self.link_previous:
        (cur_segm_results, obj_feats, cls_scores, mask_preds,
         scaled_mask_preds) = self.roi_head.simple_test_with_previous(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            previous_obj_feats=self.obj_feats_memory,
            previous_mask_preds=self.mask_preds_memory,
            previous_x_feats=self.x_feats_memory,
            is_first=is_first,
        )
        # Remember this frame's outputs for linking with the next frame.
        self.obj_feats_memory = obj_feats
        self.x_feats_memory = x_feats
        self.mask_preds_memory = scaled_mask_preds
    else:
        (cur_segm_results, query_output, cls_scores, mask_preds,
         scaled_mask_preds) = self.roi_head.simple_test(
            x_feats, proposal_feats, mask_preds, cls_scores, img_metas)

    # Only the first image of the batch is processed further.
    _, segm_result, mask_preds, panoptic_result, query_output = \
        cur_segm_results[0]
    panoptic_seg, segments_info = panoptic_result

    # Instance ids, labels, masks and scores of the "thing" segments.
    (things_index_for_tracking, things_labels_for_tracking,
     thing_masks_for_tracking, things_score_for_tracking) = \
        self.get_things_id_for_tracking(panoptic_seg, segments_info)
    things_labels_for_tracking = torch.Tensor(
        things_labels_for_tracking).to(cls_scores.device).long()

    # Optional semantic filter: keep only pixels whose semantic argmax is
    # below ``num_thing_classes``.
    if self.semantic_filter:
        seg_preds = torch.nn.functional.interpolate(
            seg_preds, panoptic_seg.shape, mode='bilinear',
            align_corners=False)
        seg_preds = seg_preds.sigmoid()
        seg_out = seg_preds.argmax(1)
        semantic_thing = (seg_out < self.num_thing_classes).to(
            dtype=torch.float32)
    else:
        semantic_thing = 1.

    if len(things_labels_for_tracking) > 0:
        # Rows are (x1, y1, x2, y2, score); the box is filled in below.
        things_bbox_for_tracking = torch.zeros(
            (len(things_score_for_tracking), 5),
            dtype=torch.float, device=x_feats.device)
        things_bbox_for_tracking[:, 4] = torch.tensor(
            things_score_for_tracking,
            device=things_bbox_for_tracking.device)

        thing_masks_for_tracking = torch.cat(
            [torch.Tensor(mask).unsqueeze(0).to(x_feats.device).float()
             for mask in thing_masks_for_tracking], 0)
        thing_masks_for_tracking_with_semantic_filter = \
            thing_masks_for_tracking * semantic_thing

        # Tracking embeddings: (n, d, 1, 1) -> (1, n, d) -> (n, d_out).
        N, _, _, _ = query_output.shape
        emb_feat = query_output.squeeze(-2).squeeze(-1).unsqueeze(0)
        for emb_layer in self.embed_fcs:
            emb_feat = emb_layer(emb_feat)
        object_feats_embed = self.fc_embed(emb_feat).view(1, N, -1)
        track_feats = self._track_forward([object_feats_embed.squeeze(0)])
    else:
        track_feats = None

    if track_feats is not None:
        things_bbox_for_tracking[:, :4] = torch.tensor(
            tensor_mask2box(thing_masks_for_tracking_with_semantic_filter),
            device=things_bbox_for_tracking.device)
        bboxes, labels, ids = self.tracker.match(
            bboxes=things_bbox_for_tracking,
            labels=things_labels_for_tracking,
            track_feats=track_feats,
            frame_id=fid)
        # Shift ids by one so that 0 can serve as "no track" in the id map.
        ids = ids + 1
        ids[ids == -1] = 0
    else:
        ids = []

    track_maps = self.generate_track_id_maps(
        ids, thing_masks_for_tracking, panoptic_seg)
    semantic_map = self.get_semantic_seg(panoptic_seg, segments_info)

    # Imported lazily so the visualizer is only required at inference time.
    from scripts.visualizer import trackmap2rgb, cityscapes_cat2rgb, draw_bbox_on_img
    vis_tracker = trackmap2rgb(track_maps)
    vis_sem = cityscapes_cat2rgb(semantic_map)
    if len(things_labels_for_tracking):
        vis_tracker = draw_bbox_on_img(
            vis_tracker, things_bbox_for_tracking.cpu().numpy())

    return semantic_map, track_maps, None, vis_sem, vis_tracker
def extract_feat(self, img):
    """Extract features with the backbone, then the neck if one exists."""
    feats = self.backbone(img)
    if self.with_neck:
        feats = self.neck(feats)
    return feats


@property
def with_rpn(self):
    """bool: whether the detector has a non-None ``rpn_head``."""
    return getattr(self, 'rpn_head', None) is not None


@property
def with_roi_head(self):
    """bool: whether the detector has a non-None ``roi_head``."""
    return getattr(self, 'roi_head', None) is not None


def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    """Test with augmentations.

    Placeholder only: test-time augmentation is not implemented and the
    method returns ``None``.
    """
    pass


def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
    """Collect the "thing" segments of a panoptic result.

    Args:
        panoptic_seg: per-pixel segment-id map; compared element-wise
            against each segment's ``"id"``.
        seg_infos (list[dict]): segment records. Records with a truthy
            ``'isthing'`` must also provide ``'id'``, ``'instance_id'``,
            ``'category_id'`` and ``'score'``.

    Returns:
        tuple[list, list, list, list]: instance ids, category ids, boolean
        masks and scores of the thing segments, in input order.
    """
    idxs, labels, masks, score = [], [], [], []
    for segment in seg_infos:
        if not segment['isthing']:
            continue
        masks.append(panoptic_seg == segment["id"])
        idxs.append(segment["instance_id"])
        labels.append(segment['category_id'])
        score.append(segment['score'])
    return idxs, labels, masks, score


def pack_things_object(self, object_feats, ref_object_feats):
    """Keep only the thing-query part of key and reference query features.

    Both inputs have their two trailing axes squeezed and are then split
    along dim 1 into ``[roi_head.num_proposals, num_stuff_classes]``; the
    first chunk of each is returned.
    """
    split_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
    object_feats = object_feats.squeeze(-1).squeeze(-1)
    ref_object_feats = ref_object_feats.squeeze(-1).squeeze(-1)
    thing_object_feats = torch.split(object_feats, split_sizes, dim=1)[0]
    ref_thing_object_feats = torch.split(
        ref_object_feats, split_sizes, dim=1)[0]
    return thing_object_feats, ref_thing_object_feats


def pack_things_masks(self, mask_pred, ref_mask_pred):
    """Keep only the thing-query part of key and reference mask predictions.

    Each input is split along dim 1 into
    ``[roi_head.num_proposals, num_stuff_classes]`` and the first chunk is
    returned.
    """
    split_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
    thing_mask_pred = torch.split(mask_pred, split_sizes, dim=1)[0]
    ref_thing_mask_pred = torch.split(ref_mask_pred, split_sizes, dim=1)[0]
    return thing_mask_pred, ref_thing_mask_pred


def get_semantic_seg(self, panoptic_seg, segments_info):
    """Convert a panoptic result into a per-pixel semantic label map.

    Args:
        panoptic_seg: per-pixel segment-id map.
        segments_info (list[dict]): segment records with ``'id'``,
            ``'isthing'`` and ``'category_id'``.

    Returns:
        np.ndarray: float array shaped like ``panoptic_seg``. Pixels not
        covered by any listed segment stay 0.
    """
    # Label ids that the two KITTI-STEP thing classes occupy in the output.
    kitti_step2cityscpaes = [11, 13]
    semantic_seg = np.zeros(panoptic_seg.shape)
    for segment in segments_info:
        category = segment["category_id"]
        if segment['isthing']:
            if self.kitti_step:
                label = kitti_step2cityscpaes[category]
            else:
                # Things are placed after the stuff classes.
                label = category + self.num_stuff_classes
        else:
            # Stuff categories are 1-based in the panoptic result.
            label = category - 1
            if self.kitti_step:
                # Skip over the label slots reserved for the thing classes.
                offset = 0
                for thing_id in kitti_step2cityscpaes:
                    if label + offset >= thing_id:
                        offset += 1
                label += offset
        semantic_seg[panoptic_seg == segment["id"]] = label
    return semantic_seg
def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
    """Paint per-instance track ids into a map shaped like the panoptic map.

    Args:
        ids: sequence of track ids, one per instance mask. An empty
            sequence yields an all-zero map.
        masks: stacked instance masks exposing ``.bool()`` (a tensor);
            only accessed when ``ids`` is non-empty.
        panopitc_seg_maps: panoptic map; only its ``shape`` is used.

    Returns:
        np.ndarray: float array where the pixels of mask ``i`` hold
        ``ids[i]``. Where masks overlap, the later instance wins.
    """
    final_id_maps = np.zeros(panopitc_seg_maps.shape)
    if len(ids) == 0:
        return final_id_maps

    # Move all masks to host memory once instead of once per instance.
    masks_np = masks.bool().cpu().numpy()
    for i, track_id in enumerate(ids):
        final_id_maps[masks_np[i]] = track_id
    return final_id_maps
""" def __init__(self, backbone, neck=None, rpn_head=None, roi_head=None, track_head=None, extra_neck=None, track_localization_fpn=None, tracker=None, train_cfg=None, test_cfg=None, track_train_cfg=None, pretrained=None, init_cfg=None, num_thing_classes=80, num_stuff_classes=53, mask_assign_stride=4, ignore_label=255, thing_label_in_seg=0, detach_mask_emd=False, cityscapes=False, kitti_step=False, cityscapes_short=False, vipseg=False, freeze_detector=False, semantic_filter=True, # linking parameters link_previous=False, bbox_roi_extractor=None, **kwargs): super(VideoKNetQuansiEmbedFCJointTrain, self).__init__(init_cfg) if pretrained: warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') backbone.pretrained = pretrained self.backbone = build_backbone(backbone) if neck is not None: self.neck = build_neck(neck) if extra_neck is not None: self.extra_neck = build_neck(extra_neck) if rpn_head is not None: rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None rpn_head_ = rpn_head.copy() rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) self.rpn_head = build_head(rpn_head_) if roi_head is not None: # update train and test cfg here for now # TODO: refactor assigner & sampler rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None roi_head.update(train_cfg=rcnn_train_cfg) roi_head.update(test_cfg=test_cfg.rcnn) roi_head.pretrained = pretrained self.roi_head = build_head(roi_head) if track_head is not None: self.track_train_cfg = track_train_cfg self.track_head = build_head(track_head) self.init_track_assigner_sampler() if track_localization_fpn is not None: self.track_localization_fpn = build_neck(track_localization_fpn) if bbox_roi_extractor is not None: self.track_roi_extractor = build_roi_extractor( bbox_roi_extractor) if tracker is not None: self.tracker_cfg = tracker if freeze_detector: self._freeze_detector() self.train_cfg = train_cfg self.test_cfg = test_cfg self.num_proposals = 
self.rpn_head.num_proposals self.num_thing_classes = num_thing_classes self.num_stuff_classes = num_stuff_classes self.mask_assign_stride = mask_assign_stride self.thing_label_in_seg = thing_label_in_seg self.ignore_label = ignore_label self.cityscapes = cityscapes # whether to train the cityscape panoptic segmentation self.kitti_step = kitti_step # whether to train the kitti step panoptic segmentation self.cityscapes_short = cityscapes_short # whether to test the cityscape short panoptic segmentation self.vipseg = vipseg # whether to test the vip panoptic segmentation self.semantic_filter = semantic_filter self.link_previous = link_previous self.detach_mask_emd = detach_mask_emd # add embedding fcs for the final stage queries num_emb_fcs = 1 act_cfg = dict(type='ReLU', inplace=True) in_channels = 256 out_channels = 256 self.embed_fcs = nn.ModuleList() for _ in range(num_emb_fcs): self.embed_fcs.append( nn.Linear(in_channels, in_channels, bias=False)) self.embed_fcs.append( build_norm_layer(dict(type='LN'), in_channels)[1]) self.embed_fcs.append(build_activation_layer(act_cfg)) self.fc_embed = nn.Linear(in_channels, out_channels) def init_tracker(self): self.tracker = build_tracker(self.tracker_cfg) def _freeze_detector(self): self.detector = [ self.rpn_head, self.roi_head ] for model in self.detector: model.eval() for param in model.parameters(): param.requires_grad = False def init_track_assigner_sampler(self): """Initialize assigner and sampler.""" self.track_roi_assigner = build_assigner( self.track_train_cfg.assigner) self.track_share_assigner = False self.track_roi_sampler = build_sampler( self.track_train_cfg.sampler, context=self) self.track_share_sampler = False def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg): # gt_masks and gt_semantic_seg are not padded when forming batch gt_masks_tensor = [] gt_sem_seg = [] gt_sem_cls = [] # batch_input_shape shoud be the same across images pad_H, pad_W = img_metas[0]['batch_input_shape'] 
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes=None,
                  gt_labels=None,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  gt_semantic_seg=None,
                  gt_instance_ids=None,
                  ref_img=None,
                  ref_img_metas=None,
                  ref_gt_bboxes_ignore=None,
                  ref_gt_labels=None,
                  ref_gt_bboxes=None,
                  ref_gt_masks=None,
                  ref_gt_semantic_seg=None,
                  ref_gt_instance_ids=None,
                  proposals=None,
                  **kwargs):
    """Jointly train segmentation on a key/reference pair plus tracking.

    Both frames go through the backbone, the kernel (RPN) head and the
    iterative RoI head with their own ground truth, so segmentation losses
    are produced for both. The final-stage queries of both frames are then
    embedded and fed to the track head to compute the association loss.

    Args:
        img (Tensor): key-frame images; the last two axes give the padded
            batch shape written into every meta dict.
        img_metas (list[dict]): key-frame meta dicts (modified in place:
            ``'batch_input_shape'`` is added).
        gt_bboxes, gt_bboxes_ignore: forwarded unchanged to the RoI head.
        gt_labels (list[Tensor]): key-frame instance labels.
        gt_masks (list): key-frame instance masks; required.
        gt_semantic_seg (Tensor): key-frame semantic map.
        gt_instance_ids (list[Tensor]): key-frame instance ids; required.
        ref_img (Tensor): reference images with a singleton axis 1 that is
            squeezed away — presumably (N, 1, C, H, W); verify against the
            data pipeline.
        ref_img_metas (list[list[dict]]): per-sample list whose first
            element is the reference meta dict.
        ref_gt_labels (list[Tensor]): column 1 holds the class index.
        ref_gt_masks (list): per-sample sequence whose first element is the
            reference mask container.
        ref_gt_semantic_seg (Tensor): reference semantic map with a
            singleton axis 1.
        ref_gt_instance_ids (list[Tensor]): column 1 holds the instance id.
        ref_gt_bboxes, ref_gt_bboxes_ignore: accepted but unused.
        proposals: must be ``None``; external proposals are not supported.

    Returns:
        dict[str, Tensor]: key-frame, reference-frame and tracking losses.
    """
    batch_input_shape = tuple(img[0].size()[-2:])
    for img_meta in img_metas:
        img_meta['batch_input_shape'] = batch_input_shape

    assert proposals is None, 'KNet does not support external proposals'
    assert gt_masks is not None
    assert gt_instance_ids is not None

    # ---- unpack the reference-frame inputs ---- #
    ref_img = ref_img.squeeze(1)
    ref_masks_gt = [ref_gt_mask[0] for ref_gt_mask in ref_gt_masks]
    ref_gt_labels = [
        ref_gt_label[:, 1].long() for ref_gt_label in ref_gt_labels
    ]
    ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)
    ref_gt_instance_id_list = [
        ref_ids[:, 1].long() for ref_ids in ref_gt_instance_ids
    ]
    ref_img_metas_new = []
    for ref_img_meta in ref_img_metas:
        ref_img_meta[0]['batch_input_shape'] = batch_input_shape
        ref_img_metas_new.append(ref_img_meta[0])

    # For every key instance: index of the same instance id among the
    # reference instances (first occurrence), or -1 when it is absent.
    gt_match_indices = []
    for ref_ids_t, key_ids_t in zip(ref_gt_instance_id_list,
                                    gt_instance_ids):
        first_pos = {}
        for pos, ref_id in enumerate(ref_ids_t.cpu().tolist()):
            first_pos.setdefault(ref_id, pos)
        gt_pids = [first_pos.get(k, -1) for k in key_ids_t.cpu().tolist()]
        gt_match_indices.append(
            torch.tensor(gt_pids, dtype=torch.long, device=img.device))

    # gt_masks and gt_semantic_seg are not padded when forming the batch.
    gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(
        img_metas, gt_masks, gt_labels, gt_semantic_seg)
    ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(
        ref_img_metas_new, ref_masks_gt, ref_gt_labels, ref_semantic_seg_gt)

    # ---- segmentation on both frames ---- #
    x = self.extract_feat(img)
    x_ref = self.extract_feat(ref_img)

    rpn_results = self.rpn_head.forward_train(
        x, img_metas, gt_masks, gt_labels, gt_sem_seg, gt_sem_cls)
    ref_rpn_results = self.rpn_head.forward_train(
        x_ref, ref_img_metas_new, ref_gt_masks, ref_gt_labels,
        ref_gt_sem_seg, ref_gt_sem_cls)

    (rpn_losses, proposal_feats, x_feats, mask_preds,
     cls_scores) = rpn_results
    (ref_rpn_losses, ref_proposal_feats, ref_x_feats, ref_mask_preds,
     ref_cls_scores) = ref_rpn_results

    # The flattened meta list is passed so that the reference call sees the
    # same list[dict] layout as the key-frame call.
    (losses_ref, ref_obj_feats, ref_cls_scores, ref_mask_preds,
     ref_scaled_mask_preds) = self.roi_head.forward_train(
        ref_x_feats,
        ref_proposal_feats,
        ref_mask_preds,
        ref_cls_scores,
        ref_img_metas_new,
        ref_gt_masks,
        ref_gt_labels,
        gt_sem_seg=ref_gt_sem_seg,
        gt_sem_cls=ref_gt_sem_cls,
        imgs_whwh=None)

    if self.link_previous:
        (losses, object_feats, cls_scores, mask_preds, scaled_mask_preds,
         object_feats_track) = self.roi_head.forward_train_with_previous(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None,
            previous_obj_feats=ref_obj_feats,
            previous_mask_preds=ref_scaled_mask_preds,
            previous_x_feats=ref_x_feats,
        )
    else:
        (losses, object_feats, cls_scores, mask_preds,
         scaled_mask_preds) = self.roi_head.forward_train(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None)
        # Without linking there are no separate tracking queries; use the
        # segmentation queries so the tracking branch below is well defined.
        object_feats_track = object_feats

    # ---- tracking: assign key and reference predictions to GT ---- #
    key_sampling_results, ref_sampling_results = [], []
    for i in range(len(img_metas)):
        assign_result = self.track_roi_assigner.assign(
            scaled_mask_preds[i][:self.num_proposals].detach(),
            cls_scores[i][:self.num_proposals,
                          :self.num_thing_classes].detach(),
            gt_masks[i],
            gt_labels[i],
            img_meta=img_metas[i])
        key_sampling_results.append(
            self.track_roi_sampler.sample(
                assign_result,
                mask_preds[i][:self.num_proposals].detach(),
                gt_masks[i]))

        ref_assign_result = self.track_roi_assigner.assign(
            ref_scaled_mask_preds[i][:self.num_proposals].detach(),
            ref_cls_scores[i][:self.num_proposals,
                              :self.num_thing_classes].detach(),
            ref_gt_masks[i],
            ref_gt_labels[i],
            img_meta=ref_img_metas_new[i])
        ref_sampling_results.append(
            self.track_roi_sampler.sample(
                ref_assign_result,
                ref_mask_preds[i][:self.num_proposals].detach(),
                ref_gt_masks[i]))

    # ---- tracking: embed the thing queries of both frames ---- #
    N, _, _, _, _ = object_feats_track.shape

    def embed_queries(query_feats):
        # Drop the two trailing singleton axes and keep the thing queries.
        emb = query_feats.squeeze(-2).squeeze(-1)[:, :self.num_proposals]
        for emb_layer in self.embed_fcs:
            emb = emb_layer(emb)
        return self.fc_embed(emb).view(N, self.num_proposals, -1)

    object_feats_embed = embed_queries(object_feats_track)
    ref_object_feats_embed = embed_queries(ref_obj_feats)

    # Gather the embeddings of the positive (GT-matched) queries per image.
    key_feats = self._track_forward([
        object_feats_embed[i, res.pos_inds]
        for i, res in enumerate(key_sampling_results)
    ])
    ref_feats = self._track_forward([
        ref_object_feats_embed[i, res.pos_inds]
        for i, res in enumerate(ref_sampling_results)
    ])

    match_feats = self.track_head.match(
        key_feats, ref_feats, key_sampling_results, ref_sampling_results)
    asso_targets = self.track_head.get_track_targets(
        gt_match_indices, key_sampling_results, ref_sampling_results)
    loss_track = self.track_head.loss(*match_feats, *asso_targets)

    # NOTE(review): add_ref_loss / add_ref_rpn_loss are defined outside this
    # view; presumably they rename keys so the reference losses do not
    # collide with the key-frame ones — confirm.
    ref_losses = self.add_ref_loss(losses_ref)
    ref_rpn_losses = self.add_ref_rpn_loss(ref_rpn_losses)

    losses.update(ref_rpn_losses)
    losses.update(rpn_losses)
    losses.update(ref_losses)
    losses.update(loss_track)
    return losses
mask_preds, scaled_mask_preds = self.roi_head.simple_test( x_feats, proposal_feats, mask_preds, cls_scores, img_metas) # for tracking part _, segm_result, mask_preds, panoptic_result, query_output = cur_segm_results[0] panoptic_seg, segments_info = panoptic_result # get sorted tracking thing ids, labels, masks, score for tracking things_index_for_tracking, things_labels_for_tracking, thing_masks_for_tracking, things_score_for_tracking = \ self.get_things_id_for_tracking(panoptic_seg, segments_info) things_labels_for_tracking = torch.Tensor(things_labels_for_tracking).to(cls_scores.device).long() # get the semantic filter if self.semantic_filter: seg_preds = torch.nn.functional.interpolate(seg_preds, panoptic_seg.shape, mode='bilinear', align_corners=False) seg_preds = seg_preds.sigmoid() seg_out = seg_preds.argmax(1) semantic_thing = (seg_out < self.num_thing_classes).to(dtype=torch.float32) else: semantic_thing = 1. if len(things_labels_for_tracking) > 0: things_bbox_for_tracking = torch.zeros((len(things_score_for_tracking), 5), dtype=torch.float, device=x_feats.device) things_bbox_for_tracking[:, 4] = torch.tensor(things_score_for_tracking, device=things_bbox_for_tracking.device) thing_masks_for_tracking_final = [] for mask in thing_masks_for_tracking: thing_masks_for_tracking_final.append(torch.Tensor(mask).unsqueeze(0).to( x_feats.device).float()) thing_masks_for_tracking_final = torch.cat(thing_masks_for_tracking_final, 0) thing_masks_for_tracking = thing_masks_for_tracking_final thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_final * semantic_thing if len(things_labels_for_tracking) == 0: track_feats = None else: # tracking embeddings N, _, _, _ = query_output.shape emb_feat = query_output.squeeze(-2).squeeze(-1).unsqueeze(0) # (n,d,1,1) -> (1,n,d) for emb_layer in self.embed_fcs: emb_feat = emb_layer(emb_feat) object_feats_embed = self.fc_embed(emb_feat).view(1, N, -1) object_feats_embed_for_tracking = object_feats_embed.squeeze(0) 
def forward_dummy(self, img, img_metas=None):
    """Run a metadata-free forward pass, e.g. for FLOPs counting.

    See `mmdetection/tools/get_flops.py`.

    Args:
        img: Batch of input images; ``len(img)`` decides how many
            placeholder meta dicts are generated.
        img_metas: Accepted for interface compatibility and ignored;
            placeholder metas are always used instead.

    Returns:
        Whatever ``self.roi_head.simple_test_mask_preds`` returns.
    """
    feats = self.extract_feat(img)

    # One independent placeholder dict per image (never a shared object).
    placeholder_metas = [{'img_shape': (0, 0, 3)} for _ in range(len(img))]

    # The RPN stage yields exactly five items; the semantic prediction is
    # not needed for the dummy pass.
    proposal_feats, x_feats, mask_preds, cls_scores, _seg_preds = \
        self.rpn_head.simple_test_rpn(feats, placeholder_metas)

    return self.roi_head.simple_test_mask_preds(
        x_feats, proposal_feats, mask_preds, cls_scores, placeholder_metas)
""" pass def get_things_id_for_tracking(self, panoptic_seg, seg_infos): idxs = [] labels = [] masks = [] score = [] for segment in seg_infos: if segment['isthing'] == True: thing_mask = panoptic_seg == segment["id"] masks.append(thing_mask) idxs.append(segment["instance_id"]) labels.append(segment['category_id']) score.append(segment['score']) return idxs, labels, masks, score def pack_things_object(self, object_feats, ref_object_feats): object_feats, ref_object_feats = object_feats.squeeze(-1).squeeze(-1), ref_object_feats.squeeze(-1).squeeze(-1) thing_object_feats = torch.split(object_feats, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] ref_thing_object_feats = torch.split(ref_object_feats, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] return thing_object_feats, ref_thing_object_feats def pack_things_masks(self, mask_pred, ref_mask_pred): thing_mask_pred = torch.split(mask_pred, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] ref_thing_thing_mask_pred = torch.split(ref_mask_pred, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] return thing_mask_pred, ref_thing_thing_mask_pred def get_semantic_seg(self, panoptic_seg, segments_info): kitti_step2cityscpaes = [11, 13] semantic_seg = np.zeros(panoptic_seg.shape) for segment in segments_info: if segment['isthing'] == True: # for things if self.kitti_step: cat_cur = kitti_step2cityscpaes[segment["category_id"]] semantic_seg[panoptic_seg == segment["id"]] = cat_cur else: # city and vip_seg semantic_seg[panoptic_seg == segment["id"]] = segment["category_id"] + self.num_stuff_classes else: # for stuff (0 - n-1) if self.kitti_step: cat_cur = segment["category_id"] cat_cur -= 1 offset = 0 for thing_id in kitti_step2cityscpaes: if cat_cur + offset >= thing_id: offset += 1 cat_cur += offset semantic_seg[panoptic_seg == segment["id"]] = cat_cur else: # city and vip_seg semantic_seg[panoptic_seg == segment["id"]] = segment["category_id"] - 1 return 
def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
    """Paint every tracked instance's id onto an initially empty map.

    Args:
        ids: Track ids, one per instance (tensor or sequence); ``ids[i]``
            belongs to ``masks[i]``.
        masks: Stacked instance masks exposing ``.bool()``; only touched
            when ``ids`` is non-empty.
        panopitc_seg_maps: Only its ``.shape`` is used, to size the output.

    Returns:
        numpy.ndarray: Float array of that shape. Pixels not covered by any
        mask stay 0; where masks overlap, the later instance wins.
    """
    final_id_maps = np.zeros(panopitc_seg_maps.shape)
    # ``len`` rather than truthiness: ``ids`` may be a multi-element tensor.
    if len(ids) == 0:
        return final_id_maps

    # Convert the ids to plain Python numbers once, so the numpy assignment
    # below does not depend on which device the id tensor lives on.
    if torch.is_tensor(ids):
        ids = ids.detach().cpu().tolist()

    # Move all masks to host memory in one transfer instead of one per id.
    mask_arrays = masks.bool().cpu().numpy()
    for index, track_id in enumerate(ids):
        final_id_maps[mask_arrays[index]] = track_id
    return final_id_maps
""" def __init__(self, backbone, neck=None, rpn_head=None, roi_head=None, track_head=None, extra_neck=None, track_localization_fpn=None, track_mhsa=False, tracker=None, train_cfg=None, test_cfg=None, track_train_cfg=None, pretrained=None, init_cfg=None, num_thing_classes=80, num_stuff_classes=53, mask_assign_stride=4, ignore_label=255, thing_label_in_seg=0, detach_mask_emd=False, cityscapes=False, kitti_step=False, freeze_detector=False, semantic_filter=True, link_previous=False, bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict( type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), **kwargs): super(VideoKNetQuansiEmbedFCToy, self).__init__(init_cfg) if pretrained: warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') backbone.pretrained = pretrained self.backbone = build_backbone(backbone) if neck is not None: self.neck = build_neck(neck) if extra_neck is not None: self.extra_neck = build_neck(extra_neck) if rpn_head is not None: rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None rpn_head_ = rpn_head.copy() rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) self.rpn_head = build_head(rpn_head_) if roi_head is not None: # update train and test cfg here for now # TODO: refactor assigner & sampler rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None roi_head.update(train_cfg=rcnn_train_cfg) roi_head.update(test_cfg=test_cfg.rcnn) roi_head.pretrained = pretrained self.roi_head = build_head(roi_head) if track_head is not None: self.track_train_cfg = track_train_cfg self.track_head = build_head(track_head) self.init_track_assigner_sampler() if track_localization_fpn is not None: self.track_localization_fpn = build_neck(track_localization_fpn) if bbox_roi_extractor is not None: self.track_roi_extractor = build_roi_extractor( bbox_roi_extractor) if tracker is not None: self.tracker_cfg = tracker if freeze_detector: 
self._freeze_detector() self.train_cfg = train_cfg self.test_cfg = test_cfg self.num_proposals = self.rpn_head.num_proposals self.num_thing_classes = num_thing_classes self.num_stuff_classes = num_stuff_classes self.mask_assign_stride = mask_assign_stride self.thing_label_in_seg = thing_label_in_seg self.ignore_label = ignore_label self.cityscapes = cityscapes # whether to train the cityscape panoptic segmentation self.kitti_step = kitti_step # whether to train the kitti step panoptic segmentation self.semantic_filter = semantic_filter self.link_previous = link_previous self.detach_mask_emd = detach_mask_emd self.track_mhsa = track_mhsa # add embedding fcs for the final stage queries # num_emb_fcs = 1 # act_cfg = dict(type='ReLU', inplace=True) # in_channels = 256 # out_channels = 256 # self.embed_fcs = nn.ModuleList() # for _ in range(num_emb_fcs): # self.embed_fcs.append( # nn.Linear(in_channels, in_channels, bias=False)) # self.embed_fcs.append( # build_norm_layer(dict(type='LN'), in_channels)[1]) # self.embed_fcs.append(build_activation_layer(act_cfg)) # # self.fc_embed = nn.Linear(in_channels, out_channels) def init_tracker(self): self.tracker = build_tracker(self.tracker_cfg) def _freeze_detector(self): self.detector = [ self.rpn_head, self.roi_head ] for model in self.detector: model.eval() for param in model.parameters(): param.requires_grad = False def init_track_assigner_sampler(self): """Initialize assigner and sampler.""" self.track_roi_assigner = build_assigner( self.track_train_cfg.assigner) self.track_share_assigner = False self.track_roi_sampler = build_sampler( self.track_train_cfg.sampler, context=self) self.track_share_sampler = False def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg): # gt_masks and gt_semantic_seg are not padded when forming batch gt_masks_tensor = [] gt_sem_seg = [] gt_sem_cls = [] # batch_input_shape shoud be the same across images pad_H, pad_W = img_metas[0]['batch_input_shape'] assign_H = pad_H 
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes=None,
                  gt_labels=None,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  gt_semantic_seg=None,
                  gt_instance_ids=None,
                  ref_img=None,
                  ref_img_metas=None,
                  ref_gt_bboxes_ignore=None,
                  ref_gt_labels=None,
                  ref_gt_bboxes=None,
                  ref_gt_masks=None,
                  ref_gt_semantic_seg=None,
                  ref_gt_instance_ids=None,
                  proposals=None,
                  **kwargs):
    """Compute segmentation and tracking losses for key/reference frames.

    The key frame goes through the training path of the RPN and RoI heads.
    The reference frame goes through their test path and only provides
    features and mask predictions for the association loss (and, when
    ``self.link_previous`` is set, for the RoI head itself).

    Args:
        img (Tensor): Key-frame batch; ``img[0].size()[-2:]`` is taken as
            the padded batch shape.
        img_metas (list[dict]): Per-image meta dicts; the key
            ``batch_input_shape`` is written into each of them.
        gt_bboxes, gt_bboxes_ignore: Forwarded to the RoI head unchanged.
        gt_labels (list[Tensor]): Key-frame class indices.
        gt_masks: Key-frame instance masks; required.
        gt_semantic_seg: Key-frame semantic map, forwarded to
            ``preprocess_gt_masks``.
        gt_instance_ids (list[Tensor]): Key-frame instance ids; required.
        ref_img (Tensor): Reference frames; dim 1 is squeezed, so a single
            reference frame per key frame is assumed -- TODO confirm.
        ref_img_metas (list[list[dict]]): Only element 0 of each inner list
            is used; ``batch_input_shape`` is written into it.
        ref_gt_labels (list[Tensor]): Two-column tensors; column 1 is used
            as the class index.
        ref_gt_masks: Only element 0 of each entry is used.
        ref_gt_semantic_seg: Reference semantic map; dim 1 is squeezed.
        ref_gt_instance_ids (list[Tensor]): Two-column tensors; column 1 is
            used as the instance id.
        ref_gt_bboxes, ref_gt_bboxes_ignore, **kwargs: Unused.
        proposals: Must be ``None``; external proposals are not supported.

    Returns:
        dict[str, Tensor]: The RoI head losses, updated with the track head
        losses and the RPN losses.
    """
    batch_input_shape = tuple(img[0].size()[-2:])
    for img_meta in img_metas:
        img_meta['batch_input_shape'] = batch_input_shape

    assert proposals is None, 'KNet does not support' \
                              ' external proposals'
    assert gt_masks is not None
    assert gt_instance_ids is not None

    # ---- unpack the reference-frame inputs ----
    ref_img = ref_img.squeeze(1)  # (b,3,h,w)
    ref_masks_gt = [ref_gt_mask[0] for ref_gt_mask in ref_gt_masks]
    ref_gt_labels = [ref_gt_label[:, 1].long() for ref_gt_label in ref_gt_labels]
    ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)
    ref_gt_instance_id_list = [
        ref_gt_instance_id[:, 1].long()
        for ref_gt_instance_id in ref_gt_instance_ids
    ]

    ref_img_metas_new = []
    for ref_img_meta in ref_img_metas:
        ref_img_meta[0]['batch_input_shape'] = batch_input_shape
        ref_img_metas_new.append(ref_img_meta[0])

    # ---- gt_match_indices: for every key-frame instance, the position of
    # the same instance id in the reference frame, or -1 if it is absent ----
    gt_match_indices = []
    for i in range(len(ref_gt_instance_id_list)):
        ref_ids = ref_gt_instance_id_list[i].cpu().tolist()
        gt_ids = gt_instance_ids[i].cpu().tolist()
        gt_pids = [ref_ids.index(gt_id) if gt_id in ref_ids else -1
                   for gt_id in gt_ids]
        gt_match_indices.append(
            torch.LongTensor([gt_pids]).to(img.device)[0])

    # gt_masks and gt_semantic_seg are not padded when forming batch
    gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(
        img_metas, gt_masks, gt_labels, gt_semantic_seg)
    ref_gt_masks, _, _ = self.preprocess_gt_masks(
        ref_img_metas_new, ref_masks_gt, ref_gt_labels, ref_semantic_seg_gt)

    x = self.extract_feat(img)
    x_ref = self.extract_feat(ref_img)

    rpn_results = self.rpn_head.forward_train(
        x, img_metas, gt_masks, gt_labels, gt_sem_seg, gt_sem_cls)

    # Reference frame: run the RPN head through its test path in eval mode,
    # then restore whatever mode it was in (not unconditionally train mode).
    rpn_was_training = self.rpn_head.training
    self.rpn_head.eval()
    try:
        ref_rpn_results = self.rpn_head.simple_test_rpn(
            x_ref, ref_img_metas_new)
    finally:
        self.rpn_head.train(rpn_was_training)

    (rpn_losses, proposal_feats, x_feats, mask_preds,
     cls_scores) = rpn_results
    (ref_proposal_feats, ref_x_feats, ref_mask_preds, ref_cls_scores,
     _ref_seg_preds) = ref_rpn_results

    ref_obj_feats, ref_cls_scores, ref_mask_preds, ref_scaled_mask_preds = \
        self.roi_head.simple_test_mask_preds(
            ref_x_feats,
            ref_proposal_feats,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas_new,
        )

    if self.link_previous:
        # The RoI head additionally sees the reference-frame results and
        # returns dedicated features for tracking.
        (losses, object_feats, cls_scores, mask_preds, scaled_mask_preds,
         object_feats_track) = self.roi_head.forward_train_with_previous(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None,
            previous_obj_feats=ref_obj_feats,
            previous_mask_preds=ref_scaled_mask_preds,
            previous_x_feats=ref_x_feats,
        )
    else:
        # forward to get the current results
        (losses, object_feats, cls_scores, mask_preds,
         scaled_mask_preds) = self.roi_head.forward_train(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None)

    # ===== Tracking part ===== #
    # Assign tracking targets on both frames. Only the first
    # ``num_proposals`` predictions (and the thing-class score columns)
    # take part.
    key_sampling_results, ref_sampling_results = [], []
    num_imgs = len(img_metas)
    for i in range(num_imgs):
        assign_result = self.track_roi_assigner.assign(
            scaled_mask_preds[i][:self.num_proposals].detach(),
            cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
            gt_masks[i],
            gt_labels[i],
            img_meta=img_metas[i])
        sampling_result = self.track_roi_sampler.sample(
            assign_result,
            mask_preds[i][:self.num_proposals].detach(),
            gt_masks[i])
        key_sampling_results.append(sampling_result)

        ref_assign_result = self.track_roi_assigner.assign(
            ref_scaled_mask_preds[i][:self.num_proposals].detach(),
            ref_cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
            ref_gt_masks[i],
            ref_gt_labels[i],
            img_meta=ref_img_metas_new[i])
        ref_sampling_result = self.track_roi_sampler.sample(
            ref_assign_result,
            ref_mask_preds[i][:self.num_proposals].detach(),
            ref_gt_masks[i])
        ref_sampling_results.append(ref_sampling_result)

    if self.detach_mask_emd:
        # Keep the tracking loss from back-propagating into the kernels.
        object_feats = object_feats.detach()
        ref_obj_feats = ref_obj_feats.detach()

    if self.link_previous:
        object_feats = object_feats_track

    # The 5-way unpack doubles as a check that the features are 5-D.
    N, _, _, _, _ = object_feats.shape

    # NOTE(review): ``self.embed_fcs`` and ``self.fc_embed`` are read here,
    # but the constructor of this class has their creation commented out.
    # Confirm they are defined before training with this class.
    emb_feat = object_feats.squeeze(-2).squeeze(-1)[:, :self.num_proposals, ]
    for emb_layer in self.embed_fcs:
        emb_feat = emb_layer(emb_feat)
    object_feats_embed = self.fc_embed(emb_feat).view(
        N, self.num_proposals, -1)

    ref_emb_feat = ref_obj_feats.squeeze(-2).squeeze(-1)[:, :self.num_proposals, ]
    for emb_layer in self.embed_fcs:
        ref_emb_feat = emb_layer(ref_emb_feat)
    ref_object_feats_embed = self.fc_embed(ref_emb_feat).view(
        N, self.num_proposals, -1)

    # Gather the embeddings of the positive (GT-matched) proposals of each
    # image. Index image ``i`` explicitly: selecting the whole batch
    # dimension with image i's indices is only correct for batch size 1.
    object_feats_embed_list = [
        object_feats_embed[i, res.pos_inds, :]
        for i, res in enumerate(key_sampling_results)
    ]
    key_feats = self._track_forward(object_feats_embed_list)

    ref_object_feats_embed_list = [
        ref_object_feats_embed[i, res.pos_inds, :]
        for i, res in enumerate(ref_sampling_results)
    ]
    ref_feats = self._track_forward(ref_object_feats_embed_list)

    match_feats = self.track_head.match(
        key_feats, ref_feats, key_sampling_results, ref_sampling_results)
    asso_targets = self.track_head.get_track_targets(
        gt_match_indices, key_sampling_results, ref_sampling_results)
    loss_track = self.track_head.loss(*match_feats, *asso_targets)

    losses.update(loss_track)
    losses.update(rpn_losses)
    return losses
seg_preds.sigmoid() seg_out = seg_preds.argmax(1) semantic_thing = (seg_out < self.num_thing_classes).to(dtype=torch.float32) else: semantic_thing = 1. if len(things_labels_for_tracking) > 0: things_bbox_for_tracking = torch.zeros((len(things_score_for_tracking), 5), dtype=torch.float, device=x_feats.device) things_bbox_for_tracking[:, 4] = torch.tensor(things_score_for_tracking, device=things_bbox_for_tracking.device) thing_masks_for_tracking_final = [] for mask in thing_masks_for_tracking: thing_masks_for_tracking_final.append(torch.Tensor(mask).unsqueeze(0).to( x_feats.device).float()) thing_masks_for_tracking_final = torch.cat(thing_masks_for_tracking_final, 0) thing_masks_for_tracking = thing_masks_for_tracking_final thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_final * semantic_thing if len(things_labels_for_tracking) == 0: track_feats = None else: # tracking embeddings N, _, _, _ = query_output.shape emb_feat = query_output.squeeze(-2).squeeze(-1).unsqueeze(0) # (n,d,1,1) -> (1,n,d) # for emb_layer in self.embed_fcs: # emb_feat = emb_layer(emb_feat) # object_feats_embed = self.fc_embed(emb_feat).view(1, N, -1) track_feats = emb_feat.squeeze(0) # tracking embedding features # track_feats = self._track_forward([object_feats_embed_for_tracking]) if track_feats is not None: things_bbox_for_tracking[:, :4] = torch.tensor(tensor_mask2box(thing_masks_for_tracking_with_semantic_filter), device=things_bbox_for_tracking.device) bboxes, labels, ids = self.tracker.match( bboxes=things_bbox_for_tracking, labels=things_labels_for_tracking, track_feats=track_feats, frame_id=fid) ids = ids + 1 ids[ids == -1] = 0 else: ids = [] track_maps = self.generate_track_id_maps(ids, thing_masks_for_tracking, panoptic_seg) semantic_map = self.get_semantic_seg(panoptic_seg, segments_info) from scripts.visualizer import trackmap2rgb, cityscapes_cat2rgb, draw_bbox_on_img vis_tracker = trackmap2rgb(track_maps) vis_sem = cityscapes_cat2rgb(semantic_map) if 
def extract_feat(self, img):
    """Compute image features with the backbone, then the neck if present."""
    feats = self.backbone(img)
    return self.neck(feats) if self.with_neck else feats
""" pass def get_things_id_for_tracking(self, panoptic_seg, seg_infos): idxs = [] labels = [] masks = [] score = [] for segment in seg_infos: if segment['isthing'] == True: thing_mask = panoptic_seg == segment["id"] masks.append(thing_mask) idxs.append(segment["instance_id"]) labels.append(segment['category_id']) score.append(segment['score']) return idxs, labels, masks, score def pack_things_object(self, object_feats, ref_object_feats): object_feats, ref_object_feats = object_feats.squeeze(-1).squeeze(-1), ref_object_feats.squeeze(-1).squeeze(-1) thing_object_feats = torch.split(object_feats, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] ref_thing_object_feats = torch.split(ref_object_feats, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] return thing_object_feats, ref_thing_object_feats def pack_things_masks(self, mask_pred, ref_mask_pred): thing_mask_pred = torch.split(mask_pred, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] ref_thing_thing_mask_pred = torch.split(ref_mask_pred, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] return thing_mask_pred, ref_thing_thing_mask_pred def get_semantic_seg(self, panoptic_seg, segments_info): results = {} masks = [] scores = [] kitti_step2cityscpaes = [11, 13] semantic_seg = np.zeros(panoptic_seg.shape) for segment in segments_info: if segment['isthing'] == True: if self.kitti_step: cat_cur = kitti_step2cityscpaes[segment["category_id"]] semantic_seg[panoptic_seg == segment["id"]] = cat_cur else: semantic_seg[panoptic_seg == segment["id"]] = segment["category_id"] + self.num_stuff_classes else: # for stuff (0- n-1) if self.kitti_step: cat_cur = segment["category_id"] cat_cur -= 1 offset = 0 for thing_id in kitti_step2cityscpaes: if cat_cur + offset >= thing_id: offset += 1 cat_cur += offset semantic_seg[panoptic_seg == segment["id"]] = cat_cur else: semantic_seg[panoptic_seg == segment["id"]] = segment["category_id"] - 1 return semantic_seg def 
generate_track_id_maps(self, ids, masks, panopitc_seg_maps): final_id_maps = np.zeros(panopitc_seg_maps.shape) if len(ids) == 0: return final_id_maps # assert len(things_mask_results) == len(track_results) masks = masks.bool() for i, id in enumerate(ids): mask = masks[i].cpu().numpy() final_id_maps[mask] = id return final_id_maps ================================================ FILE: knet/video/knet_quansi_dense_roi_gt_box.py ================================================ import warnings import numpy as np import torch import torch.nn.functional as F from mmcv.cnn import ConvModule from mmdet.models.builder import DETECTORS from mmdet.models.detectors import BaseDetector from mmdet.models.builder import build_head, build_neck, build_backbone, build_roi_extractor from mmdet.core import build_assigner, build_sampler from knet.video.qdtrack.builder import build_tracker from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes, sem2ins_masks_kitti_step from unitrack.mask import tensor_mask2box from unitrack.utils.mask import mask2box, batch_mask2boxlist, bboxlist2roi @DETECTORS.register_module() class VideoKNetQuansiTrackROIGTBox(BaseDetector): """ Simple Extension of KNet to Video KNet by the implementation of VPSFuse Net. 
""" def __init__(self, backbone, neck=None, rpn_head=None, roi_head=None, track_head=None, extra_neck=None, track_localization_fpn=None, tracker=None, train_cfg=None, test_cfg=None, track_train_cfg=None, pretrained=None, init_cfg=None, num_thing_classes=80, num_stuff_classes=53, mask_assign_stride=4, ignore_label=255, thing_label_in_seg=0, cityscapes=False, kitti_step=False, freeze_detector=False, semantic_filter=False, # linking parameters link_previous=False, bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict( type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), **kwargs): super(VideoKNetQuansiTrackROIGTBox, self).__init__(init_cfg) if pretrained: warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') backbone.pretrained = pretrained self.backbone = build_backbone(backbone) if neck is not None: self.neck = build_neck(neck) if extra_neck is not None: self.extra_neck = build_neck(extra_neck) if rpn_head is not None: rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None rpn_head_ = rpn_head.copy() rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) self.rpn_head = build_head(rpn_head_) if roi_head is not None: # update train and test cfg here for now # TODO: refactor assigner & sampler rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None roi_head.update(train_cfg=rcnn_train_cfg) roi_head.update(test_cfg=test_cfg.rcnn) roi_head.pretrained = pretrained self.roi_head = build_head(roi_head) if track_head is not None: self.track_train_cfg = track_train_cfg self.track_head = build_head(track_head) self.init_track_assigner_sampler() if track_localization_fpn is not None: self.track_localization_fpn = build_neck(track_localization_fpn) self.track_roi_extractor = build_roi_extractor( bbox_roi_extractor) if tracker is not None: self.tracker_cfg = tracker if freeze_detector: self._freeze_detector() self.train_cfg = train_cfg 
def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
    """Pad and downsample ground-truth masks to the mask-assignment size.

    Instance masks are zero-padded to the batch input shape and resized
    bilinearly to that shape divided by ``self.mask_assign_stride``. When a
    semantic map is given, its padded border is overwritten IN PLACE with
    ``self.ignore_label`` and it is split into per-class masks that are
    resized the same way.

    Args:
        img_metas (list[dict]): ``img_metas[0]['batch_input_shape']`` gives
            the padded size; ``img_shape`` is read per image only when a
            semantic map is present.
        gt_masks: Per-image mask containers exposing ``to_tensor``,
            ``width`` and ``height``.
        gt_labels (list[Tensor]): Only ``gt_labels[0].device`` is used.
        gt_semantic_seg: Batched semantic map indexed as ``[i, :, h, w]``,
            or ``None``.

    Returns:
        tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``. The last two
        are ``None`` when ``gt_semantic_seg`` is ``None`` and at least one
        image was processed.
    """
    # batch_input_shape should be the same across images
    batch_h, batch_w = img_metas[0]['batch_input_shape']
    target_size = (batch_h // self.mask_assign_stride,
                   batch_w // self.mask_assign_stride)

    def _shrink(stack):
        # downsample to 1/stride resolution
        return F.interpolate(
            stack[None], target_size, mode='bilinear',
            align_corners=False)[0]

    resized_masks, sem_masks, sem_classes = [], [], []
    for idx, bitmap in enumerate(gt_masks):
        inst_masks = bitmap.to_tensor(torch.float, gt_labels[0].device)

        # gt masks are not padded when forming the batch
        if not (bitmap.width == batch_w and bitmap.height == batch_h):
            padding = (0, batch_w - bitmap.width, 0, batch_h - bitmap.height)
            inst_masks = F.pad(inst_masks, padding, value=0)

        if gt_semantic_seg is None:
            sem_masks = None
            sem_classes = None
        else:
            # The semantic map was zero-padded when the batch was formed;
            # mark that border as "ignore" instead of class 0.
            valid_h = img_metas[idx]['img_shape'][0]
            valid_w = img_metas[idx]['img_shape'][1]
            gt_semantic_seg[idx, :, valid_h:, :] = self.ignore_label
            gt_semantic_seg[idx, :, :, valid_w:] = self.ignore_label

            if self.cityscapes:
                labels_i, masks_i = sem2ins_masks_cityscapes(
                    gt_semantic_seg[idx],
                    ignore_label=self.ignore_label,
                    label_shift=self.num_thing_classes,
                    thing_label_in_seg=list(
                        range(self.num_stuff_classes,
                              self.num_thing_classes + self.num_stuff_classes)))
            elif self.kitti_step:
                labels_i, masks_i = sem2ins_masks_kitti_step(
                    gt_semantic_seg[idx],
                    ignore_label=self.ignore_label,
                    label_shift=2,
                    thing_label_in_seg=(11, 13))
            else:
                labels_i, masks_i = sem2ins_masks(
                    gt_semantic_seg[idx],
                    ignore_label=self.ignore_label,
                    label_shift=self.num_thing_classes,
                    thing_label_in_seg=self.thing_label_in_seg)

            if masks_i.shape[0] == 0:
                # NOTE(review): the placeholder takes its first dimension
                # from the instance masks, not from the (empty) semantic
                # masks -- confirm this is intended.
                sem_masks.append(
                    inst_masks.new_zeros((inst_masks.size(0),) + target_size))
            else:
                sem_masks.append(_shrink(masks_i))
            sem_classes.append(labels_i)

        if inst_masks.shape[0] == 0:
            resized_masks.append(
                inst_masks.new_zeros((inst_masks.size(0),) + target_size))
        else:
            resized_masks.append(_shrink(inst_masks))

    return resized_masks, sem_classes, sem_masks
For details on the values of these keys see :class:`mmdet.datasets.pipelines.Collect`. gt_bboxes (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor): specify which bounding boxes can be ignored when computing the loss. gt_masks (List[Tensor], optional) : Segmentation masks for each box. But we don't support it in this architecture. proposals (List[Tensor], optional): override rpn proposals with custom proposals. Use when `with_rpn` is False. # This is for video only: ref_img (Tensor): of shape (N, 2, C, H, W) encoding input images. Typically these should be mean centered and std scaled. 2 denotes there is two reference images for each input image. ref_img_metas (list[list[dict]]): The first list only has one element. The second list contains reference image information dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. ref_gt_bboxes (list[Tensor]): The list only has one Tensor. The Tensor contains ground truth bboxes for each reference image with shape (num_all_ref_gts, 5) in [ref_img_id, tl_x, tl_y, br_x, br_y] format. The ref_img_id start from 0, and denotes the id of reference image for each key image. ref_gt_labels (list[Tensor]): The list only has one Tensor. The Tensor contains class indices corresponding to each reference box with shape (num_all_ref_gts, 2) in [ref_img_id, class_indice]. 
Returns: dict[str, Tensor]: a dictionary of loss components """ batch_input_shape = tuple(img[0].size()[-2:]) for img_meta in img_metas: img_meta['batch_input_shape'] = batch_input_shape assert proposals is None, 'KNet does not support' \ ' external proposals' assert gt_masks is not None assert gt_instance_ids is not None # preprocess the reference images ref_img = ref_img.squeeze(1) # (b,3,h,w) img_h, img_w = batch_input_shape ref_masks_gt = [] for ref_gt_mask in ref_gt_masks: ref_masks_gt.append(ref_gt_mask[0]) ref_labels_gt = [] for ref_gt_label in ref_gt_labels: ref_labels_gt.append(ref_gt_label[:, 1].long()) ref_gt_labels = ref_labels_gt ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1) ref_gt_instance_id_list = [] for ref_gt_instance_id in ref_gt_instance_ids: ref_gt_instance_id_list.append(ref_gt_instance_id[:,1].long()) ref_img_metas_new = [] for ref_img_meta in ref_img_metas: ref_img_meta[0]['batch_input_shape'] = batch_input_shape ref_img_metas_new.append(ref_img_meta[0]) # prepare the gt_match_indices gt_pids_list = [] for i in range(len(ref_gt_instance_id_list)): ref_ids = ref_gt_instance_id_list[i].cpu().data.numpy().tolist() gt_ids = gt_instance_ids[i].cpu().data.numpy().tolist() gt_pids = [ref_ids.index(i) if i in ref_ids else -1 for i in gt_ids] gt_pids_list.append(torch.LongTensor([gt_pids]).to(img.device)[0]) gt_match_indices = gt_pids_list # gt_masks and gt_semantic_seg are not padded when forming batch gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(img_metas, gt_masks, gt_labels, gt_semantic_seg) ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(ref_img_metas_new, ref_masks_gt, ref_gt_labels, ref_semantic_seg_gt) x = self.extract_feat(img) x_ref = self.extract_feat(ref_img) rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks, gt_labels, gt_sem_seg, gt_sem_cls) # simple forward to get the reference results self.rpn_head.eval() ref_rpn_results = self.rpn_head.simple_test_rpn(x_ref, 
ref_img_metas_new) self.rpn_head.train() (rpn_losses, proposal_feats, x_feats, mask_preds, cls_scores) = rpn_results (ref_proposal_feats, ref_x_feats, ref_mask_preds, ref_cls_scores, ref_seg_preds) = ref_rpn_results ref_obj_feats, ref_cls_scores, ref_mask_preds, ref_scaled_mask_preds = self.roi_head.simple_test_mask_preds( ref_x_feats, ref_proposal_feats, ref_mask_preds, ref_cls_scores, ref_img_metas_new, ) if self.link_previous: losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train_with_previous( x_feats, proposal_feats, mask_preds, cls_scores, img_metas, gt_masks, gt_labels, gt_bboxes_ignore=gt_bboxes_ignore, gt_bboxes=gt_bboxes, gt_sem_seg=gt_sem_seg, gt_sem_cls=gt_sem_cls, imgs_whwh=None, previous_obj_feats=ref_obj_feats, previous_mask_preds=ref_scaled_mask_preds, previous_x_feats=ref_x_feats, ) else: # forward to get the current results losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train( x_feats, proposal_feats, mask_preds, cls_scores, img_metas, gt_masks, gt_labels, gt_bboxes_ignore=gt_bboxes_ignore, gt_bboxes=gt_bboxes, gt_sem_seg=gt_sem_seg, gt_sem_cls=gt_sem_cls, imgs_whwh=None) # ===== Tracking Part -==== # # assign both key frame and reference frame tracking targets key_sampling_results, ref_sampling_results = [], [] num_imgs = len(img_metas) for i in range(num_imgs): assign_result = self.track_roi_assigner.assign( scaled_mask_preds[i][:self.num_proposals].detach(), cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(), gt_masks[i], gt_labels[i], img_meta=img_metas[i]) sampling_result = self.track_roi_sampler.sample( assign_result, mask_preds[i][:self.num_proposals].detach(), gt_masks[i]) key_sampling_results.append(sampling_result) ref_assign_result = self.track_roi_assigner.assign( ref_scaled_mask_preds[i][:self.num_proposals].detach(), ref_cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(), ref_gt_masks[i], ref_gt_labels[i], 
img_meta=ref_img_metas_new[i]) ref_sampling_result = self.track_roi_sampler.sample( ref_assign_result, ref_mask_preds[i][:self.num_proposals].detach(), ref_gt_masks[i]) ref_sampling_results.append(ref_sampling_result) # roi feature embeddings key_masks = [res.pos_gt_masks for res in key_sampling_results] for i in range(len(key_masks)): key_masks[i] = F.interpolate(key_masks[i].unsqueeze(0), size=(img_h, img_w), mode="bilinear", align_corners=False).squeeze(0) key_masks[i] = (key_masks[i].sigmoid() > 0.5).float() key_feats = self._track_forward(x, key_masks) # roi feature embeddings ref_masks = [res.pos_gt_masks for res in ref_sampling_results] for i in range(len(ref_masks)): ref_masks[i] = F.interpolate(ref_masks[i].unsqueeze(0), size=(img_h, img_w), mode="bilinear", align_corners=False).squeeze(0) ref_masks[i] = (ref_masks[i].sigmoid() > 0.5).float() ref_feats = self._track_forward(x_ref, ref_masks) match_feats = self.track_head.match(key_feats, ref_feats, key_sampling_results, ref_sampling_results) asso_targets = self.track_head.get_track_targets( gt_match_indices, key_sampling_results, ref_sampling_results) loss_track = self.track_head.loss(*match_feats, *asso_targets) losses.update(loss_track) losses.update(rpn_losses) return losses def simple_test(self, img, img_metas, rescale=False, ref_img=None, **kwargs): """Test function without test time augmentation. Args: imgs (list[torch.Tensor]): List of multiple images img_metas (list[dict]): List of image information. rescale (bool): Whether to rescale the results. Defaults to False. Returns: list[list[np.ndarray]]: BBox results of each image and classes. The outer list corresponds to each image. The inner list corresponds to each class. 
""" # whether is the first frame for such clips # assert 'city' in img_metas[0]['filename'] and 'iid' in img_metas[0] if "city" in img_metas[0]['filename']: iid = img_metas[0]['iid'] fid = iid % 10000 is_first = (fid == 1) elif "motchallenge" in img_metas[0]['filename']: iid = kwargs['img_id'][0].item() fid = iid % 10000 is_first = (fid == 1) if is_first: print("First detected on {}".format(fid)) else: iid = kwargs['img_id'][0].item() fid = iid % 10000 is_first = (fid == 0) if is_first: self.init_tracker() self.obj_feats_memory = None self.x_feats_memory = None self.mask_preds_memory = None # for current frame x = self.extract_feat(img) # current frame inference rpn_results = self.rpn_head.simple_test_rpn(x, img_metas) (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = rpn_results if self.link_previous: cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test_with_previous( x_feats, proposal_feats, mask_preds, cls_scores, img_metas, previous_obj_feats=self.obj_feats_memory, previous_mask_preds=self.mask_preds_memory, previous_x_feats=self.x_feats_memory, ) self.obj_feats_memory = query_output self.x_feats_memory = x_feats self.mask_preds_memory = scaled_mask_preds else: cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test( x_feats, proposal_feats, mask_preds, cls_scores, img_metas) # for tracking part _, segm_result, mask_preds, panoptic_result, _ = cur_segm_results[0] panoptic_seg, segments_info = panoptic_result if self.semantic_filter: seg_preds = torch.nn.functional.interpolate(seg_preds, panoptic_seg.shape, mode='bilinear', align_corners=False) seg_preds = seg_preds.sigmoid() seg_out = seg_preds.argmax(1) semantic_thing = (seg_out < self.num_thing_classes).to(dtype=torch.float32) else: semantic_thing = 1. 
# get sorted tracking thing ids, labels, masks, score for tracking things_index_for_tracking, things_labels_for_tracking, thing_masks_for_tracking, things_score_for_tracking = \ self.get_things_id_for_tracking(panoptic_seg, segments_info) things_labels_for_tracking = torch.Tensor(things_labels_for_tracking).to(cls_scores.device).long() if len(things_labels_for_tracking) > 0: things_bbox_for_tracking = torch.zeros((len(things_score_for_tracking), 5), dtype=torch.float, device=x_feats.device) things_bbox_for_tracking[:, 4] = torch.tensor(things_score_for_tracking, device=things_bbox_for_tracking.device) thing_masks_for_tracking_final = [] for mask in thing_masks_for_tracking: thing_masks_for_tracking_final.append(torch.Tensor(mask).unsqueeze(0).to( x_feats.device).float()) thing_masks_for_tracking_final = torch.cat(thing_masks_for_tracking_final, 0) thing_masks_for_tracking = thing_masks_for_tracking_final thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_final * semantic_thing if len(things_labels_for_tracking) == 0: track_feats = None else: # tracking embedding features track_feats = self._track_forward(x, thing_masks_for_tracking_with_semantic_filter) if track_feats is not None: # assert len(things_id_for_tracking) == len(things_labels_for_tracking) things_bbox_for_tracking[:, :4] = torch.tensor(tensor_mask2box(thing_masks_for_tracking_with_semantic_filter), device=things_bbox_for_tracking.device) bboxes, labels, ids = self.tracker.match( bboxes=things_bbox_for_tracking, labels=things_labels_for_tracking, track_feats=track_feats, frame_id=fid) ids = ids + 1 ids[ids == -1] = 0 else: ids = [] track_maps = self.generate_track_id_maps(ids, thing_masks_for_tracking, panoptic_seg) semantic_map = self.get_semantic_seg(panoptic_seg, segments_info) from scripts.visualizer import trackmap2rgb, cityscapes_cat2rgb, draw_bbox_on_img vis_tracker = trackmap2rgb(track_maps) vis_sem = cityscapes_cat2rgb(semantic_map) if len(things_labels_for_tracking): 
vis_tracker = draw_bbox_on_img(vis_tracker, things_bbox_for_tracking.cpu().numpy()) # Visualization end return semantic_map, track_maps, None, vis_sem, vis_tracker def _track_forward(self, x, mask_pred): """Track head forward function used in both training and testing. We use mask pooling to get the fine grain features""" if not self.training: mask_pred = [mask_pred] bbox_list = batch_mask2boxlist(mask_pred) track_rois = bboxlist2roi(bbox_list) track_rois = track_rois.clamp(min=0.0) track_feats = self.track_roi_extractor(x[:self.track_roi_extractor.num_inputs], track_rois) track_feats = self.track_head(track_feats) return track_feats def forward_dummy(self, img): """Used for computing network flops. See `mmdetection/tools/get_flops.py` """ # backbone x = self.extract_feat(img) # rpn num_imgs = len(img) dummy_img_metas = [ dict(img_shape=(800, 1333, 3)) for _ in range(num_imgs) ] rpn_results = self.rpn_head.simple_test_rpn(x, dummy_img_metas) (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = rpn_results roi_outs = self.roi_head.forward_dummy(x_feats, proposal_feats, dummy_img_metas) return roi_outs def extract_feat(self, img): """Directly extract features from the backbone+neck.""" x = self.backbone(img) if self.with_neck: x = self.neck(x) return x @property def with_rpn(self): """bool: whether the detector has RPN""" return hasattr(self, 'rpn_head') and self.rpn_head is not None @property def with_roi_head(self): """bool: whether the detector has a RoI head""" return hasattr(self, 'roi_head') and self.roi_head is not None def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs): """Test with augmentations. If rescale is False, then returned bboxes and masks will fit the scale of imgs[0]. 
""" pass def get_things_id_for_tracking(self, panoptic_seg, seg_infos): idxs = [] labels = [] masks = [] score = [] for segment in seg_infos: if segment['isthing'] == True: thing_mask = panoptic_seg == segment["id"] masks.append(thing_mask) idxs.append(segment["instance_id"]) labels.append(segment['category_id']) score.append(segment['score']) return idxs, labels, masks, score def pack_things_object(self, object_feats, ref_object_feats): object_feats, ref_object_feats = object_feats.squeeze(-1).squeeze(-1), ref_object_feats.squeeze(-1).squeeze(-1) thing_object_feats = torch.split(object_feats, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] ref_thing_object_feats = torch.split(ref_object_feats, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] return thing_object_feats, ref_thing_object_feats def pack_things_masks(self, mask_pred, ref_mask_pred): thing_mask_pred = torch.split(mask_pred, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] ref_thing_thing_mask_pred = torch.split(ref_mask_pred, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)[0] return thing_mask_pred, ref_thing_thing_mask_pred def get_semantic_seg(self, panoptic_seg, segments_info): results = {} masks = [] scores = [] kitti_step2cityscpaes = [11, 13] semantic_seg = np.zeros(panoptic_seg.shape) for segment in segments_info: if segment['isthing'] == True: if self.kitti_step: cat_cur = kitti_step2cityscpaes[segment["category_id"]] semantic_seg[panoptic_seg == segment["id"]] = cat_cur else: semantic_seg[panoptic_seg == segment["id"]] = segment["category_id"] + 11 else: # for stuff (0- n-1) if self.kitti_step: cat_cur = segment["category_id"] cat_cur -= 1 offset = 0 for thing_id in kitti_step2cityscpaes: if cat_cur + offset >= thing_id: offset += 1 cat_cur += offset semantic_seg[panoptic_seg == segment["id"]] = cat_cur else: semantic_seg[panoptic_seg == segment["id"]] = segment["category_id"] - 1 return semantic_seg def generate_track_id_maps(self, 
ids, masks, panopitc_seg_maps): final_id_maps = np.zeros(panopitc_seg_maps.shape) if len(ids) == 0: return final_id_maps # assert len(things_mask_results) == len(track_results) masks = masks.bool() for i, id in enumerate(ids): mask = masks[i].cpu().numpy() final_id_maps[mask] = id return final_id_maps ================================================ FILE: knet/video/knet_quansi_dense_roi_gt_box_joint_train.py ================================================ import warnings import numpy as np import torch import torch.nn.functional as F from mmcv.cnn import ConvModule from mmdet.models.builder import DETECTORS from mmdet.models.detectors import BaseDetector from mmdet.models.builder import build_head, build_neck, build_backbone, build_roi_extractor from mmdet.core import build_assigner, build_sampler from knet.video.qdtrack.builder import build_tracker from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes, sem2ins_masks_kitti_step from unitrack.mask import tensor_mask2box from unitrack.utils.mask import mask2box, batch_mask2boxlist, bboxlist2roi # RoI box based Video K-Net baseline. @DETECTORS.register_module() class VideoKNetQuansiTrackROIGTBoxJointTrain(BaseDetector): """ Simple Extension of KNet to Video KNet by the implementation of VPSFuse Net. 
""" def __init__(self, backbone, neck=None, rpn_head=None, roi_head=None, track_head=None, extra_neck=None, track_localization_fpn=None, tracker=None, train_cfg=None, test_cfg=None, track_train_cfg=None, pretrained=None, init_cfg=None, num_thing_classes=80, num_stuff_classes=53, mask_assign_stride=4, ignore_label=255, thing_label_in_seg=0, cityscapes=False, kitti_step=False, freeze_detector=False, semantic_filter=False, # linking parameters link_previous=False, bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict( type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), **kwargs): super(VideoKNetQuansiTrackROIGTBoxJointTrain, self).__init__(init_cfg) if pretrained: warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') backbone.pretrained = pretrained self.backbone = build_backbone(backbone) if neck is not None: self.neck = build_neck(neck) if extra_neck is not None: self.extra_neck = build_neck(extra_neck) if rpn_head is not None: rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None rpn_head_ = rpn_head.copy() rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) self.rpn_head = build_head(rpn_head_) if roi_head is not None: # update train and test cfg here for now # TODO: refactor assigner & sampler rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None roi_head.update(train_cfg=rcnn_train_cfg) roi_head.update(test_cfg=test_cfg.rcnn) roi_head.pretrained = pretrained self.roi_head = build_head(roi_head) if track_head is not None: self.track_train_cfg = track_train_cfg self.track_head = build_head(track_head) self.init_track_assigner_sampler() if track_localization_fpn is not None: self.track_localization_fpn = build_neck(track_localization_fpn) self.track_roi_extractor = build_roi_extractor( bbox_roi_extractor) if tracker is not None: self.tracker_cfg = tracker if freeze_detector: self._freeze_detector() self.train_cfg = train_cfg 
self.test_cfg = test_cfg self.num_proposals = self.rpn_head.num_proposals self.num_thing_classes = num_thing_classes self.num_stuff_classes = num_stuff_classes self.mask_assign_stride = mask_assign_stride self.thing_label_in_seg = thing_label_in_seg self.ignore_label = ignore_label self.cityscapes = cityscapes # whether to train the cityscape panoptic segmentation self.kitti_step = kitti_step # whether to train the kitti step panoptic segmentation self.semantic_filter = semantic_filter self.link_previous = link_previous def init_tracker(self): self.tracker = build_tracker(self.tracker_cfg) def _freeze_detector(self): self.detector = [ self.rpn_head, self.roi_head ] for model in self.detector: model.eval() for param in model.parameters(): param.requires_grad = False def init_track_assigner_sampler(self): """Initialize assigner and sampler.""" self.track_roi_assigner = build_assigner( self.track_train_cfg.assigner) self.track_share_assigner = False self.track_roi_sampler = build_sampler( self.track_train_cfg.sampler, context=self) self.track_share_sampler = False def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg): # gt_masks and gt_semantic_seg are not padded when forming batch gt_masks_tensor = [] gt_sem_seg = [] gt_sem_cls = [] # batch_input_shape shoud be the same across images pad_H, pad_W = img_metas[0]['batch_input_shape'] assign_H = pad_H // self.mask_assign_stride assign_W = pad_W // self.mask_assign_stride for i, gt_mask in enumerate(gt_masks): mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device) if gt_mask.width != pad_W or gt_mask.height != pad_H: pad_wh = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height) mask_tensor = F.pad(mask_tensor, pad_wh, value=0) if gt_semantic_seg is not None: # gt_semantic seg is padded by zero when forming a batch # need to convert them from 0 to ignore gt_semantic_seg[ i, :, img_metas[i]['img_shape'][0]:, :] = self.ignore_label gt_semantic_seg[ i, :, :, img_metas[i]['img_shape'][1]:] 
= self.ignore_label if self.cityscapes: sem_labels, sem_seg = sem2ins_masks_cityscapes( gt_semantic_seg[i], ignore_label=self.ignore_label, label_shift=self.num_thing_classes) elif self.kitti_step: sem_labels, sem_seg = sem2ins_masks_kitti_step( gt_semantic_seg[i], ignore_label=self.ignore_label, label_shift=2, thing_label_in_seg=(11, 13)) else: sem_labels, sem_seg = sem2ins_masks( gt_semantic_seg[i], ignore_label=self.ignore_label, label_shift=self.num_thing_classes, thing_label_in_seg=self.thing_label_in_seg) if sem_seg.shape[0] == 0: gt_sem_seg.append( mask_tensor.new_zeros( (mask_tensor.size(0), assign_H, assign_W))) else: gt_sem_seg.append( F.interpolate( sem_seg[None], (assign_H, assign_W), mode='bilinear', align_corners=False)[0]) gt_sem_cls.append(sem_labels) else: gt_sem_seg = None gt_sem_cls = None if mask_tensor.shape[0] == 0: gt_masks_tensor.append( mask_tensor.new_zeros( (mask_tensor.size(0), assign_H, assign_W))) else: gt_masks_tensor.append( F.interpolate( mask_tensor[None], (assign_H, assign_W), # downsample to 1/4 resolution mode='bilinear', align_corners=False)[0]) return gt_masks_tensor, gt_sem_cls, gt_sem_seg def forward_train(self, img, img_metas, gt_bboxes=None, gt_labels=None, gt_bboxes_ignore=None, gt_masks=None, gt_semantic_seg=None, gt_instance_ids=None, ref_img=None, ref_img_metas=None, ref_gt_bboxes_ignore=None, ref_gt_labels=None, ref_gt_bboxes=None, ref_gt_masks=None, ref_gt_semantic_seg=None, ref_gt_instance_ids=None, proposals=None, **kwargs): """Forward function of SparseR-CNN-like network in train stage. Args: img (Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_metas (list[dict]): list of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see :class:`mmdet.datasets.pipelines.Collect`. 
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor): specify which bounding boxes can be ignored when computing the loss. gt_masks (List[Tensor], optional) : Segmentation masks for each box. But we don't support it in this architecture. proposals (List[Tensor], optional): override rpn proposals with custom proposals. Use when `with_rpn` is False. # This is for video only: ref_img (Tensor): of shape (N, 2, C, H, W) encoding input images. Typically these should be mean centered and std scaled. 2 denotes there is two reference images for each input image. ref_img_metas (list[list[dict]]): The first list only has one element. The second list contains reference image information dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. ref_gt_bboxes (list[Tensor]): The list only has one Tensor. The Tensor contains ground truth bboxes for each reference image with shape (num_all_ref_gts, 5) in [ref_img_id, tl_x, tl_y, br_x, br_y] format. The ref_img_id start from 0, and denotes the id of reference image for each key image. ref_gt_labels (list[Tensor]): The list only has one Tensor. The Tensor contains class indices corresponding to each reference box with shape (num_all_ref_gts, 2) in [ref_img_id, class_indice]. 
Returns: dict[str, Tensor]: a dictionary of loss components """ batch_input_shape = tuple(img[0].size()[-2:]) for img_meta in img_metas: img_meta['batch_input_shape'] = batch_input_shape assert proposals is None, 'KNet does not support' \ ' external proposals' assert gt_masks is not None assert gt_instance_ids is not None # preprocess the reference images ref_img = ref_img.squeeze(1) # (b,3,h,w) img_h, img_w = batch_input_shape ref_masks_gt = [] for ref_gt_mask in ref_gt_masks: ref_masks_gt.append(ref_gt_mask[0]) ref_labels_gt = [] for ref_gt_label in ref_gt_labels: ref_labels_gt.append(ref_gt_label[:, 1].long()) ref_gt_labels = ref_labels_gt ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1) ref_gt_instance_id_list = [] for ref_gt_instance_id in ref_gt_instance_ids: ref_gt_instance_id_list.append(ref_gt_instance_id[:,1].long()) ref_img_metas_new = [] for ref_img_meta in ref_img_metas: ref_img_meta[0]['batch_input_shape'] = batch_input_shape ref_img_metas_new.append(ref_img_meta[0]) # prepare the gt_match_indices gt_pids_list = [] for i in range(len(ref_gt_instance_id_list)): ref_ids = ref_gt_instance_id_list[i].cpu().data.numpy().tolist() gt_ids = gt_instance_ids[i].cpu().data.numpy().tolist() gt_pids = [ref_ids.index(i) if i in ref_ids else -1 for i in gt_ids] gt_pids_list.append(torch.LongTensor([gt_pids]).to(img.device)[0]) gt_match_indices = gt_pids_list # gt_masks and gt_semantic_seg are not padded when forming batch gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(img_metas, gt_masks, gt_labels, gt_semantic_seg) ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(ref_img_metas_new, ref_masks_gt, ref_gt_labels, ref_semantic_seg_gt) x = self.extract_feat(img) x_ref = self.extract_feat(ref_img) # current frame rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks, gt_labels, gt_sem_seg, gt_sem_cls) # simple forward to get the reference results # self.rpn_head.eval() # ref_rpn_results = 
self.rpn_head.simple_test_rpn(x_ref, ref_img_metas_new) # self.rpn_head.train() # reference frame ref_rpn_results = self.rpn_head.forward_train(x_ref, ref_img_metas_new, ref_gt_masks, ref_labels_gt, ref_gt_sem_seg, ref_gt_sem_cls) (rpn_losses, proposal_feats, x_feats, mask_preds, cls_scores) = rpn_results (ref_rpn_losses, ref_proposal_feats, ref_x_feats, ref_mask_preds, ref_cls_scores) = ref_rpn_results losses_ref, ref_obj_feats, ref_cls_scores, ref_mask_preds, ref_scaled_mask_preds = self.roi_head.forward_train( ref_x_feats, ref_proposal_feats, ref_mask_preds, ref_cls_scores, ref_img_metas, ref_gt_masks, ref_gt_labels, gt_sem_seg=ref_gt_sem_seg, gt_sem_cls=ref_gt_sem_cls, imgs_whwh=None) if self.link_previous: losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train_with_previous( x_feats, proposal_feats, mask_preds, cls_scores, img_metas, gt_masks, gt_labels, gt_bboxes_ignore=gt_bboxes_ignore, gt_bboxes=gt_bboxes, gt_sem_seg=gt_sem_seg, gt_sem_cls=gt_sem_cls, imgs_whwh=None, previous_obj_feats=ref_obj_feats, previous_mask_preds=ref_scaled_mask_preds, previous_x_feats=ref_x_feats, ) else: # forward to get the current results losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train( x_feats, proposal_feats, mask_preds, cls_scores, img_metas, gt_masks, gt_labels, gt_bboxes_ignore=gt_bboxes_ignore, gt_bboxes=gt_bboxes, gt_sem_seg=gt_sem_seg, gt_sem_cls=gt_sem_cls, imgs_whwh=None) # ===== Tracking Part -==== # # assign both key frame and reference frame tracking targets key_sampling_results, ref_sampling_results = [], [] num_imgs = len(img_metas) for i in range(num_imgs): assign_result = self.track_roi_assigner.assign( scaled_mask_preds[i][:self.num_proposals].detach(), cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(), gt_masks[i], gt_labels[i], img_meta=img_metas[i]) sampling_result = self.track_roi_sampler.sample( assign_result, mask_preds[i][:self.num_proposals].detach(), 
gt_masks[i]) key_sampling_results.append(sampling_result) ref_assign_result = self.track_roi_assigner.assign( ref_scaled_mask_preds[i][:self.num_proposals].detach(), ref_cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(), ref_gt_masks[i], ref_gt_labels[i], img_meta=ref_img_metas_new[i]) ref_sampling_result = self.track_roi_sampler.sample( ref_assign_result, ref_mask_preds[i][:self.num_proposals].detach(), ref_gt_masks[i]) ref_sampling_results.append(ref_sampling_result) # roi feature embeddings key_masks = [res.pos_gt_masks for res in key_sampling_results] for i in range(len(key_masks)): key_masks[i] = F.interpolate(key_masks[i].unsqueeze(0), size=(img_h, img_w), mode="bilinear", align_corners=False).squeeze(0) key_masks[i] = (key_masks[i].sigmoid() > 0.5).float() key_feats = self._track_forward(x, key_masks) # roi feature embeddings ref_masks = [res.pos_gt_masks for res in ref_sampling_results] for i in range(len(ref_masks)): ref_masks[i] = F.interpolate(ref_masks[i].unsqueeze(0), size=(img_h, img_w), mode="bilinear", align_corners=False).squeeze(0) ref_masks[i] = (ref_masks[i].sigmoid() > 0.5).float() ref_feats = self._track_forward(x_ref, ref_masks) match_feats = self.track_head.match(key_feats, ref_feats, key_sampling_results, ref_sampling_results) asso_targets = self.track_head.get_track_targets( gt_match_indices, key_sampling_results, ref_sampling_results) loss_track = self.track_head.loss(*match_feats, *asso_targets) losses_ref = self.add_ref_loss(losses_ref) ref_rpn_losses = self.add_ref_rpn_loss(ref_rpn_losses) losses.update(ref_rpn_losses) losses.update(rpn_losses) losses.update(losses_ref) losses.update(loss_track) return losses def simple_test(self, img, img_metas, rescale=False, ref_img=None, **kwargs): """Test function without test time augmentation. Args: imgs (list[torch.Tensor]): List of multiple images img_metas (list[dict]): List of image information. rescale (bool): Whether to rescale the results. Defaults to False. 
Returns: list[list[np.ndarray]]: BBox results of each image and classes. The outer list corresponds to each image. The inner list corresponds to each class. """ # whether is the first frame for such clips # assert 'city' in img_metas[0]['filename'] and 'iid' in img_metas[0] if "city" in img_metas[0]['filename']: iid = img_metas[0]['iid'] fid = iid % 10000 is_first = (fid == 1) else: iid = kwargs['img_id'][0].item() fid = iid % 10000 is_first = (fid == 0) if is_first: self.init_tracker() self.obj_feats_memory = None self.x_feats_memory = None self.mask_preds_memory = None # for current frame x = self.extract_feat(img) # current frame inference rpn_results = self.rpn_head.simple_test_rpn(x, img_metas) (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = rpn_results if self.link_previous: cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test_with_previous( x_feats, proposal_feats, mask_preds, cls_scores, img_metas, previous_obj_feats=self.obj_feats_memory, previous_mask_preds=self.mask_preds_memory, previous_x_feats=self.x_feats_memory, ) self.obj_feats_memory = query_output self.x_feats_memory = x_feats self.mask_preds_memory = scaled_mask_preds else: cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test( x_feats, proposal_feats, mask_preds, cls_scores, img_metas) # for tracking part _, segm_result, mask_preds, panoptic_result = cur_segm_results[0] panoptic_seg, segments_info = panoptic_result if self.semantic_filter: seg_preds = torch.nn.functional.interpolate(seg_preds, panoptic_seg.shape, mode='bilinear', align_corners=False) seg_preds = seg_preds.sigmoid() seg_out = seg_preds.argmax(1) semantic_thing = (seg_out < self.num_thing_classes).to(dtype=torch.float32) else: semantic_thing = 1. 
# get sorted tracking thing ids, labels, masks, score for tracking things_index_for_tracking, things_labels_for_tracking, thing_masks_for_tracking, things_score_for_tracking = \ self.get_things_id_for_tracking(panoptic_seg, segments_info) things_labels_for_tracking = torch.Tensor(things_labels_for_tracking).to(cls_scores.device).long() if len(things_labels_for_tracking) > 0: things_bbox_for_tracking = torch.zeros((len(things_score_for_tracking), 5), dtype=torch.float, device=x_feats.device) things_bbox_for_tracking[:, 4] = torch.tensor(things_score_for_tracking, device=things_bbox_for_tracking.device) thing_masks_for_tracking_final = [] for mask in thing_masks_for_tracking: thing_masks_for_tracking_final.append(torch.Tensor(mask).unsqueeze(0).to( x_feats.device).float()) thing_masks_for_tracking_final = torch.cat(thing_masks_for_tracking_final, 0) thing_masks_for_tracking = thing_masks_for_tracking_final thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_final * semantic_thing if len(things_labels_for_tracking) == 0: track_feats = None else: # tracking embedding features track_feats = self._track_forward(x, thing_masks_for_tracking_with_semantic_filter) if track_feats is not None: # assert len(things_id_for_tracking) == len(things_labels_for_tracking) things_bbox_for_tracking[:, :4] = torch.tensor(tensor_mask2box(thing_masks_for_tracking_with_semantic_filter), device=things_bbox_for_tracking.device) bboxes, labels, ids = self.tracker.match( bboxes=things_bbox_for_tracking, labels=things_labels_for_tracking, track_feats=track_feats, frame_id=fid) ids = ids + 1 ids[ids == -1] = 0 else: ids = [] track_maps = self.generate_track_id_maps(ids, thing_masks_for_tracking, panoptic_seg) semantic_map = self.get_semantic_seg(panoptic_seg, segments_info) from scripts.visualizer import trackmap2rgb, cityscapes_cat2rgb, draw_bbox_on_img vis_tracker = trackmap2rgb(track_maps) vis_sem = cityscapes_cat2rgb(semantic_map) if len(things_labels_for_tracking): 
def _track_forward(self, x, mask_pred):
    """Compute tracking embeddings for instance masks via RoI pooling.

    Masks are converted to boxes, box coordinates are clamped to be
    non-negative, RoI features are pooled from the first
    ``self.track_roi_extractor.num_inputs`` levels of ``x`` and passed
    through ``self.track_head``. Used in both training and testing.

    Args:
        x: Multi-level features as returned by :meth:`extract_feat`.
        mask_pred: Instance masks. Outside training mode the value is
            wrapped in a one-element list before box extraction.

    Returns:
        The output of ``self.track_head`` on the pooled RoI features.
    """
    if not self.training:
        # At test time a single image's masks are passed in directly.
        mask_pred = [mask_pred]
    bbox_list = batch_mask2boxlist(mask_pred)
    track_rois = bboxlist2roi(bbox_list)
    track_rois = track_rois.clamp(min=0.0)
    track_feats = self.track_roi_extractor(
        x[:self.track_roi_extractor.num_inputs], track_rois)
    track_feats = self.track_head(track_feats)
    return track_feats


def forward_dummy(self, img):
    """Run backbone, RPN and RoI head with dummy image metas.

    Used for computing network flops.
    See `mmdetection/tools/get_flops.py`.

    Args:
        img: Batch of input images; ``len(img)`` dummy metas are created.

    Returns:
        Whatever ``self.roi_head.forward_dummy`` returns.
    """
    # backbone
    x = self.extract_feat(img)
    # rpn
    num_imgs = len(img)
    dummy_img_metas = [
        dict(img_shape=(800, 1333, 3)) for _ in range(num_imgs)
    ]
    rpn_results = self.rpn_head.simple_test_rpn(x, dummy_img_metas)
    # The RPN is expected to yield exactly five outputs; only two are used.
    (proposal_feats, x_feats, mask_preds, cls_scores,
     seg_preds) = rpn_results
    roi_outs = self.roi_head.forward_dummy(x_feats, proposal_feats,
                                           dummy_img_metas)
    return roi_outs


def extract_feat(self, img):
    """Extract features with the backbone and, if present, the neck."""
    x = self.backbone(img)
    if self.with_neck:
        x = self.neck(x)
    return x


@property
def with_rpn(self):
    """bool: whether the detector has RPN"""
    return hasattr(self, 'rpn_head') and self.rpn_head is not None


@property
def with_roi_head(self):
    """bool: whether the detector has a RoI head"""
    return hasattr(self, 'roi_head') and self.roi_head is not None


def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    """Placeholder for test-time augmentation.

    Not implemented: the arguments are ignored and ``None`` is returned.
    """
    pass


def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
    """Collect the thing segments of a panoptic result for tracking.

    Args:
        panoptic_seg: Panoptic id map; compared against each segment id.
        seg_infos (list[dict]): Segment descriptions. Thing segments must
            provide ``id``, ``instance_id``, ``category_id`` and ``score``.

    Returns:
        tuple[list, list, list, list]: instance ids, category ids, boolean
        masks and scores of the thing segments, in input order.
    """
    idxs = []
    labels = []
    masks = []
    score = []
    for segment in seg_infos:
        if not segment['isthing']:
            continue
        masks.append(panoptic_seg == segment["id"])
        idxs.append(segment["instance_id"])
        labels.append(segment['category_id'])
        score.append(segment['score'])
    return idxs, labels, masks, score


def pack_things_object(self, object_feats, ref_object_feats):
    """Keep only the thing queries of key and reference object features.

    The two trailing dimensions are squeezed, then dimension 1 is split
    into ``[num_proposals, num_stuff_classes]`` and the first chunk kept.

    Returns:
        tuple: ``(thing_object_feats, ref_thing_object_feats)``.
    """
    split_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
    object_feats = object_feats.squeeze(-1).squeeze(-1)
    ref_object_feats = ref_object_feats.squeeze(-1).squeeze(-1)
    thing_object_feats = torch.split(object_feats, split_sizes, dim=1)[0]
    ref_thing_object_feats = torch.split(
        ref_object_feats, split_sizes, dim=1)[0]
    return thing_object_feats, ref_thing_object_feats


def pack_things_masks(self, mask_pred, ref_mask_pred):
    """Keep only the thing channels of key and reference mask predictions.

    Dimension 1 is split into ``[num_proposals, num_stuff_classes]`` and
    the first chunk is returned for each input.
    """
    split_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
    thing_mask_pred = torch.split(mask_pred, split_sizes, dim=1)[0]
    ref_thing_mask_pred = torch.split(ref_mask_pred, split_sizes, dim=1)[0]
    return thing_mask_pred, ref_thing_mask_pred


def get_semantic_seg(self, panoptic_seg, segments_info):
    """Convert a panoptic result into a per-pixel semantic label map.

    Args:
        panoptic_seg: Panoptic id map of shape (H, W).
        segments_info (list[dict]): Segments with ``id``, ``isthing`` and
            ``category_id``.

    Returns:
        np.ndarray: Float array with the shape of ``panoptic_seg``. Pixels
        that belong to no listed segment stay 0.
    """
    # Semantic ids given to the two KITTI-STEP thing classes (presumably
    # the Cityscapes train ids of person and car -- confirm with dataset).
    kitti_step2cityscapes = [11, 13]
    semantic_seg = np.zeros(panoptic_seg.shape)
    for segment in segments_info:
        region = panoptic_seg == segment["id"]
        category = segment["category_id"]
        if segment['isthing']:
            if self.kitti_step:
                semantic_seg[region] = kitti_step2cityscapes[category]
            else:
                # Thing labels follow the 11 stuff labels.
                semantic_seg[region] = category + 11
        elif self.kitti_step:
            # Stuff ids are 1-based and contiguous; re-insert the gaps
            # occupied by the thing ids so stuff ids skip over them.
            cat_cur = category - 1
            offset = 0
            for thing_id in kitti_step2cityscapes:
                if cat_cur + offset >= thing_id:
                    offset += 1
            semantic_seg[region] = cat_cur + offset
        else:
            # for stuff (0- n-1)
            semantic_seg[region] = category - 1
    return semantic_seg
def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
    """Paint tracking ids onto a map shaped like the panoptic result.

    Args:
        ids: Sequence of tracking ids, one per mask.
        masks: Tensor of instance masks indexed in the same order as
            ``ids``; converted to boolean before use. Not touched when
            ``ids`` is empty.
        panopitc_seg_maps: Panoptic map; only its ``shape`` is read.

    Returns:
        np.ndarray: Float map, 0 where no mask applies. Later masks
        overwrite earlier ones where they overlap.
    """
    final_id_maps = np.zeros(panopitc_seg_maps.shape)
    if len(ids) == 0:
        return final_id_maps
    masks = masks.bool()
    for i, track_id in enumerate(ids):
        mask = masks[i].cpu().numpy()
        final_id_maps[mask] = track_id
    return final_id_maps


def add_ref_loss(self, loss_dict):
    """Return a copy of ``loss_dict`` with ``"_ref"`` appended to each key."""
    return {str(k) + "_ref": v for k, v in loss_dict.items()}


def add_ref_rpn_loss(self, loss_dict):
    """Return a copy of ``loss_dict`` with ``"_ref_rpn"`` appended to each key."""
    return {str(k) + "_ref_rpn": v for k, v in loss_dict.items()}


def __init__(self,
             backbone,
             neck=None,
             rpn_head=None,
             roi_head=None,
             track_head=None,
             extra_neck=None,
             train_cfg=None,
             test_cfg=None,
             pretrained=None,
             init_cfg=None,
             num_thing_classes=80,
             num_stuff_classes=53,
             mask_assign_stride=4,
             ignore_label=255,
             thing_label_in_seg=0,
             cityscapes=False,
             **kwargs):
    """Build the sub-modules of ``VideoKNetFuseTrack`` from their configs.

    Args:
        backbone: Backbone config, passed to ``build_backbone``.
        neck: Optional neck config.
        rpn_head: Optional kernel-generation head config.
        roi_head: Optional kernel-update head config.
        track_head: Optional tracking head config.
        extra_neck: Optional second neck config.
        train_cfg: Training config; ``.rpn`` / ``.rcnn`` are forwarded to
            the respective heads when it is not ``None``.
        test_cfg: Test config, handled like ``train_cfg``.
        pretrained: Deprecated checkpoint path; use ``init_cfg`` instead.
        init_cfg: Initialisation config forwarded to the base class.
        num_thing_classes (int): Number of thing classes.
        num_stuff_classes (int): Number of stuff classes.
        mask_assign_stride (int): Down-sampling factor applied to ground
            truth masks before assignment.
        ignore_label (int): Label written into padded semantic regions.
        thing_label_in_seg (int): Forwarded to ``sem2ins_masks``.
        cityscapes (bool): Use the Cityscapes semantic-to-instance
            conversion in ``preprocess_gt_masks``.
        **kwargs: Accepted and ignored.
    """
    super(VideoKNetFuseTrack, self).__init__(init_cfg)
    if pretrained:
        warnings.warn('DeprecationWarning: pretrained is deprecated, '
                      'please use "init_cfg" instead')
        backbone.pretrained = pretrained
    self.backbone = build_backbone(backbone)

    if neck is not None:
        self.neck = build_neck(neck)

    if extra_neck is not None:
        self.extra_neck = build_neck(extra_neck)

    if rpn_head is not None:
        rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
        # Guard test_cfg the same way as train_cfg so a missing test
        # config does not raise AttributeError.
        rpn_test_cfg = test_cfg.rpn if test_cfg is not None else None
        rpn_head_ = rpn_head.copy()
        rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=rpn_test_cfg)
        self.rpn_head = build_head(rpn_head_)

    if roi_head is not None:
        # update train and test cfg here for now
        # TODO: refactor assigner & sampler
        rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
        rcnn_test_cfg = test_cfg.rcnn if test_cfg is not None else None
        # Work on a copy, like for rpn_head, so the caller's config is
        # not modified.
        roi_head_ = roi_head.copy()
        roi_head_.update(train_cfg=rcnn_train_cfg)
        roi_head_.update(test_cfg=rcnn_test_cfg)
        roi_head_.pretrained = pretrained
        self.roi_head = build_head(roi_head_)

    if track_head is not None:
        self.track_head = build_head(track_head)

    self.train_cfg = train_cfg
    self.test_cfg = test_cfg

    self.num_thing_classes = num_thing_classes
    self.num_stuff_classes = num_stuff_classes
    self.mask_assign_stride = mask_assign_stride
    self.thing_label_in_seg = thing_label_in_seg
    self.ignore_label = ignore_label
    # whether to train the cityscape panoptic segmentation
    self.cityscapes = cityscapes
def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
    """Pad and down-sample ground truth masks to the assignment size.

    Instance masks are zero-padded to the batch input shape (read from
    ``img_metas[0]['batch_input_shape']``) and bilinearly resized by
    ``1 / self.mask_assign_stride``. When a semantic map is given, its
    padded border is overwritten in place with ``self.ignore_label`` and
    it is converted to per-class stuff masks that are resized likewise.

    Args:
        img_metas (list[dict]): Image metas; ``img_shape`` is read per image
            when ``gt_semantic_seg`` is given.
        gt_masks (list): Per-image mask containers exposing ``to_tensor``,
            ``width`` and ``height``.
        gt_labels (list[Tensor]): Only ``gt_labels[0].device`` is used.
        gt_semantic_seg (Tensor | None): Batched semantic maps, modified
            in place.

    Returns:
        tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``; the last two
        are ``None`` when ``gt_semantic_seg`` is ``None`` and at least one
        image was processed.
    """
    stride = self.mask_assign_stride
    # batch_input_shape should be the same across images
    full_h, full_w = img_metas[0]['batch_input_shape']
    out_size = (full_h // stride, full_w // stride)

    def _shrink(stack):
        # Bilinear resize of an (N, H, W) stack to the assignment size.
        resized = F.interpolate(
            stack[None], out_size, mode='bilinear', align_corners=False)
        return resized[0]

    instance_targets = []
    stuff_targets = []
    stuff_labels = []
    for idx, bitmap in enumerate(gt_masks):
        inst = bitmap.to_tensor(torch.float, gt_labels[0].device)
        if not (bitmap.width == full_w and bitmap.height == full_h):
            # Masks are not padded when the batch is formed.
            inst = F.pad(
                inst,
                (0, full_w - bitmap.width, 0, full_h - bitmap.height),
                value=0)

        if gt_semantic_seg is None:
            stuff_targets = None
            stuff_labels = None
        else:
            # The semantic map was zero-padded when batching; mark the
            # padded rows and columns as ignore instead.
            valid_h = img_metas[idx]['img_shape'][0]
            gt_semantic_seg[idx, :, valid_h:, :] = self.ignore_label
            valid_w = img_metas[idx]['img_shape'][1]
            gt_semantic_seg[idx, :, :, valid_w:] = self.ignore_label

            if self.cityscapes:
                labels, stuff = sem2ins_masks_cityscapes(
                    gt_semantic_seg[idx],
                    ignore_label=self.ignore_label,
                    label_shift=self.num_thing_classes)
            else:
                labels, stuff = sem2ins_masks(
                    gt_semantic_seg[idx],
                    ignore_label=self.ignore_label,
                    label_shift=self.num_thing_classes,
                    thing_label_in_seg=self.thing_label_in_seg)

            if stuff.shape[0] == 0:
                stuff_targets.append(
                    inst.new_zeros((inst.size(0),) + out_size))
            else:
                stuff_targets.append(_shrink(stuff))
            stuff_labels.append(labels)

        if inst.shape[0] == 0:
            instance_targets.append(
                inst.new_zeros((inst.size(0),) + out_size))
        else:
            # downsample to 1/stride resolution
            instance_targets.append(_shrink(inst))

    return instance_targets, stuff_labels, stuff_targets
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes=None,
                  gt_labels=None,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  gt_semantic_seg=None,
                  gt_instance_ids=None,
                  ref_img=None,
                  ref_img_metas=None,
                  ref_gt_bboxes_ignore=None,
                  ref_gt_labels=None,
                  ref_gt_bboxes=None,
                  ref_gt_masks=None,
                  ref_gt_semantic_seg=None,
                  ref_gt_instance_ids=None,
                  proposals=None,
                  **kwargs):
    """Compute training losses for a key frame and one reference frame.

    Both frames go through the backbone, the RPN head and the RoI head;
    the thing object features of the two frames are then matched by the
    track head.

    Args:
        img (Tensor): Key images of shape (N, C, H, W).
        img_metas (list[dict]): Key image metas; ``batch_input_shape`` is
            written into each dict.
        gt_bboxes (list[Tensor]): Forwarded to the RoI head.
        gt_labels (list[Tensor]): Class indices of the key instances.
        gt_bboxes_ignore (None | list[Tensor]): Forwarded to the RoI head.
        gt_masks (list): Key instance masks. Required.
        gt_semantic_seg (Tensor): Key semantic maps.
        gt_instance_ids (list[Tensor]): Instance ids of the key
            instances. Required.
        ref_img (Tensor): Reference images with a singleton dimension 1
            that is squeezed away, i.e. one reference frame per key frame.
        ref_img_metas (list[list[dict]]): Per sample a list whose first
            element is the reference image meta.
        ref_gt_bboxes_ignore: Forwarded to the reference RoI pass.
        ref_gt_labels (list[Tensor]): Two-column tensors; column 1 holds
            the class index.
        ref_gt_bboxes: Forwarded to the reference RoI pass.
        ref_gt_masks (list): Per sample a sequence whose first element is
            the reference mask container.
        ref_gt_semantic_seg (Tensor): Reference semantic maps with a
            singleton dimension 1.
        ref_gt_instance_ids (list[Tensor]): Two-column tensors; column 1
            holds the instance id.
        proposals: Must be ``None``; external proposals are unsupported.

    Returns:
        dict[str, Tensor]: Key-frame RoI and RPN losses, the reference
        frame's RPN and RoI losses with ``"_ref"`` appended to their keys,
        and the track loss.
    """
    batch_input_shape = tuple(img[0].size()[-2:])
    for img_meta in img_metas:
        img_meta['batch_input_shape'] = batch_input_shape

    assert proposals is None, 'KNet does not support' \
                              ' external proposals'
    assert gt_masks is not None
    assert gt_instance_ids is not None

    # preprocess the reference images
    ref_img = ref_img.squeeze(1)  # (b,3,h,w)

    ref_masks_gt = [ref_gt_mask[0] for ref_gt_mask in ref_gt_masks]
    ref_labels_gt = [
        ref_gt_label[:, 1].long() for ref_gt_label in ref_gt_labels
    ]
    ref_gt_labels = ref_labels_gt
    ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)
    ref_gt_instance_id_list = [
        ref_gt_instance_id[:, 1].long()
        for ref_gt_instance_id in ref_gt_instance_ids
    ]

    ref_img_metas_new = []
    for ref_img_meta in ref_img_metas:
        ref_img_meta[0]['batch_input_shape'] = batch_input_shape
        ref_img_metas_new.append(ref_img_meta[0])

    # For every key instance: 1-based position of the same instance id in
    # the reference frame, or 0 when it does not appear there.
    gt_pids_list = []
    for idx, ref_instance_ids in enumerate(ref_gt_instance_id_list):
        ref_ids = ref_instance_ids.tolist()
        gt_ids = gt_instance_ids[idx].tolist()
        gt_pids = [
            ref_ids.index(inst_id) + 1 if inst_id in ref_ids else 0
            for inst_id in gt_ids
        ]
        gt_pids_list.append(torch.LongTensor([gt_pids]).to(img.device)[0])
    gt_pids = gt_pids_list

    # gt_masks and gt_semantic_seg are not padded when forming batch
    gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(
        img_metas, gt_masks, gt_labels, gt_semantic_seg)
    ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(
        ref_img_metas_new, ref_masks_gt, ref_labels_gt, ref_semantic_seg_gt)

    x = self.extract_feat(img)
    x_ref = self.extract_feat(ref_img)

    rpn_results = self.rpn_head.forward_train(
        x, img_metas, gt_masks, gt_labels, gt_sem_seg, gt_sem_cls)
    ref_rpn_results = self.rpn_head.forward_train(
        x_ref, ref_img_metas_new, ref_gt_masks, ref_labels_gt,
        ref_gt_sem_seg, ref_gt_sem_cls)

    (rpn_losses, proposal_feats, x_feats, mask_preds,
     cls_scores) = rpn_results
    (ref_rpn_losses, ref_proposal_feats, ref_x_feats, ref_mask_preds,
     ref_cls_scores) = ref_rpn_results

    losses, sample_results, object_feats = self.roi_head.forward_train(
        x_feats,
        proposal_feats,
        mask_preds,
        cls_scores,
        img_metas,
        gt_masks,
        gt_labels,
        gt_pids=gt_pids,
        gt_bboxes_ignore=gt_bboxes_ignore,
        gt_bboxes=gt_bboxes,
        gt_sem_seg=gt_sem_seg,
        gt_sem_cls=gt_sem_cls,
        imgs_whwh=None)

    # Pass the flattened metas (list[dict]) like every other reference
    # call, not the nested list[list[dict]] received as argument.
    ref_losses, ref_sample_results, ref_object_feats = \
        self.roi_head.forward_train(
            ref_x_feats,
            ref_proposal_feats,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas_new,
            ref_gt_masks,
            ref_gt_labels,
            gt_bboxes=ref_gt_bboxes,
            gt_bboxes_ignore=ref_gt_bboxes_ignore,
            gt_sem_seg=ref_gt_sem_seg,
            gt_sem_cls=ref_gt_sem_cls,
            imgs_whwh=None)

    proposals_nums = [self.roi_head.num_proposals] * img.size()[0]
    ref_proposals_nums = proposals_nums

    object_feats, ref_object_feats = self.pack_things_object(
        object_feats, ref_object_feats)
    match_score = self.track_head(object_feats, ref_object_feats,
                                  proposals_nums, ref_proposals_nums)
    track_loss = self.track_head.loss(match_score, sample_results)

    # format the loss
    ref_rpn_losses = self.add_ref_rpn_loss(ref_rpn_losses)
    ref_losses = self.add_ref_rpn_loss(ref_losses)

    losses.update(ref_rpn_losses)
    losses.update(ref_losses)
    losses.update(track_loss)
    losses.update(rpn_losses)

    return losses
def simple_test(self, img, img_metas, rescale=False, ref_img=None):
    """Segment one frame and assign tracking ids to its thing segments.

    A frame is treated as the first of its clip when
    ``img_metas[0]['iid'] % 10000 == 1``. The first frame resets the
    tracker and initialises tracks from the current prediction; every
    later frame also runs the reference image through backbone and RPN
    and decodes it with the object queries stored by the previous call.

    NOTE(review): this method reads ``self.combine``,
    ``self.track_roi_head`` and ``self.tracker``; none of them is assigned
    by this class's ``__init__``. Confirm they are provided elsewhere
    before relying on this code path.

    Args:
        img (Tensor): Current frame, passed to :meth:`extract_feat`.
        img_metas (list[dict]): Image metas. ``img_metas[0]`` must contain
            ``'iid'`` and a ``'filename'`` that contains ``'city'``.
        rescale (bool): Forwarded to the RoI heads' ``simple_test``.
            Defaults to False.
        ref_img: Optional reference frame container; only element 0 is
            used.

    Returns:
        tuple: ``(cur_segm_results, track_maps, sseg_results)`` -- the RoI
        head results of the current frame, the tracking id map and the
        semantic map of the current frame.
    """
    if ref_img is not None:
        ref_img = ref_img[0]
    # whether this is the first frame of its clip
    assert 'city' in img_metas[0]['filename'] and 'iid' in img_metas[0]
    iid = img_metas[0]['iid']
    fid = iid % 10000
    is_first = (fid == 1)

    # for current frame
    x = self.extract_feat(img)
    rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
    (proposal_feats, x_feats, mask_preds, cls_scores,
     seg_preds) = rpn_results

    if not is_first:
        # Reference frame features; note the key frame's img_metas are
        # reused for the reference RPN pass.
        ref_x = self.extract_feat(ref_img)
        ref_rpn_results = self.rpn_head.simple_test_rpn(ref_x, img_metas)
        (ref_proposal_feats, ref_x_feats, ref_mask_preds, ref_cls_scores,
         ref_seg_preds) = ref_rpn_results
        x_fuse = self.combine(ref_x_feats + x_feats)

    cur_segm_results, cur_object_query = self.roi_head.simple_test(
        x_feats,
        proposal_feats,
        mask_preds,
        cls_scores,
        img_metas,
        imgs_whwh=None,
        rescale=rescale)

    # Only the first image of the batch is unpacked.
    bbox_result, segm_result, panoptic_result = cur_segm_results[0]
    panoptic_seg, segments_info = panoptic_result

    cur_results, sseg_results = self.pack_stuff_things_result(panoptic_seg, segments_info)

    if is_first:
        self.track_query = cur_object_query

    if not is_first:
        # Decode the fused features with the queries stored by the
        # previous call.
        track_seg_results = self.track_roi_head.simple_test(
            x_fuse,
            self.track_query,
            ref_mask_preds,
            ref_cls_scores,
            img_metas,
            imgs_whwh=None,
            rescale=rescale
        )

        bbox_result, segm_result, panoptic_result = track_seg_results[0]
        track_panoptic_seg, track_segments_info = panoptic_result
        track_results, ref_sseg_results = self.pack_stuff_things_result(track_panoptic_seg, track_segments_info)
        # update the tracking query
        self.track_query = cur_object_query

    if is_first:
        self.tracker.reset_all()
        init_track_results = self.tracker.init_track(cur_results)
        track_maps = self.generate_track_id_maps(init_track_results, panoptic_seg)
    elif not is_first:
        results = self.tracker.step(cur_results, track_results)
        track_maps = self.generate_track_id_maps(results, panoptic_seg)

    return cur_segm_results, track_maps, sseg_results
def forward_dummy(self, img):
    """Run backbone, RPN and RoI head with dummy image metas.

    Used for computing network flops.
    See `mmdetection/tools/get_flops.py`.

    Args:
        img: Batch of input images; ``len(img)`` dummy metas are created.

    Returns:
        Whatever ``self.roi_head.forward_dummy`` returns.
    """
    # backbone
    x = self.extract_feat(img)
    # rpn
    num_imgs = len(img)
    dummy_img_metas = [
        dict(img_shape=(800, 1333, 3)) for _ in range(num_imgs)
    ]
    rpn_results = self.rpn_head.simple_test_rpn(x, dummy_img_metas)
    # The RPN is expected to yield exactly five outputs; only two are used.
    (proposal_feats, x_feats, mask_preds, cls_scores,
     seg_preds) = rpn_results
    roi_outs = self.roi_head.forward_dummy(x_feats, proposal_feats,
                                           dummy_img_metas)
    return roi_outs


def extract_feat(self, img):
    """Extract features with the backbone and, if present, the neck."""
    x = self.backbone(img)
    if self.with_neck:
        x = self.neck(x)
    return x


@property
def with_rpn(self):
    """bool: whether the detector has RPN"""
    return hasattr(self, 'rpn_head') and self.rpn_head is not None


@property
def with_roi_head(self):
    """bool: whether the detector has a RoI head"""
    return hasattr(self, 'roi_head') and self.roi_head is not None


def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    """Placeholder for test-time augmentation.

    Not implemented: the arguments are ignored and ``None`` is returned.
    """
    pass


def pack_things_object(self, object_feats, ref_object_feats):
    """Keep only the thing queries of key and reference object features.

    The two trailing dimensions are squeezed, then dimension 1 is split
    into ``[num_proposals, num_stuff_classes]`` and the first chunk kept.

    Returns:
        tuple: ``(thing_object_feats, ref_thing_object_feats)``.
    """
    split_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
    object_feats = object_feats.squeeze(-1).squeeze(-1)
    ref_object_feats = ref_object_feats.squeeze(-1).squeeze(-1)
    thing_object_feats = torch.split(object_feats, split_sizes, dim=1)[0]
    ref_thing_object_feats = torch.split(
        ref_object_feats, split_sizes, dim=1)[0]
    return thing_object_feats, ref_thing_object_feats


def add_track_loss(self, loss_dict):
    """Return a copy of ``loss_dict`` with ``"_track"`` appended to each key."""
    return {str(k) + "_track": v for k, v in loss_dict.items()}


def add_ref_rpn_loss(self, loss_dict):
    """Return a copy of ``loss_dict`` with ``"_ref"`` appended to each key."""
    return {str(k) + "_ref": v for k, v in loss_dict.items()}


def pack_stuff_things_result(self, panoptic_seg, segments_info):
    """Split a panoptic result into thing masks and a semantic map.

    Args:
        panoptic_seg: Panoptic id map of shape (H, W).
        segments_info (list[dict]): Segments with ``id``, ``isthing`` and
            ``category_id``; thing segments also need ``score``.

    Returns:
        tuple[dict, np.ndarray]: ``results`` with the stacked thing
        ``"masks"`` and their ``"scores"``, and a float semantic map in
        which thing categories are shifted by 11 and stuff categories by
        -1. Pixels that belong to no listed segment stay 0.
    """
    results = {}
    masks = []
    scores = []
    semantic_seg = np.zeros(panoptic_seg.shape)
    for segment in segments_info:
        # Compute the region once and reuse it for mask and label map.
        region = panoptic_seg == segment["id"]
        if segment['isthing']:
            masks.append(region)
            scores.append(segment["score"])
            # for things to shift the labels
            semantic_seg[region] = segment["category_id"] + 11
        else:
            # for stuff (0- n-1)
            semantic_seg[region] = segment["category_id"] - 1

    results["masks"] = np.array(masks)  # (N, H, W) when N > 0
    results["scores"] = np.array(scores)  # (N,)

    return results, semantic_seg


def generate_track_id_maps(self, track_results, panopitc_seg_maps):
    """Paint tracking ids onto a map shaped like the panoptic result.

    Args:
        track_results: Iterable of dicts with ``"tracking_id"`` and a
            boolean ``"mask"``.
        panopitc_seg_maps: Panoptic map; only its ``shape`` is read.

    Returns:
        np.ndarray: Float map, 0 where no track applies. Later tracks
        overwrite earlier ones where they overlap.
    """
    final_id_maps = np.zeros(panopitc_seg_maps.shape)
    for track in track_results:
        track_id = track["tracking_id"]
        mask = track["mask"]
        final_id_maps[mask] = track_id
    return final_id_maps
def __init__(self,
             backbone,
             neck=None,
             rpn_head=None,
             roi_head=None,
             track_head=None,
             extra_neck=None,
             train_cfg=None,
             test_cfg=None,
             pretrained=None,
             init_cfg=None,
             num_thing_classes=80,
             num_stuff_classes=53,
             mask_assign_stride=4,
             ignore_label=255,
             thing_label_in_seg=0,
             cityscapes=False,
             **kwargs):
    """Build the sub-modules of ``VideoKNetFuseROITrack`` from their configs.

    Args:
        backbone: Backbone config, passed to ``build_backbone``.
        neck: Optional neck config.
        rpn_head: Optional kernel-generation head config.
        roi_head: Optional kernel-update head config.
        track_head: Optional tracking head config.
        extra_neck: Optional second neck config.
        train_cfg: Training config; ``.rpn`` / ``.rcnn`` are forwarded to
            the respective heads when it is not ``None``.
        test_cfg: Test config, handled like ``train_cfg``.
        pretrained: Deprecated checkpoint path; use ``init_cfg`` instead.
        init_cfg: Initialisation config forwarded to the base class.
        num_thing_classes (int): Number of thing classes.
        num_stuff_classes (int): Number of stuff classes.
        mask_assign_stride (int): Down-sampling factor applied to ground
            truth masks before assignment.
        ignore_label (int): Label written into padded semantic regions.
        thing_label_in_seg (int): Forwarded to ``sem2ins_masks``.
        cityscapes (bool): Use the Cityscapes semantic-to-instance
            conversion in ``preprocess_gt_masks``.
        **kwargs: Accepted and ignored.
    """
    super(VideoKNetFuseROITrack, self).__init__(init_cfg)
    if pretrained:
        warnings.warn('DeprecationWarning: pretrained is deprecated, '
                      'please use "init_cfg" instead')
        backbone.pretrained = pretrained
    self.backbone = build_backbone(backbone)

    if neck is not None:
        self.neck = build_neck(neck)

    if extra_neck is not None:
        self.extra_neck = build_neck(extra_neck)

    if rpn_head is not None:
        rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
        # Guard test_cfg the same way as train_cfg so a missing test
        # config does not raise AttributeError.
        rpn_test_cfg = test_cfg.rpn if test_cfg is not None else None
        rpn_head_ = rpn_head.copy()
        rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=rpn_test_cfg)
        self.rpn_head = build_head(rpn_head_)

    if roi_head is not None:
        # update train and test cfg here for now
        # TODO: refactor assigner & sampler
        rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
        rcnn_test_cfg = test_cfg.rcnn if test_cfg is not None else None
        # Work on a copy, like for rpn_head, so the caller's config is
        # not modified.
        roi_head_ = roi_head.copy()
        roi_head_.update(train_cfg=rcnn_train_cfg)
        roi_head_.update(test_cfg=rcnn_test_cfg)
        roi_head_.pretrained = pretrained
        self.roi_head = build_head(roi_head_)

    if track_head is not None:
        self.track_head = build_head(track_head)

    self.train_cfg = train_cfg
    self.test_cfg = test_cfg

    self.num_thing_classes = num_thing_classes
    self.num_stuff_classes = num_stuff_classes
    self.mask_assign_stride = mask_assign_stride
    self.thing_label_in_seg = thing_label_in_seg
    self.ignore_label = ignore_label
    # whether to train the cityscape panoptic segmentation
    self.cityscapes = cityscapes
def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
    """Pad and down-sample ground truth masks to the assignment size.

    Instance masks are zero-padded to the batch input shape (read from
    ``img_metas[0]['batch_input_shape']``) and bilinearly resized by
    ``1 / self.mask_assign_stride``. When a semantic map is given, its
    padded border is overwritten in place with ``self.ignore_label`` and
    it is converted to per-class stuff masks that are resized likewise.

    Args:
        img_metas (list[dict]): Image metas; ``img_shape`` is read per image
            when ``gt_semantic_seg`` is given.
        gt_masks (list): Per-image mask containers exposing ``to_tensor``,
            ``width`` and ``height``.
        gt_labels (list[Tensor]): Only ``gt_labels[0].device`` is used.
        gt_semantic_seg (Tensor | None): Batched semantic maps, modified
            in place.

    Returns:
        tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``; the last two
        are ``None`` when ``gt_semantic_seg`` is ``None`` and at least one
        image was processed.
    """
    # batch_input_shape should be the same across images
    canvas_h, canvas_w = img_metas[0]['batch_input_shape']
    assign_size = (canvas_h // self.mask_assign_stride,
                   canvas_w // self.mask_assign_stride)

    def _empty_like(reference):
        # Zero target with as many rows as the instance mask tensor.
        return reference.new_zeros((reference.size(0),) + assign_size)

    def _resize(stack):
        # Bilinear resize of an (N, H, W) stack to the assignment size.
        return F.interpolate(
            stack[None], assign_size, mode='bilinear',
            align_corners=False)[0]

    mask_targets = []
    sem_targets = []
    sem_classes = []
    for img_idx, container in enumerate(gt_masks):
        dense = container.to_tensor(torch.float, gt_labels[0].device)
        needs_pad = container.width != canvas_w or container.height != canvas_h
        if needs_pad:
            # Masks are not padded when the batch is formed.
            padding = (0, canvas_w - container.width,
                       0, canvas_h - container.height)
            dense = F.pad(dense, padding, value=0)

        if gt_semantic_seg is not None:
            # The semantic map was zero-padded when batching; mark the
            # padded rows and columns as ignore instead.
            gt_semantic_seg[
                img_idx, :,
                img_metas[img_idx]['img_shape'][0]:, :] = self.ignore_label
            gt_semantic_seg[
                img_idx, :, :,
                img_metas[img_idx]['img_shape'][1]:] = self.ignore_label

            if self.cityscapes:
                cls_ids, cls_masks = sem2ins_masks_cityscapes(
                    gt_semantic_seg[img_idx],
                    ignore_label=self.ignore_label,
                    label_shift=self.num_thing_classes)
            else:
                cls_ids, cls_masks = sem2ins_masks(
                    gt_semantic_seg[img_idx],
                    ignore_label=self.ignore_label,
                    label_shift=self.num_thing_classes,
                    thing_label_in_seg=self.thing_label_in_seg)

            sem_targets.append(
                _empty_like(dense) if cls_masks.shape[0] == 0
                else _resize(cls_masks))
            sem_classes.append(cls_ids)
        else:
            sem_targets = None
            sem_classes = None

        # downsample to 1/stride resolution
        mask_targets.append(
            _empty_like(dense) if dense.shape[0] == 0 else _resize(dense))

    return mask_targets, sem_classes, sem_targets
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes=None,
                  gt_labels=None,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  gt_semantic_seg=None,
                  gt_instance_ids=None,
                  ref_img=None,
                  ref_img_metas=None,
                  ref_gt_bboxes_ignore=None,
                  ref_gt_labels=None,
                  ref_gt_bboxes=None,
                  ref_gt_masks=None,
                  ref_gt_semantic_seg=None,
                  ref_gt_instance_ids=None,
                  proposals=None,
                  **kwargs):
    """Compute training losses for a key frame and one reference frame.

    Both frames go through the backbone, the RPN head and the RoI head;
    the track head then matches the two frames using the backbone
    features and the thing mask predictions of the RoI head.

    Args:
        img (Tensor): Key images of shape (N, C, H, W).
        img_metas (list[dict]): Key image metas; ``batch_input_shape`` is
            written into each dict.
        gt_bboxes (list[Tensor]): Forwarded to the RoI head.
        gt_labels (list[Tensor]): Class indices of the key instances.
        gt_bboxes_ignore (None | list[Tensor]): Forwarded to the RoI head.
        gt_masks (list): Key instance masks. Required.
        gt_semantic_seg (Tensor): Key semantic maps.
        gt_instance_ids (list[Tensor]): Instance ids of the key
            instances. Required.
        ref_img (Tensor): Reference images with a singleton dimension 1
            that is squeezed away, i.e. one reference frame per key frame.
        ref_img_metas (list[list[dict]]): Per sample a list whose first
            element is the reference image meta.
        ref_gt_bboxes_ignore: Forwarded to the reference RoI pass.
        ref_gt_labels (list[Tensor]): Two-column tensors; column 1 holds
            the class index.
        ref_gt_bboxes: Forwarded to the reference RoI pass.
        ref_gt_masks (list): Per sample a sequence whose first element is
            the reference mask container.
        ref_gt_semantic_seg (Tensor): Reference semantic maps with a
            singleton dimension 1.
        ref_gt_instance_ids (list[Tensor]): Two-column tensors; column 1
            holds the instance id.
        proposals: Must be ``None``; external proposals are unsupported.

    Returns:
        dict[str, Tensor]: Key-frame RoI and RPN losses, the reference
        frame's RPN and RoI losses with ``"_ref"`` appended to their keys,
        and the track loss.
    """
    batch_input_shape = tuple(img[0].size()[-2:])
    for img_meta in img_metas:
        img_meta['batch_input_shape'] = batch_input_shape

    assert proposals is None, 'KNet does not support' \
                              ' external proposals'
    assert gt_masks is not None
    assert gt_instance_ids is not None

    # preprocess the reference images
    ref_img = ref_img.squeeze(1)  # (b,3,h,w)

    ref_masks_gt = [ref_gt_mask[0] for ref_gt_mask in ref_gt_masks]
    ref_labels_gt = [
        ref_gt_label[:, 1].long() for ref_gt_label in ref_gt_labels
    ]
    ref_gt_labels = ref_labels_gt
    ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)
    ref_gt_instance_id_list = [
        ref_gt_instance_id[:, 1].long()
        for ref_gt_instance_id in ref_gt_instance_ids
    ]

    ref_img_metas_new = []
    for ref_img_meta in ref_img_metas:
        ref_img_meta[0]['batch_input_shape'] = batch_input_shape
        ref_img_metas_new.append(ref_img_meta[0])

    # For every key instance: 1-based position of the same instance id in
    # the reference frame, or 0 when it does not appear there.
    gt_pids_list = []
    for idx, ref_instance_ids in enumerate(ref_gt_instance_id_list):
        ref_ids = ref_instance_ids.tolist()
        gt_ids = gt_instance_ids[idx].tolist()
        gt_pids = [
            ref_ids.index(inst_id) + 1 if inst_id in ref_ids else 0
            for inst_id in gt_ids
        ]
        gt_pids_list.append(torch.LongTensor([gt_pids]).to(img.device)[0])
    gt_pids = gt_pids_list

    # gt_masks and gt_semantic_seg are not padded when forming batch
    gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(
        img_metas, gt_masks, gt_labels, gt_semantic_seg)
    ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(
        ref_img_metas_new, ref_masks_gt, ref_labels_gt, ref_semantic_seg_gt)

    x = self.extract_feat(img)
    x_ref = self.extract_feat(ref_img)

    rpn_results = self.rpn_head.forward_train(
        x, img_metas, gt_masks, gt_labels, gt_sem_seg, gt_sem_cls)
    ref_rpn_results = self.rpn_head.forward_train(
        x_ref, ref_img_metas_new, ref_gt_masks, ref_labels_gt,
        ref_gt_sem_seg, ref_gt_sem_cls)

    (rpn_losses, proposal_feats, x_feats, mask_preds,
     cls_scores) = rpn_results
    (ref_rpn_losses, ref_proposal_feats, ref_x_feats, ref_mask_preds,
     ref_cls_scores) = ref_rpn_results

    # The RoI head of this variant also returns refined mask predictions,
    # which replace the RPN masks from here on.
    losses, sample_results, object_feats, mask_preds = \
        self.roi_head.forward_train(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_pids=gt_pids,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None)

    # Pass the flattened metas (list[dict]) like every other reference
    # call, not the nested list[list[dict]] received as argument.
    ref_losses, ref_sample_results, ref_object_feats, ref_mask_preds = \
        self.roi_head.forward_train(
            ref_x_feats,
            ref_proposal_feats,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas_new,
            ref_gt_masks,
            ref_gt_labels,
            gt_bboxes=ref_gt_bboxes,
            gt_bboxes_ignore=ref_gt_bboxes_ignore,
            gt_sem_seg=ref_gt_sem_seg,
            gt_sem_cls=ref_gt_sem_cls,
            imgs_whwh=None)

    proposals_nums = [self.roi_head.num_proposals] * img.size()[0]
    ref_proposals_nums = proposals_nums

    thing_mask_preds, ref_thing_mask_preds = self.pack_things_masks(
        mask_preds, ref_mask_preds)
    match_score = self.track_head(x, x_ref, thing_mask_preds,
                                  ref_thing_mask_preds, proposals_nums,
                                  ref_proposals_nums)
    track_loss = self.track_head.loss(match_score, sample_results)

    # format the loss
    ref_rpn_losses = self.add_ref_rpn_loss(ref_rpn_losses)
    ref_losses = self.add_ref_rpn_loss(ref_losses)

    losses.update(ref_rpn_losses)
    losses.update(ref_losses)
    losses.update(track_loss)
    losses.update(rpn_losses)

    return losses
def simple_test(self, img, img_metas, rescale=False, ref_img=None):
    """Segment one frame and assign tracking ids to its thing segments.

    A frame is treated as the first of its clip when
    ``img_metas[0]['iid'] % 10000 == 1``. The first frame resets the
    tracker and initialises tracks from the current prediction; every
    later frame also runs the reference image through backbone and RPN
    and decodes it with the object queries stored by the previous call.

    NOTE(review): this method reads ``self.combine``,
    ``self.track_roi_head`` and ``self.tracker``; none of them is assigned
    by this class's ``__init__``. Confirm they are provided elsewhere
    before relying on this code path.

    Args:
        img (Tensor): Current frame, passed to :meth:`extract_feat`.
        img_metas (list[dict]): Image metas. ``img_metas[0]`` must contain
            ``'iid'`` and a ``'filename'`` that contains ``'city'``.
        rescale (bool): Forwarded to the RoI heads' ``simple_test``.
            Defaults to False.
        ref_img: Optional reference frame container; only element 0 is
            used.

    Returns:
        tuple: ``(cur_segm_results, track_maps, sseg_results)`` -- the RoI
        head results of the current frame, the tracking id map and the
        semantic map of the current frame.
    """
    if ref_img is not None:
        ref_img = ref_img[0]
    # whether this is the first frame of its clip
    assert 'city' in img_metas[0]['filename'] and 'iid' in img_metas[0]
    iid = img_metas[0]['iid']
    fid = iid % 10000
    is_first = (fid == 1)

    # for current frame
    x = self.extract_feat(img)
    rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
    (proposal_feats, x_feats, mask_preds, cls_scores,
     seg_preds) = rpn_results

    if not is_first:
        # Reference frame features; note the key frame's img_metas are
        # reused for the reference RPN pass.
        ref_x = self.extract_feat(ref_img)
        ref_rpn_results = self.rpn_head.simple_test_rpn(ref_x, img_metas)
        (ref_proposal_feats, ref_x_feats, ref_mask_preds, ref_cls_scores,
         ref_seg_preds) = ref_rpn_results
        x_fuse = self.combine(ref_x_feats + x_feats)

    cur_segm_results, cur_object_query = self.roi_head.simple_test(
        x_feats,
        proposal_feats,
        mask_preds,
        cls_scores,
        img_metas,
        imgs_whwh=None,
        rescale=rescale)

    # Only the first image of the batch is unpacked.
    bbox_result, segm_result, panoptic_result = cur_segm_results[0]
    panoptic_seg, segments_info = panoptic_result

    cur_results, sseg_results = self.pack_stuff_things_result(panoptic_seg, segments_info)

    if is_first:
        self.track_query = cur_object_query

    if not is_first:
        # Decode the fused features with the queries stored by the
        # previous call.
        track_seg_results = self.track_roi_head.simple_test(
            x_fuse,
            self.track_query,
            ref_mask_preds,
            ref_cls_scores,
            img_metas,
            imgs_whwh=None,
            rescale=rescale
        )

        bbox_result, segm_result, panoptic_result = track_seg_results[0]
        track_panoptic_seg, track_segments_info = panoptic_result
        track_results, ref_sseg_results = self.pack_stuff_things_result(track_panoptic_seg, track_segments_info)
        # update the tracking query
        self.track_query = cur_object_query

    if is_first:
        self.tracker.reset_all()
        init_track_results = self.tracker.init_track(cur_results)
        track_maps = self.generate_track_id_maps(init_track_results, panoptic_seg)
    elif not is_first:
        results = self.tracker.step(cur_results, track_results)
        track_maps = self.generate_track_id_maps(results, panoptic_seg)

    return cur_segm_results, track_maps, sseg_results
panoptic_seg) return cur_segm_results, track_maps, sseg_results def forward_dummy(self, img): """Used for computing network flops. See `mmdetection/tools/get_flops.py` """ # backbone x = self.extract_feat(img) # rpn num_imgs = len(img) dummy_img_metas = [ dict(img_shape=(800, 1333, 3)) for _ in range(num_imgs) ] rpn_results = self.rpn_head.simple_test_rpn(x, dummy_img_metas) (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = rpn_results roi_outs = self.roi_head.forward_dummy(x_feats, proposal_feats, dummy_img_metas) return roi_outs def extract_feat(self, img): """Directly extract features from the backbone+neck.""" x = self.backbone(img) if self.with_neck: x = self.neck(x) return x @property def with_rpn(self): """bool: whether the detector has RPN""" return hasattr(self, 'rpn_head') and self.rpn_head is not None @property def with_roi_head(self): """bool: whether the detector has a RoI head""" return hasattr(self, 'roi_head') and self.roi_head is not None def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs): """Test with augmentations. If rescale is False, then returned bboxes and masks will fit the scale of imgs[0]. 
def pack_stuff_things_result(self, panoptic_seg, segments_info):
    """Split a panoptic prediction into thing masks and a semantic map.

    Args:
        panoptic_seg (np.ndarray): Map of segment ids; every entry of
            ``segments_info`` is matched against it through its ``"id"``.
        segments_info (list[dict]): One dict per segment with the keys
            ``"id"``, ``"isthing"`` and ``"category_id"``; thing segments
            additionally need ``"score"``.

    Returns:
        tuple: ``(results, semantic_seg)`` where

        - ``results["masks"]`` stacks one boolean mask per thing segment,
        - ``results["scores"]`` holds the matching scores,
        - ``semantic_seg`` is a float array shaped like ``panoptic_seg``
          in which thing pixels carry ``category_id + 11`` and stuff
          pixels carry ``category_id - 1``. Pixels not covered by any
          segment stay 0.
    """
    results = {}
    masks = []
    scores = []
    semantic_seg = np.zeros(panoptic_seg.shape)
    for segment in segments_info:
        # Compute the segment's pixel mask once and reuse it.
        region = panoptic_seg == segment["id"]
        if segment['isthing']:
            masks.append(region)
            scores.append(segment["score"])
            # Things are shifted behind the stuff labels.
            semantic_seg[region] = segment["category_id"] + 11
        else:
            # Stuff category ids are one-based; make them zero-based.
            semantic_seg[region] = segment["category_id"] - 1
    results["masks"] = np.array(masks)  # (N, H, W); shape (0,) if empty
    results["scores"] = np.array(scores)  # (N,)
    return results, semantic_seg
final_id_maps ================================================ FILE: knet/video/knet_uni_track.py ================================================ import warnings import numpy as np import torch import torch.nn.functional as F from mmdet.models.builder import DETECTORS from mmdet.models.detectors import BaseDetector from mmdet.models.builder import build_head, build_neck, build_backbone from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes from unitrack.mask import MaskAssociationTracker @DETECTORS.register_module() class VideoKNetUniTrack(BaseDetector): def __init__(self, backbone, neck=None, rpn_head=None, roi_head=None, track_roi_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None, num_thing_classes=80, num_stuff_classes=53, mask_assign_stride=4, ignore_label=255, thing_label_in_seg=0, kitti_step=False, cityscapes=False, uni_tracker_cfg=None, **kwargs): super(VideoKNetUniTrack, self).__init__(init_cfg) if pretrained: warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') backbone.pretrained = pretrained self.backbone = build_backbone(backbone) if neck is not None: self.neck = build_neck(neck) if rpn_head is not None: rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None rpn_head_ = rpn_head.copy() rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) self.rpn_head = build_head(rpn_head_) if roi_head is not None: # update train and test cfg here for now # TODO: refactor assigner & sampler rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None roi_head.update(train_cfg=rcnn_train_cfg) roi_head.update(test_cfg=test_cfg.rcnn) roi_head.pretrained = pretrained self.roi_head = build_head(roi_head) self.tracker = MaskAssociationTracker(uni_tracker_cfg) self.img0 = None self.train_cfg = train_cfg self.test_cfg = test_cfg self.num_thing_classes = num_thing_classes self.num_stuff_classes = num_stuff_classes self.mask_assign_stride = mask_assign_stride 
def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
    """Pad ground-truth masks to the batch shape and downsample them.

    Args:
        img_metas (list[dict]): Per-image meta. ``'batch_input_shape'`` is
            read from the first entry; ``'img_shape'`` is read from every
            entry when ``gt_semantic_seg`` is given.
        gt_masks (list): Per-image mask containers exposing ``to_tensor``,
            ``width`` and ``height`` (presumably mmdet ``BitmapMasks`` --
            TODO confirm).
        gt_labels (list[Tensor]): Only ``gt_labels[0].device`` is used, as
            the target device for every mask tensor.
        gt_semantic_seg (Tensor | None): Indexed with four indices below,
            so assumed to be (B, C, H, W) -- TODO confirm. It is modified
            IN PLACE: the padded area is overwritten with
            ``self.ignore_label``.

    Returns:
        tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``.
        ``gt_masks_tensor`` is a list with one tensor per image at
        ``(pad_H // stride, pad_W // stride)`` resolution. When
        ``gt_semantic_seg`` is None and at least one image is processed,
        the last two entries are None; if ``gt_masks`` is empty they stay
        empty lists because the loop body never runs.
    """
    # gt_masks and gt_semantic_seg are not padded when forming batch
    gt_masks_tensor = []
    gt_sem_seg = []
    gt_sem_cls = []
    # batch_input_shape should be the same across images
    pad_H, pad_W = img_metas[0]['batch_input_shape']
    assign_H = pad_H // self.mask_assign_stride
    assign_W = pad_W // self.mask_assign_stride

    for i, gt_mask in enumerate(gt_masks):
        mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device)
        if gt_mask.width != pad_W or gt_mask.height != pad_H:
            # Zero-pad on the right and bottom up to the batch shape.
            pad_wh = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height)
            mask_tensor = F.pad(mask_tensor, pad_wh, value=0)

        if gt_semantic_seg is not None:
            # gt_semantic seg is padded by zero when forming a batch
            # need to convert them from 0 to ignore
            gt_semantic_seg[
                i, :, img_metas[i]['img_shape'][0]:, :] = self.ignore_label
            gt_semantic_seg[
                i, :, :, img_metas[i]['img_shape'][1]:] = self.ignore_label

            # Turn the semantic map into per-class masks plus their labels.
            if self.cityscapes:
                sem_labels, sem_seg = sem2ins_masks_cityscapes(
                    gt_semantic_seg[i],
                    ignore_label=self.ignore_label,
                    label_shift=self.num_thing_classes)
            else:
                sem_labels, sem_seg = sem2ins_masks(
                    gt_semantic_seg[i],
                    ignore_label=self.ignore_label,
                    label_shift=self.num_thing_classes,
                    thing_label_in_seg=self.thing_label_in_seg)

            if sem_seg.shape[0] == 0:
                # NOTE(review): the placeholder has as many rows as there
                # are instance masks (mask_tensor.size(0)), not zero rows
                # -- confirm this is intended.
                gt_sem_seg.append(
                    mask_tensor.new_zeros(
                        (mask_tensor.size(0), assign_H, assign_W)))
            else:
                gt_sem_seg.append(
                    F.interpolate(
                        sem_seg[None], (assign_H, assign_W),
                        mode='bilinear',
                        align_corners=False)[0])
            gt_sem_cls.append(sem_labels)

        else:
            # No semantic ground truth: replace the accumulators by None.
            gt_sem_seg = None
            gt_sem_cls = None
        if mask_tensor.shape[0] == 0:
            # No instances in this image: keep an empty (0, h, w) tensor.
            gt_masks_tensor.append(
                mask_tensor.new_zeros(
                    (mask_tensor.size(0), assign_H, assign_W)))
        else:
            gt_masks_tensor.append(
                F.interpolate(
                    mask_tensor[None], (assign_H, assign_W),  # downsample by mask_assign_stride
                    mode='bilinear',
                    align_corners=False)[0])

    return gt_masks_tensor, gt_sem_cls, gt_sem_seg
ref_gt_bboxes (list[Tensor]): The list only has one Tensor. The Tensor contains ground truth bboxes for each reference image with shape (num_all_ref_gts, 5) in [ref_img_id, tl_x, tl_y, br_x, br_y] format. The ref_img_id start from 0, and denotes the id of reference image for each key image. ref_gt_labels (list[Tensor]): The list only has one Tensor. The Tensor contains class indices corresponding to each reference box with shape (num_all_ref_gts, 2) in [ref_img_id, class_indice]. Returns: dict[str, Tensor]: a dictionary of loss components """ batch_input_shape = tuple(img[0].size()[-2:]) for img_meta in img_metas: img_meta['batch_input_shape'] = batch_input_shape assert proposals is None, 'KNet does not support' \ ' external proposals' assert gt_masks is not None # preprocess the reference images ref_img = ref_img.squeeze(1) # (b,3,h,w) ref_masks_gt = [] for ref_gt_mask in ref_gt_masks: ref_masks_gt.append(ref_gt_mask[0]) ref_labels_gt = [] for ref_gt_label in ref_gt_labels: ref_labels_gt.append(ref_gt_label[:, 1].long()) ref_gt_labels = ref_labels_gt ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1) ref_img_metas_new = [] for ref_img_meta in ref_img_metas: ref_img_meta[0]['batch_input_shape'] = batch_input_shape ref_img_metas_new.append(ref_img_meta[0]) # gt_masks and gt_semantic_seg are not padded when forming batch gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(img_metas, gt_masks, gt_labels, gt_semantic_seg) ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(ref_img_metas_new, ref_masks_gt, ref_labels_gt, ref_semantic_seg_gt) x = self.extract_feat(img) x_ref = self.extract_feat(ref_img) rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks, gt_labels, gt_sem_seg, gt_sem_cls) ref_rpn_results = self.rpn_head.forward_train(x_ref, ref_img_metas_new, ref_gt_masks, ref_labels_gt, ref_gt_sem_seg, ref_gt_sem_cls) (rpn_losses, proposal_feats, x_feats, mask_preds, cls_scores) = rpn_results (ref_rpn_losses, 
def simple_test(self, img, img_metas, rescale=False, ref_img=None):
    """Segment one frame and associate its thing masks with the tracker.

    Args:
        img (torch.Tensor): Current frame batch; fed to the backbone and
            handed to ``self.tracker.update``.
        img_metas (list[dict]): Image meta. The first entry must hold a
            ``'filename'`` containing ``'city'`` and an integer ``'iid'``
            whose value modulo 10000 is the frame index inside the clip.
        rescale (bool): Unused; kept for interface compatibility.
        ref_img: Unused; kept for interface compatibility.

    Returns:
        tuple: ``(semantic_map, track_maps, None, vis_sem, vis_tracker)``

        - ``semantic_map``: output of ``self.get_semantic_seg``,
        - ``track_maps``: per-pixel track ids, all zeros when the frame
          contains no thing mask,
        - ``None``: placeholder slot,
        - ``vis_sem`` / ``vis_tracker``: colourised renderings produced by
          ``scripts.visualizer``.

    Raises:
        AssertionError: If the filename does not contain ``'city'`` or
            ``'iid'`` is missing from the meta dict.
    """
    # Frame index 1 marks the first frame of a clip.
    assert 'city' in img_metas[0]['filename'] and 'iid' in img_metas[0]
    is_first = img_metas[0]['iid'] % 10000 == 1

    # Single-frame panoptic prediction for the current frame.
    x = self.extract_feat(img)
    rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
    proposal_feats, x_feats, mask_preds, cls_scores, _ = rpn_results

    # The RoI head returns a 5-tuple; only the packed results are used.
    cur_segm_results, _, _, _, _ = self.roi_head.simple_test(
        x_feats, proposal_feats, mask_preds, cls_scores, img_metas)
    _, _, _, panoptic_result = cur_segm_results[0]
    panoptic_seg, segments_info = panoptic_result
    cur_results, _ = self.pack_stuff_things_result(panoptic_seg,
                                                   segments_info)
    thing_masks = cur_results["masks"]

    if is_first:
        # A new clip starts: remember its first frame and drop old tracks.
        self.img0 = img
        self.tracker.reset_all()

    if len(thing_masks) == 0:
        # Nothing to track in this frame.
        track_maps = np.zeros(panoptic_seg.shape)
    else:
        tracks = self.tracker.update(img, self.img0, thing_masks)
        track_maps = self.generate_track_id_maps(tracks, panoptic_seg)

    semantic_map = self.get_semantic_seg(panoptic_seg, segments_info)

    # Imported here, as in the original, so the visualizer is only
    # required when inference actually runs.
    from scripts.visualizer import trackmap2rgb, cityscapes_cat2rgb
    vis_tracker = trackmap2rgb(track_maps)
    vis_sem = cityscapes_cat2rgb(semantic_map)

    return semantic_map, track_maps, None, vis_sem, vis_tracker
def extract_feat(self, img):
    """Compute features with the backbone, then the neck when there is one."""
    feats = self.backbone(img)
    if not self.with_neck:
        return feats
    return self.neck(feats)
def get_semantic_seg(self, panoptic_seg, segments_info):
    """Render a panoptic prediction as a semantic label map.

    Args:
        panoptic_seg (np.ndarray): Map of segment ids.
        segments_info (list[dict]): One dict per segment with the keys
            ``"id"``, ``"isthing"`` and ``"category_id"``.

    Returns:
        np.ndarray: Float array shaped like ``panoptic_seg``. Pixels that
        belong to no listed segment stay 0.

        Without ``self.kitti_step``: things get ``category_id + 11`` and
        stuff gets ``category_id - 1``.

        With ``self.kitti_step``: thing category ``k`` maps to the k-th
        entry of ``[11, 13]``; stuff ids are made zero-based and then
        shifted upwards so that they skip the labels 11 and 13.
    """
    # Label used for each thing category index in the kitti_step layout.
    kitti_step2cityscpaes = [11, 13]
    semantic_seg = np.zeros(panoptic_seg.shape)
    for segment in segments_info:
        category = segment["category_id"]
        if segment['isthing']:
            if self.kitti_step:
                label = kitti_step2cityscpaes[category]
            else:
                label = category + 11
        elif self.kitti_step:
            # Stuff ids are one-based; make them zero-based first.
            label = category - 1
            # Step over every label reserved for a thing class.
            offset = 0
            for thing_id in kitti_step2cityscpaes:
                if label + offset >= thing_id:
                    offset += 1
            label += offset
        else:
            # for stuff (0 .. n-1)
            label = category - 1
        semantic_seg[panoptic_seg == segment["id"]] = label
    return semantic_seg
@MATCH_COST.register_module()
class MaskCost(object):
    """Pixel-agreement matching cost between predicted and target masks.

    Args:
        weight (int | float, optional): Factor applied to the cost.
        pred_act (bool): Whether to activate the prediction before the
            cost is computed.
        act_mode (str): ``'sigmoid'`` applies a sigmoid; any other value
            applies a softmax over dim 0 (only used if ``pred_act``).
    """

    def __init__(self, weight=1., pred_act=False, act_mode='sigmoid'):
        self.weight = weight
        self.pred_act = pred_act
        self.act_mode = act_mode

    def __call__(self, cls_pred, target):
        """Compute the pairwise mask cost.

        Args:
            cls_pred (Tensor): Predicted masks, shape [num_query, H, W].
            target (Tensor): Target masks, shape [num_gt, H, W].

        Returns:
            torch.Tensor: Weighted cost of shape [num_query, num_gt]. Each
            entry is ``-(sum(p * t) + sum((1 - p) * (1 - t))) / (H * W)``
            times ``weight``, so better agreement gives a lower cost.
        """
        if self.pred_act and self.act_mode == 'sigmoid':
            cls_pred = cls_pred.sigmoid()
        elif self.pred_act:
            cls_pred = cls_pred.softmax(dim=0)

        # Unpacking also rejects targets that are not 3-D.
        _, H, W = target.shape
        # einsum computes all query/gt pairs without flattening first
        pos_cost = torch.einsum('nhw,mhw->nm', cls_pred, target)
        neg_cost = torch.einsum('nhw,mhw->nm', 1 - cls_pred, 1 - target)
        cls_cost = -(pos_cost + neg_cost) / (H * W)
        return cls_cost * self.weight
Thus each query prediction will be assigned with `0` or a positive integer indicating the ground truth index: - 0: negative sample, no assigned gt - positive integer: positive sample, index (1-based) of assigned gt Args: cls_weight (int | float, optional): The scale factor for classification cost. Default 1.0. bbox_weight (int | float, optional): The scale factor for regression L1 cost. Default 1.0. iou_weight (int | float, optional): The scale factor for regression iou cost. Default 1.0. iou_calculator (dict | optional): The config for the iou calculation. Default type `BboxOverlaps2D`. iou_mode (str | optional): "iou" (intersection over union), "iof" (intersection over foreground), or "giou" (generalized intersection over union). Default "giou". """ def __init__(self, cls_cost=dict(type='ClassificationCost', weight=1.), mask_cost=dict(type='SigmoidCost', weight=1.0), dice_cost=dict(), boundary_cost=None, topk=1): self.cls_cost = build_match_cost(cls_cost) self.mask_cost = build_match_cost(mask_cost) self.dice_cost = build_match_cost(dice_cost) if boundary_cost is not None: self.boundary_cost = build_match_cost(boundary_cost) else: self.boundary_cost = None self.topk = topk def assign(self, bbox_pred, cls_pred, gt_bboxes, gt_labels, img_meta=None, gt_bboxes_ignore=None, eps=1e-7): """Computes one-to-one matching based on the weighted costs. This method assign each query prediction to a ground truth or background. The `assigned_gt_inds` with -1 means don't care, 0 means negative sample, and positive number is the index (1-based) of assigned gt. The assignment is done in the following steps, the order matters. 1. assign every prediction to -1 2. compute the weighted costs 3. do Hungarian matching on CPU based on the costs 4. assign all to 0 (background) first, then for each matched pair between predictions and gts, treat this prediction as foreground and assign the corresponding gt index (plus 1) to it. 
Args: bbox_pred (Tensor): Predicted boxes with normalized coordinates (cx, cy, w, h), which are all in range [0, 1]. Shape [num_query, 4]. cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. gt_bboxes (Tensor): Ground truth boxes with unnormalized coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). img_meta (dict): Meta information for current image. gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are labelled as `ignored`. Default None. eps (int | float, optional): A value added to the denominator for numerical stability. Default 1e-7. Returns: :obj:`AssignResult`: The assigned result. """ assert gt_bboxes_ignore is None, \ 'Only case when gt_bboxes_ignore is None is supported.' num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) # 1. assign -1 by default assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long) assigned_labels = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long) if num_gts == 0 or num_bboxes == 0: # No ground truth or boxes, return empty assignment if num_gts == 0: # No ground truth, assign all to background assigned_gt_inds[:] = 0 return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels) # 2. compute the weighted costs # classification and bboxcost. if self.cls_cost.weight != 0 and cls_pred is not None: cls_cost = self.cls_cost(cls_pred, gt_labels) else: cls_cost = 0 if self.mask_cost.weight != 0: reg_cost = self.mask_cost(bbox_pred, gt_bboxes) else: reg_cost = 0 if self.dice_cost.weight != 0: dice_cost = self.dice_cost(bbox_pred, gt_bboxes) else: dice_cost = 0 if self.boundary_cost is not None and self.boundary_cost.weight != 0: b_cost = self.boundary_cost(bbox_pred, gt_bboxes) else: b_cost = 0 cost = cls_cost + reg_cost + dice_cost + b_cost # 3. 
@BBOX_ASSIGNERS.register_module()
class MaskHungarianAssignerWithEmbed(BaseAssigner):
    """Computes one-to-one matching between predictions and ground truth.

    The matching cost is the sum of up to four components: a
    classification cost, a mask cost, a dice cost and an optional
    boundary cost. A component is skipped when its weight is 0.

    The targets don't include the no_object, so generally there are more
    predictions than targets. After the one-to-one matching, the
    un-matched are treated as backgrounds. Thus each query prediction
    will be assigned with `0` or a positive integer indicating the
    ground truth index:

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        cls_cost (dict): Config of the classification cost.
        mask_cost (dict): Config of the mask cost.
        dice_cost (dict): Config of the dice cost.
            NOTE(review): the default is an empty dict without a 'type'
            key; presumably callers always pass one -- confirm.
        boundary_cost (dict | None): Config of the boundary cost; the
            component is disabled when None.
        topk (int): Number of matching rounds. With a value other than 1
            the matching is repeated and rows matched in earlier rounds
            are excluded by raising their cost to 1e10.
    """

    def __init__(self,
                 cls_cost=dict(type='ClassificationCost', weight=1.),
                 mask_cost=dict(type='SigmoidCost', weight=1.0),
                 dice_cost=dict(),
                 boundary_cost=None,
                 topk=1):
        self.cls_cost = build_match_cost(cls_cost)
        self.mask_cost = build_match_cost(mask_cost)
        self.dice_cost = build_match_cost(dice_cost)
        if boundary_cost is not None:
            self.boundary_cost = build_match_cost(boundary_cost)
        else:
            self.boundary_cost = None
        self.topk = topk

    def assign(self,
               bbox_pred,
               cls_pred,
               gt_bboxes,
               gt_labels,
               embed_pred=None,
               img_meta=None,
               gt_bboxes_ignore=None,
               eps=1e-7):
        """Computes one-to-one matching based on the weighted costs.

        This method assign each query prediction to a ground truth or
        background. The `assigned_gt_inds` with -1 means don't care,
        0 means negative sample, and positive number is the index (1-based)
        of assigned gt.
        The assignment is done in the following steps, the order matters.

        1. assign every prediction to -1
        2. compute the weighted costs
        3. do Hungarian matching on CPU based on the costs
        4. assign all to 0 (background) first, then for each matched pair
           between predictions and gts, treat this prediction as foreground
           and assign the corresponding gt index (plus 1) to it.

        Args:
            bbox_pred (Tensor): Predictions, one row per query. Despite
                the name they are handed to the mask, dice and boundary
                costs, so presumably mask predictions -- TODO confirm.
            cls_pred (Tensor | None): Classification predictions; the
                classification cost is skipped when None.
            gt_bboxes (Tensor): Targets matched against `bbox_pred`, one
                row per ground truth (presumably masks -- TODO confirm).
            gt_labels (Tensor): Label of every ground truth.
            embed_pred: Not used by this method.
            img_meta (dict): Not used by this method.
            gt_bboxes_ignore (Tensor, optional): Must be None.
            eps (int | float, optional): Not used by this method.

        Returns:
            :obj:`AssignResult`: The assigned result.

        Raises:
            ImportError: If scipy is not installed.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)

        # 1. assign -1 by default
        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
                                              -1,
                                              dtype=torch.long)
        assigned_labels = bbox_pred.new_full((num_bboxes, ),
                                             -1,
                                             dtype=torch.long)
        if num_gts == 0 or num_bboxes == 0:
            # No ground truth or predictions, return empty assignment
            if num_gts == 0:
                # No ground truth, assign all to background
                assigned_gt_inds[:] = 0
            return AssignResult(
                num_gts, assigned_gt_inds, None, labels=assigned_labels)

        # 2. compute the weighted costs
        # NOTE(review): if every component is disabled, `cost` stays the
        # int 0 and `.detach()` below fails -- confirm at least one
        # weight is always non-zero.
        if self.cls_cost.weight != 0 and cls_pred is not None:
            cls_cost = self.cls_cost(cls_pred, gt_labels)
        else:
            cls_cost = 0
        if self.mask_cost.weight != 0:
            reg_cost = self.mask_cost(bbox_pred, gt_bboxes)
        else:
            reg_cost = 0
        if self.dice_cost.weight != 0:
            dice_cost = self.dice_cost(bbox_pred, gt_bboxes)
        else:
            dice_cost = 0
        if self.boundary_cost is not None and self.boundary_cost.weight != 0:
            b_cost = self.boundary_cost(bbox_pred, gt_bboxes)
        else:
            b_cost = 0
        cost = cls_cost + reg_cost + dice_cost + b_cost

        # 3. do Hungarian matching on CPU using linear_sum_assignment
        cost = cost.detach().cpu()
        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')
        if self.topk == 1:
            matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
        else:
            topk_matched_row_inds = []
            topk_matched_col_inds = []
            for i in range(self.topk):
                matched_row_inds, matched_col_inds = linear_sum_assignment(
                    cost)
                topk_matched_row_inds.append(matched_row_inds)
                topk_matched_col_inds.append(matched_col_inds)
                # Exclude the rows matched in this round from later rounds.
                cost[matched_row_inds] = 1e10
            matched_row_inds = np.concatenate(topk_matched_row_inds)
            matched_col_inds = np.concatenate(topk_matched_col_inds)

        matched_row_inds = torch.from_numpy(matched_row_inds).to(
            bbox_pred.device)
        matched_col_inds = torch.from_numpy(matched_col_inds).to(
            bbox_pred.device)

        # 4. assign backgrounds and foregrounds
        # assign all indices to backgrounds first
        assigned_gt_inds[:] = 0
        # assign foregrounds based on matching results
        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
        return AssignResult(
            num_gts, assigned_gt_inds, None, labels=assigned_labels)
def build_tracker(cfg):
    """Instantiate a tracker from ``cfg`` through the ``TRACKERS`` registry."""
    tracker = build(cfg, TRACKERS)
    return tracker
@LOSSES.register_module()
class L2Loss(nn.Module):
    """Squared-error loss with optional margins and negative sub-sampling.

    Args:
        neg_pos_ub (int | float, optional): Upper bound on the ratio of
            negative to positive targets. When it is positive and
            ``num_neg / (num_pos + 1)`` exceeds it, only
            ``int(num_pos * neg_pos_ub)`` negatives keep a non-zero weight.
            Defaults to -1 (disabled).
        pos_margin (float, optional): When positive, subtracted from the
            predictions at positive targets. Defaults to -1 (disabled).
        neg_margin (float, optional): When positive, subtracted from the
            predictions at negative targets. Defaults to -1 (disabled).
        hard_mining (bool, optional): Keep the negatives with the largest
            loss instead of a random subset. Defaults to False.
        reduction (str, optional): The method to reduce the loss.
            Options are "none", "mean" and "sum".
        loss_weight (float, optional): The weight of loss.
    """

    def __init__(self,
                 neg_pos_ub=-1,
                 pos_margin=-1,
                 neg_margin=-1,
                 hard_mining=False,
                 reduction='mean',
                 loss_weight=1.0):
        super(L2Loss, self).__init__()
        self.neg_pos_ub = neg_pos_ub
        self.pos_margin = pos_margin
        self.neg_margin = neg_margin
        self.hard_mining = hard_mining
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Args:
            pred (torch.Tensor): The prediction.
            target (torch.Tensor): The learning target of the prediction.
            weight (torch.Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to
                average the loss. It is recomputed by ``update_weight``.
                Defaults to None.
            reduction_override (str, optional): The reduction method used to
                override the original reduction method of the loss.
                Defaults to None.

        Returns:
            torch.Tensor: The weighted, reduced loss.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        pred, weight, avg_factor = self.update_weight(pred, target, weight,
                                                      avg_factor)
        loss_bbox = self.loss_weight * l2_loss(
            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
        return loss_bbox

    def update_weight(self, pred, target, weight, avg_factor):
        """Apply margins and sub-sample negatives through the weights.

        Note:
            ``target`` is modified in place (entries whose weight is
            ``<= 0`` are set to -1), ``pred`` is modified in place where a
            margin applies, and a caller-supplied ``weight`` is modified in
            place for the negatives that are dropped.

        Args:
            pred (torch.Tensor): The prediction; indexed as a 2-D tensor.
            target (torch.Tensor): Targets with 1 for positives and 0 for
                negatives.
            weight (torch.Tensor | None): Element-wise weights. ``None``
                means all ones.
            avg_factor: Ignored; the returned value is always recomputed.

        Returns:
            tuple: ``(pred, weight, avg_factor)`` where ``pred`` is clamped
            to ``[0, 1]`` and ``avg_factor`` is the number of entries with
            positive weight.
        """
        if weight is None:
            weight = target.new_ones(target.size())
        invalid_inds = weight <= 0
        target[invalid_inds] = -1
        pos_inds = target == 1
        neg_inds = target == 0

        if self.pos_margin > 0:
            pred[pos_inds] -= self.pos_margin
        if self.neg_margin > 0:
            pred[neg_inds] -= self.neg_margin
        pred = torch.clamp(pred, min=0, max=1)

        num_pos = int(pos_inds.sum())
        num_neg = int(neg_inds.sum())
        if self.neg_pos_ub > 0 and num_neg / (num_pos + 1) > self.neg_pos_ub:
            # ``topk`` and slicing need an integer count; a float ratio
            # such as 1.5 would otherwise produce a float here.
            num_neg = int(num_pos * self.neg_pos_ub)
            neg_idx = torch.nonzero(neg_inds, as_tuple=False)

            if self.hard_mining:
                costs = l2_loss(
                    pred, target, reduction='none')[neg_idx[:, 0],
                                                    neg_idx[:, 1]].detach()
                neg_idx = neg_idx[costs.topk(num_neg)[1], :]
            else:
                neg_idx = self.random_choice(neg_idx, num_neg)

            new_neg_inds = neg_inds.new_zeros(neg_inds.size()).bool()
            new_neg_inds[neg_idx[:, 0], neg_idx[:, 1]] = True

            # Negatives that were not selected get a zero weight.
            invalid_neg_inds = torch.logical_xor(neg_inds, new_neg_inds)
            weight[invalid_neg_inds] = 0

        avg_factor = (weight > 0).sum()
        return pred, weight, avg_factor

    @staticmethod
    def random_choice(gallery, num):
        """Randomly select ``num`` elements from the gallery.

        It seems that Pytorch's implementation is slower than numpy so we
        use numpy to randperm the indices.

        Args:
            gallery (list | np.ndarray | torch.Tensor): Candidates.
            num (int): Number of elements to draw; must not exceed
                ``len(gallery)``.

        Returns:
            np.ndarray | torch.Tensor: The selected elements.
        """
        assert len(gallery) >= num
        if isinstance(gallery, list):
            gallery = np.array(gallery)
        cands = np.arange(len(gallery))
        np.random.shuffle(cands)
        rand_inds = cands[:num]
        if not isinstance(gallery, np.ndarray):
            rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device)
        return gallery[rand_inds]
def cal_similarity(key_embeds,
                   ref_embeds,
                   method='dot_product',
                   temperature=-1):
    """Compute the pairwise similarity between two sets of embeddings.

    Args:
        key_embeds (torch.Tensor): Embeddings of shape (N, C).
        ref_embeds (torch.Tensor): Embeddings of shape (M, C).
        method (str): ``'dot_product'`` or ``'cosine'``.
        temperature (float): Only used by ``'dot_product'``. When positive,
            the result is the cosine similarity divided by the temperature;
            otherwise the plain dot product is returned.

    Returns:
        torch.Tensor: Similarity matrix of shape (N, M). When either input
        is empty, an all-zero matrix with the dtype and device of
        ``key_embeds`` is returned.
    """
    assert method in ['dot_product', 'cosine']

    num_keys, num_refs = key_embeds.size(0), ref_embeds.size(0)
    if num_keys == 0 or num_refs == 0:
        # new_zeros keeps dtype and device consistent with the
        # non-empty branches below.
        return key_embeds.new_zeros((num_keys, num_refs))

    if method == 'cosine':
        key_embeds = F.normalize(key_embeds, p=2, dim=1)
        ref_embeds = F.normalize(ref_embeds, p=2, dim=1)
        return torch.mm(key_embeds, ref_embeds.t())

    # method == 'dot_product'
    if temperature > 0:
        dists = cal_similarity(key_embeds, ref_embeds, method='cosine')
        dists /= temperature
        return dists
    return torch.mm(key_embeds, ref_embeds.t())
def update_memo(self, ids, bboxes, embeds, labels, frame_id):
    """Refresh the tracklet memory and the backdrop buffer for one frame.

    Detections with ``ids > -1`` create or update tracklets, detections
    with ``ids == -1`` are stored as backdrops after duplicate removal.
    Tracklets not seen for ``memo_tracklet_frames`` frames are dropped and
    the backdrop buffer is capped at ``memo_backdrop_frames`` entries.

    Args:
        ids (Tensor): Track id of every detection.
        bboxes (Tensor): Detections; the last column is excluded when
            IoUs are computed.
        embeds (Tensor): Embedding of every detection.
        labels (Tensor): Class label of every detection.
        frame_id (int): Index of the current frame.
    """
    is_tracked = ids > -1
    tracked = zip(ids[is_tracked], bboxes[is_tracked], embeds[is_tracked],
                  labels[is_tracked])

    # update memo
    for raw_id, bbox, embed, label in tracked:
        track_id = int(raw_id)
        if track_id not in self.tracklets:
            self.tracklets[track_id] = dict(
                bbox=bbox,
                embed=embed,
                label=label,
                last_frame=frame_id,
                velocity=torch.zeros_like(bbox),
                acc_frame=0)
            continue

        track = self.tracklets[track_id]
        # Velocity must be derived before the stored bbox / last_frame
        # are overwritten.
        frame_gap = frame_id - track['last_frame']
        velocity = (bbox - track['bbox']) / frame_gap
        track['bbox'] = bbox
        track['embed'] = (
            1 - self.memo_momentum
        ) * track['embed'] + self.memo_momentum * embed
        track['last_frame'] = frame_id
        track['label'] = label
        # Running mean of the velocity over the accumulated frames.
        seen = track['acc_frame']
        track['velocity'] = (track['velocity'] * seen + velocity) / (seen + 1)
        track['acc_frame'] = seen + 1

    backdrop_inds = torch.nonzero(ids == -1, as_tuple=False).squeeze(1)
    ious = bbox_overlaps(bboxes[backdrop_inds, :-1], bboxes[:, :-1])
    for row, box_idx in enumerate(backdrop_inds):
        # Drop a backdrop that overlaps any detection listed before it.
        overlaps_earlier = ious[row, :box_idx] > self.nms_backdrop_iou_thr
        if overlaps_earlier.any():
            backdrop_inds[row] = -1
    backdrop_inds = backdrop_inds[backdrop_inds > -1]

    newest_backdrop = dict(
        bboxes=bboxes[backdrop_inds],
        embeds=embeds[backdrop_inds],
        labels=labels[backdrop_inds])
    self.backdrops.insert(0, newest_backdrop)

    # pop memo
    stale_ids = [
        track_id for track_id, track in self.tracklets.items()
        if frame_id - track['last_frame'] >= self.memo_tracklet_frames
    ]
    for track_id in stale_ids:
        self.tracklets.pop(track_id)

    if len(self.backdrops) > self.memo_backdrop_frames:
        self.backdrops.pop()
bboxes.size(0)): thr = self.nms_backdrop_iou_thr if bboxes[ i, -1] < self.obj_score_thr else self.nms_class_iou_thr if (ious[i, :i] > thr).any(): valids[i] = 0 valids = valids == 1 bboxes = bboxes[valids, :] labels = labels[valids] embeds = embeds[valids, :] # init ids container ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long) # match if buffer is not empty if bboxes.size(0) > 0 and not self.empty: (memo_bboxes, memo_labels, memo_embeds, memo_ids, memo_vs) = self.memo if self.match_metric == 'bisoftmax': feats = torch.mm(embeds, memo_embeds.t()) d2t_scores = feats.softmax(dim=1) t2d_scores = feats.softmax(dim=0) scores = (d2t_scores + t2d_scores) / 2 elif self.match_metric == 'softmax': feats = torch.mm(embeds, memo_embeds.t()) scores = feats.softmax(dim=1) elif self.match_metric == 'cosine': scores = torch.mm( F.normalize(embeds, p=2, dim=1), F.normalize(memo_embeds, p=2, dim=1).t()) else: raise NotImplementedError if self.with_cats: cat_same = labels.view(-1, 1) == memo_labels.view(1, -1) scores *= cat_same.float().to(scores.device) for i in range(bboxes.size(0)): conf, memo_ind = torch.max(scores[i, :], dim=0) id = memo_ids[memo_ind] if conf > self.match_score_thr: if id > -1: if bboxes[i, -1] > self.obj_score_thr: ids[i] = id scores[:i, memo_ind] = 0 scores[i + 1:, memo_ind] = 0 else: if conf > self.nms_conf_thr: ids[i] = -2 new_inds = (ids == -1) & (bboxes[:, 4] > self.init_score_thr).cpu() num_news = new_inds.sum() ids[new_inds] = torch.arange( self.num_tracklets, self.num_tracklets + num_news, dtype=torch.long) self.num_tracklets += num_news self.update_memo(ids, bboxes, embeds, labels, frame_id) return bboxes, labels, ids ================================================ FILE: knet/video/qdtrack/trackers/tao_tracker.py ================================================ import os import random from collections import defaultdict import cv2 import mmcv import numpy as np import seaborn as sns import torch from mmcv.image import imread, imwrite from 
def update_memo(self, ids, bboxes, labels, embeds, frame_id):
    """Store the detections of one frame in the tracklet memory.

    Detections with ``ids > -1`` extend an existing tracklet or start a
    new one; the tracklet embedding is a moving average weighted by
    ``momentum_embed``. Tracklets whose last frame is at least
    ``memo_frames`` frames old are removed afterwards.

    Args:
        ids (Tensor): Track id of every detection.
        bboxes (Tensor): Detections.
        labels (Tensor): Class label of every detection.
        embeds (Tensor): Embedding of every detection.
        frame_id (int): Index of the current frame.
    """
    is_tracked = ids > -1
    tracked = zip(ids[is_tracked], bboxes[is_tracked], embeds[is_tracked],
                  labels[is_tracked])

    # update memo
    for raw_id, bbox, embed, label in tracked:
        track_id = int(raw_id)
        track = self.tracklets.get(track_id)
        if track is None:
            self.tracklets[track_id] = dict(
                bboxes=[bbox],
                labels=[label],
                embeds=embed,
                frame_ids=[frame_id])
            continue
        track['bboxes'].append(bbox)
        track['labels'].append(label)
        track['embeds'] = (
            1 - self.momentum_embed
        ) * track['embeds'] + self.momentum_embed * embed
        track['frame_ids'].append(frame_id)

    # pop memo
    stale_ids = [
        track_id for track_id, track in self.tracklets.items()
        if frame_id - track['frame_ids'][-1] >= self.memo_frames
    ]
    for track_id in stale_ids:
        self.tracklets.pop(track_id)
method='dot_product', temperature=temperature) cat_same = labels.view(-1, 1) == memo_labels.view(1, -1) exps = torch.exp(sims) * cat_same.to(sims.device) d2t_scores = exps / (exps.sum(dim=1).view(-1, 1) + 1e-6) t2d_scores = exps / (exps.sum(dim=0).view(1, -1) + 1e-6) cos_scores = cal_similarity( embeds, memo_embeds, method='cosine') cos_scores *= cat_same.to(cos_scores.device) scores = (d2t_scores + t2d_scores) / 2 if self.match_with_cosine: scores = (scores + cos_scores) / 2 elif self.match_metric == 'cosine': cos_scores = cal_similarity( embeds, memo_embeds, method='cosine') cat_same = labels.view(-1, 1) == memo_labels.view(1, -1) scores = cos_scores * cat_same.float().to(cos_scores.device) else: raise NotImplementedError() if 'metas' in kwargs: raw_scores = scores.clone() obj_score_diffs = torch.abs( bboxes[:, -1].view(-1, 1).expand_as(scores) - memo_bboxes[:, -1].view(1, -1).expand_as(scores)) num_objs = bboxes.size(0) ids = torch.full((num_objs, ), -1, dtype=torch.long) for i in range(num_objs): if bboxes[i, -1] < self.obj_score_thr: continue conf, memo_ind = torch.max(scores[i, :], dim=0) obj_score_diff = obj_score_diffs[i, memo_ind] if (conf > self.match_score_thr) and (obj_score_diff < self.obj_score_diff_thr): ids[i] = memo_ids[memo_ind] scores[:i, memo_ind] = 0 scores[i + 1:, memo_ind] = 0 m = self.momentum_obj_score bboxes[i, -1] = m * bboxes[i, -1] + ( 1 - m) * memo_bboxes[memo_ind, -1] else: ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long) # init tracklets ids = self.init_tracklets(ids, bboxes[:, -1]) self.update_memo(ids, bboxes, labels, embeds, frame_id) # ---------------- if 'metas' in kwargs and kwargs['metas'].analyze: metas = kwargs['metas'] gt_bboxes, gt_labels, gt_ids = [ metas['bboxes'], metas['labels'], metas['instance_ids'] ] gt_bboxes = torch.cat( (gt_bboxes, torch.zeros(gt_bboxes.size(0), 1)), dim=1) if bboxes.size(0) == 0 or gt_bboxes.size(0) == 0: return bboxes, labels, ids fns = torch.ones(gt_bboxes.size(0), dtype=torch.long) 
fps = torch.ones(bboxes.size(0), dtype=torch.long) sw_fps = torch.zeros(bboxes.size(0), dtype=torch.long) idsw = torch.zeros(bboxes.size(0), dtype=torch.long) ious = bbox_overlaps(bboxes[:, :4], gt_bboxes[:, :4]) same_cat = labels.view(-1, 1) == gt_labels.view(1, -1) ious *= same_cat.float().to(ious.device) gt_inds = torch.full(ids.size(), -1, dtype=torch.long) for i, bbox in enumerate(bboxes): max_iou, j = ious[i].max(dim=0) if max_iou > 0.5: fps[i], fns[j] = 0, 0 gt_inds[i] = j ious[:, j] = -1 gt_id = int(gt_ids[j]) pred_id = int(ids[i]) if len(self.gt_tracks[gt_id]['ids']) > 0: if pred_id != self.gt_tracks[gt_id]['ids'][-1]: idsw[i] = 1 else: if pred_id in self.pred_tracks: idsw[i] = 1 self.gt_tracks[gt_id]['scores'].append( float(f'{bbox[-1]:.3f}')) self.gt_tracks[gt_id]['ids'].append(pred_id) self.gt_tracks[gt_id]['frame_ids'].append( metas.img_info['frame_id']) for i, id in enumerate(ids): id = int(id) self.pred_tracks[id]['scores'].append( float(f'{bboxes[i, -1]:.3f}')) if metas.img_info['frame_id'] > 0: memo_ind = torch.nonzero( memo_ids == id, as_tuple=False).squeeze(1) else: memo_ind = [] if len(memo_ind) > 0: self.pred_tracks[id]['match_scores'].append( float(f'{raw_scores[i, memo_ind[0]]:.3f}')) else: self.pred_tracks[id]['match_scores'].append(-1) if gt_inds[i] == -1: self.pred_tracks[id]['ids'].append(-1) else: self.pred_tracks[id]['ids'].append(int(gt_ids[gt_inds[i]])) self.pred_tracks[id]['frame_ids'].append( metas.img_info['frame_id']) if fps[i]: if id in self.valid_ids: sw_fps[i] = 1 continue fp_inds = sw_fps == 1 # red fn_inds = fns == 1 # yellow idsw_inds = idsw == 1 # cyan tp_inds = fps == 0 # green tp_inds[idsw_inds] = 0 os.makedirs(metas.out_file.rsplit('/', 1)[0], exist_ok=True) img = metas.img_name # black if idsw_inds.any(): sw_ids = ids[idsw_inds] memo_inds = (memo_ids.view(-1, 1) == sw_ids.view( 1, -1)).sum(dim=1) > 0 img = imshow_tracklets( img, memo_bboxes[memo_inds].numpy(), memo_labels[memo_inds].numpy(), memo_ids[memo_inds].numpy(), 
def imshow_tracklets(img,
                     bboxes,
                     labels=None,
                     ids=None,
                     thickness=2,
                     font_scale=0.4,
                     show=False,
                     win_name='',
                     color=None,
                     out_file=None):
    """Draw boxes, and track ids when given, onto an image.

    Args:
        img (str | ndarray): Image or path of the image to draw on.
        bboxes (ndarray | Tensor): Boxes of shape (n, 5); the first four
            columns are used as ``x1, y1, x2, y2``.
        labels (ndarray | Tensor, optional): Labels of shape (n, ). Only
            validated against ``bboxes``; not drawn. Defaults to None.
        ids (ndarray | Tensor, optional): Track ids. When given, an id tag
            is drawn at the top-left corner of every box.
        thickness (int): Thickness of the box outline.
        font_scale (float): Font scale of the id text.
        show (bool): Whether to display the image in a window.
        win_name (str): Name of that window.
        color (str, optional): Colour used for all boxes. When None, boxes
            with ids get a per-id colour and boxes without ids are green.
        out_file (str, optional): Where to write the result.

    Returns:
        ndarray: The image with the drawings.
    """
    assert bboxes.ndim == 2
    # ``labels`` defaults to None, so only validate it when provided.
    if labels is not None:
        assert labels.ndim == 1
        assert bboxes.shape[0] == labels.shape[0]
    # assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5
    if isinstance(img, str):
        img = imread(img)

    if bboxes.shape[0] == 0:
        if out_file is not None:
            imwrite(img, out_file)
        return img

    if isinstance(bboxes, torch.Tensor):
        bboxes = bboxes.numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.numpy()
    if isinstance(ids, torch.Tensor):
        ids = ids.numpy()

    for i, bbox in enumerate(bboxes):
        x1, y1, x2, y2, _ = bbox.astype(np.int32)
        if ids is not None:
            if color is None:
                bbox_color = random_color(ids[i])
                # Palette values are RGB floats in [0, 1]; convert to the
                # BGR integer order used when drawing.
                bbox_color = [int(255 * _c) for _c in bbox_color][::-1]
            else:
                bbox_color = mmcv.color_val(color)
            img[y1:y1 + 12, x1:x1 + 20, :] = bbox_color
            cv2.putText(
                img,
                str(ids[i]), (x1, y1 + 10),
                cv2.FONT_HERSHEY_COMPLEX,
                font_scale,
                color=color_val('black'))
        else:
            if color is None:
                bbox_color = color_val('green')
            else:
                bbox_color = mmcv.color_val(color)

        cv2.rectangle(
            img, (x1, y1), (x2, y2), bbox_color, thickness=thickness)

        # Kept from the original: negative last-column values are replaced
        # by NaN in the (possibly caller-owned) array.
        if bbox[-1] < 0:
            bbox[-1] = np.nan

    if show:
        imshow(img, win_name)
    if out_file is not None:
        imwrite(img, out_file)

    return img
def compute_comp_scores(self,
                        match_ll,
                        bbox_scores,
                        bbox_ious,
                        label_delta,
                        add_bbox_dummy=False):
    """Combine matching likelihood, box confidence, IoU and label terms.

    Args:
        match_ll (Tensor): Matching likelihood.
        bbox_scores (Tensor): Box confidences; their logarithm is used.
        bbox_ious (Tensor): IoUs between current and reference boxes.
        label_delta (Tensor): Label agreement term.
        add_bbox_dummy (bool): When True, a leading column is prepended
            to ``bbox_ious`` (filled with ``self.bbox_dummy_iou``) and to
            ``label_delta`` (filled with ones).

    Returns:
        Tensor: ``match_ll`` itself when ``self.match_coeff`` is None,
        otherwise ``match_ll`` plus the three weighted terms.
    """
    if add_bbox_dummy:
        num_rows = bbox_ious.size(0)
        # Allocate the dummy columns next to the data they are joined
        # with, so this also works on CPU and on a non-current GPU.
        bbox_iou_dummy = bbox_ious.new_ones(
            (num_rows, 1)) * self.bbox_dummy_iou
        bbox_ious = torch.cat((bbox_iou_dummy, bbox_ious), dim=1)
        label_dummy = bbox_ious.new_ones((num_rows, 1))
        label_delta = torch.cat((label_dummy, label_delta), dim=1)
    if self.match_coeff is None:
        return match_ll
    # match coeff needs to be length of 3
    assert (len(self.match_coeff) == 3)
    return (match_ll + self.match_coeff[0] * torch.log(bbox_scores) +
            self.match_coeff[1] * bbox_ious +
            self.match_coeff[2] * label_delta)
def loss(self, match_score, sampling_results):
    """Compute the matching loss averaged over the images of a batch.

    Args:
        match_score (list[Tensor]): One score matrix per image; the
            number of rows of each is used to split the targets.
        sampling_results (list): Sampling results forwarded to
            ``self.get_targets``.

    Returns:
        dict: ``{'loss_match': Tensor}``. When no image contributes (or
        the accumulated loss is exactly zero) a zero derived from the
        targets is returned instead.
    """
    losses = dict()
    n = len(match_score)
    x_n = [s.size(0) for s in match_score]
    ids, id_weights = self.get_targets(sampling_results)
    ids = torch.split(ids, x_n, dim=0)
    id_weights = torch.split(id_weights, x_n, dim=0)
    loss_match = 0.0
    for score, cur_ids, cur_weights in zip(match_score, ids, id_weights):
        # squeeze(1) keeps a 1-D index even for a single valid sample, so
        # only images without any valid sample are skipped.
        valid_idx = torch.nonzero(cur_weights, as_tuple=False).squeeze(1)
        if valid_idx.numel() == 0:
            continue
        loss_match += self.loss_match(score, cur_ids, cur_weights)
    if loss_match == 0.0:
        losses['loss_match'] = ids[0].sum() * 0
    else:
        losses['loss_match'] = loss_match / n
    return losses
res in sampling_results] ids, id_weights = multi_apply( self._get_target_single, pos_inds_list, neg_inds_list, pos_mask_list, neg_mask_list, pos_gt_pid_list) if concat: ids = torch.cat(ids, 0) id_weights = torch.cat(id_weights, 0) return ids, id_weights def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask, pos_gt_pid_list): num_pos = pos_mask.size(0) num_neg = neg_mask.size(0) num_samples = num_pos + num_neg ids = pos_mask.new_zeros((num_samples,), dtype=torch.long) ids_weights = pos_mask.new_zeros((num_samples,)) if num_pos > 0: ids[pos_inds] = pos_gt_pid_list ids_weights[pos_inds] = 1.0 if num_neg > 0: ids_weights[neg_inds] = 0.0 return ids, ids_weights @HEADS.register_module() class TrackHeadWithROIAlign(nn.Module): """Tracking head, predict tracking features and match with reference objects Use dynamic option to deal with different number of objects in different images. A non-match entry is added to the reference objects with all-zero features. Object matched with the non-match entry is considered as a new object. 
def compute_comp_scores(self,
                        match_ll,
                        bbox_scores,
                        bbox_ious,
                        label_delta,
                        add_bbox_dummy=False):
    """Combine matching likelihood, box confidence, IoU and label terms.

    Args:
        match_ll (Tensor): Matching likelihood.
        bbox_scores (Tensor): Box confidences; their logarithm is used.
        bbox_ious (Tensor): IoUs between current and reference boxes.
        label_delta (Tensor): Label agreement term.
        add_bbox_dummy (bool): When True, a leading column is prepended
            to ``bbox_ious`` (filled with ``self.bbox_dummy_iou``) and to
            ``label_delta`` (filled with ones).

    Returns:
        Tensor: ``match_ll`` itself when ``self.match_coeff`` is None,
        otherwise ``match_ll`` plus the three weighted terms.
    """
    if add_bbox_dummy:
        num_rows = bbox_ious.size(0)
        # Allocate the dummy columns next to the data they are joined
        # with, so this also works on CPU and on a non-current GPU.
        bbox_iou_dummy = bbox_ious.new_ones(
            (num_rows, 1)) * self.bbox_dummy_iou
        bbox_ious = torch.cat((bbox_iou_dummy, bbox_ious), dim=1)
        label_dummy = bbox_ious.new_ones((num_rows, 1))
        label_delta = torch.cat((label_dummy, label_delta), dim=1)
    if self.match_coeff is None:
        return match_ll
    # match coeff needs to be length of 3
    assert (len(self.match_coeff) == 3)
    return (match_ll + self.match_coeff[0] * torch.log(bbox_scores) +
            self.match_coeff[1] * bbox_ious +
            self.match_coeff[2] * label_delta)
""" Args: x: backbone feature of current frame ref_x: backbone feature of reference frame mask_pred: mask prediction of current frame ref_mask_pred: reference mask prediction x_n: number of proposal ref_x_n: number of proposal in ref frame Returns: """ # print("mask shape ",mask_pred.shape) bbox_pred = batch_mask2boxlist(mask_pred) ref_bbox_pred = batch_mask2boxlist(ref_mask_pred) # rois = bboxlist2roi(bbox_pred) # ref_rois = bboxlist2roi(ref_bbox_pred) x = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) ref_x = self.bbox_roi_extractor( ref_x[:self.bbox_roi_extractor.num_inputs], ref_rois) # x and ref_x are the grouped bbox features of current and reference frame # x_n are the numbers of proposals in the current images in the mini-batch, # ref_x_n are the numbers of ground truth bboxes in the reference images. # here we compute a correlation matrix of x and ref_x # we also add a all 0 column denote no matching b, N, d = x.size() x = x.reshape(b*N, d) ref_x = ref_x.reshape(b*N, d) for idx, fc in enumerate(self.fcs): x = fc(x) ref_x = fc(ref_x) if idx < len(self.fcs) - 1: x = self.relu(x) ref_x = self.relu(ref_x) n = len(x_n) x_split = torch.split(x, x_n, dim=0) ref_x_split = torch.split(ref_x, ref_x_n, dim=0) prods = [] for i in range(n): prod = torch.mm(x_split[i], torch.transpose(ref_x_split[i], 0, 1)) prods.append(prod) if self.dynamic: match_score = [] for prod in prods: m = prod.size(0) dummy = torch.zeros(m, 1).to(torch.cuda.current_device()) prod_ext = torch.cat([dummy, prod], dim=1) match_score.append(prod_ext) return match_score def loss(self, match_score, sampling_results): losses = dict() n = len(match_score) x_n = [s.size(0) for s in match_score] ids, id_weights = self.get_targets(sampling_results) ids = torch.split(ids, x_n, dim=0) id_weights = torch.split(id_weights, x_n, dim=0) loss_match = torch.zeros(0).to(torch.cuda.current_device()) match_acc = 0. 
n_total = 0 for score, cur_ids, cur_weights in zip(match_score, ids, id_weights): valid_idx = torch.nonzero(cur_weights).squeeze() if len(valid_idx.size()) == 0: continue n_valid = valid_idx.size(0) n_total += n_valid loss_match += self.loss_match( score, cur_ids, cur_weights) match_acc += accuracy( torch.index_select(score, 0, valid_idx), torch.index_select(cur_ids, 0, valid_idx)) * n_valid losses['loss_match'] = loss_match / n if n_total > 0: losses['match_acc'] = match_acc / n_total return losses def get_targets(self, sampling_results, concat=True, ): pos_inds_list = [res.pos_inds for res in sampling_results] neg_inds_list = [res.neg_inds for res in sampling_results] pos_mask_list = [res.pos_masks for res in sampling_results] neg_mask_list = [res.neg_masks for res in sampling_results] pos_gt_pid_list = [res.pos_gt_pids for res in sampling_results] ids, id_weights = multi_apply( self._get_target_single, pos_inds_list, neg_inds_list, pos_mask_list, neg_mask_list, pos_gt_pid_list) if concat: ids = torch.cat(ids, 0) id_weights = torch.cat(id_weights, 0) return ids, id_weights def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask, pos_gt_pid_list): num_pos = pos_mask.size(0) num_neg = neg_mask.size(0) num_samples = num_pos + num_neg ids = pos_mask.new_zeros((num_samples,), dtype=torch.long) ids_weights = pos_mask.new_zeros((num_samples,)) if num_pos > 0: ids[pos_inds] = pos_gt_pid_list ids_weights[pos_inds] = 1.0 if num_neg > 0: ids_weights[neg_inds] = 0.0 return ids, ids_weights @HEADS.register_module() class QuasiDenseMaskEmbedHead(nn.Module): def __init__(self, num_convs=4, num_fcs=1, roi_feat_size=7, in_channels=256, conv_out_channels=256, fc_out_channels=1024, embed_channels=256, conv_cfg=None, norm_cfg=None, softmax_temp=-1, loss_track=dict( type='MultiPosCrossEntropyLoss', loss_weight=0.25), loss_track_aux=dict( type='L2Loss', sample_ratio=3, margin=0.3, loss_weight=1.0, hard_mining=True)): super(QuasiDenseMaskEmbedHead, self).__init__() 
def get_track_targets(self, gt_match_indices, key_sampling_results,
                      ref_sampling_results):
    """Build key-to-reference association targets for every image pair.

    Args:
        gt_match_indices (list[Tensor]): Per image, a tensor indexed by key
            ground-truth index; presumably the matched reference
            ground-truth index - confirm with the dataset pipeline.
        key_sampling_results (list): Key-frame sampling results exposing
            ``pos_masks`` and ``pos_assigned_gt_inds``.
        ref_sampling_results (list): Reference-frame sampling results with
            the same attributes.

    Returns:
        tuple[list[Tensor], list[Tensor]]: Per image, an int matrix of shape
        (num_key_pos, num_ref_pos) with 1 where the key sample and the
        reference sample agree, and a float weight vector that is 1 for key
        rows having at least one match.
    """
    track_targets, track_weights = [], []
    per_image = zip(gt_match_indices, key_sampling_results,
                    ref_sampling_results)
    for match_ids, key_res, ref_res in per_image:
        num_key = key_res.pos_masks.size(0)
        num_ref = ref_res.pos_masks.size(0)

        # Column vector of matched ids against a row vector of reference ids.
        key_match = match_ids[key_res.pos_assigned_gt_inds].view(-1, 1)
        ref_ids = ref_res.pos_assigned_gt_inds.view(1, -1)
        same_instance = (key_match == ref_ids).int()

        assoc = match_ids.new_zeros((num_key, num_ref), dtype=torch.int)
        assoc[:, :same_instance.size(1)] = same_instance

        track_targets.append(assoc)
        track_weights.append((assoc.sum(dim=1) > 0).float())
    return track_targets, track_weights
@HEADS.register_module()
class QuasiDenseMaskEmbedHeadGTMask(QuasiDenseMaskEmbedHead):
    """Embedding head registered under its own name for config selection.

    The constructor arguments, layers, weight initialisation, forward pass,
    target construction, matching and losses were a line-for-line copy of
    ``QuasiDenseMaskEmbedHead``. They are inherited rather than duplicated,
    so the two heads cannot drift apart. Parameter and state-dict names are
    unchanged because the same attributes are created by the shared
    ``__init__``.
    """
def init_track(self, results):
    """Start a fresh set of tracks from the detections of one frame.

    Every detection whose score reaches ``self.score_thresh`` gets a new
    tracking id (``self.id_count`` is incremented first) and is stored in
    ``self.tracks`` and, keyed by its detection index, in
    ``self.tracks_dict``.

    Args:
        results (dict): Must provide ``"scores"`` (indexable, with a
            ``shape``) and ``"masks"`` (indexable by the same index).

    Returns:
        list[dict]: A deep copy of the newly created tracks, so callers
        cannot mutate the tracker's internal state.
    """
    scores, masks = results["scores"], results["masks"]
    kept, kept_by_index = [], {}
    for idx in range(scores.shape[0]):
        if not (scores[idx] >= self.score_thresh):
            continue
        self.id_count += 1
        track = {
            "score": float(scores[idx]),
            "mask": masks[idx],
            "tracking_id": self.id_count,
            'active': 1,
            'age': 1,
        }
        kept.append(track)
        kept_by_index[idx] = track
    self.tracks = kept
    self.tracks_dict = kept_by_index
    return copy.deepcopy(kept)
unmatched_tracks.append(m1) else: matches[0].append(m0) matches[1].append(m1) # handle the matched tracks for (m0, m1) in zip(matches[0], matches[1]): track = results[m0] track['tracking_id'] = tracks[m1]['tracking_id'] track['age'] = 1 track['active'] = 1 ret.append(track) for i in unmatched_dets: track = results[i] self.id_count += 1 track['tracking_id'] = self.id_count track['age'] = 1 track['active'] = 1 ret.append(track) curent_track = ret # handle the remaining tracks ret_unmatched_tracks = [] for i in unmatched_tracks: track = tracks[i] if track['age'] < self.max_age: track['age'] += 1 track['active'] = 0 ret.append(track) ret_unmatched_tracks.append(track) self.tracks = ret self.tracks_dict = results_dict self.unmatched_tracks = ret_unmatched_tracks return curent_track ================================================ FILE: knet/video/util.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved """ Utilities for bounding box manipulation and GIoU. 
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x0, y0, x1, y1) on the last dim."""
    x_c, y_c, w, h = x.unbind(-1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)


def box_xyxy_to_cxcywh(x):
    """Convert boxes from (x0, y0, x1, y1) to (cx, cy, w, h) on the last dim."""
    x0, y0, x1, y1 = x.unbind(-1)
    b = [(x0 + x1) / 2, (y0 + y1) / 2,
         (x1 - x0), (y1 - y0)]
    return torch.stack(b, dim=-1)


# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
    """Return the pairwise IoU and union area of two xyxy box sets.

    Returns:
        tuple[Tensor, Tensor]: ``iou`` and ``union``, both of shape [N, M].
        A pair whose union is zero yields a non-finite IoU (division by 0).
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union


def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/

    The boxes should be in [x0, y0, x1, y1] format

    Returns a [N, M] pairwise matrix, where N = len(boxes1)
    and M = len(boxes2)
    """
    # degenerate boxes gives inf / nan results
    # so do an early check
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    iou, union = box_iou(boxes1, boxes2)

    # Smallest axis-aligned box enclosing each pair.
    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    area = wh[:, :, 0] * wh[:, :, 1]

    return iou - (area - union) / area


def masks_to_boxes(masks):
    """Compute the bounding boxes around the provided masks

    The masks should be in format [N, H, W] where N is the number of masks,
    (H, W) are the spatial dimensions.

    Returns a [N, 4] tensors, with the boxes in xyxy format. A mask without
    any foreground pixel yields ``x_min``/``y_min`` of 1e8 and
    ``x_max``/``y_max`` of 0.
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device)

    h, w = masks.shape[-2:]

    # Coordinate vectors live on the masks' device (the grid used to be built
    # on CPU, which breaks for CUDA masks) and are broadcast against
    # [N, H, W] instead of materialising a meshgrid.
    y = torch.arange(
        0, h, dtype=torch.float, device=masks.device).view(1, h, 1)
    x = torch.arange(
        0, w, dtype=torch.float, device=masks.device).view(1, 1, w)
    background = ~(masks.bool())

    x_mask = masks * x
    x_max = x_mask.flatten(1).max(-1)[0]
    # Background pixels are pushed to a huge value so they never win the min.
    x_min = x_mask.masked_fill(background, 1e8).flatten(1).min(-1)[0]

    y_mask = masks * y
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(background, 1e8).flatten(1).min(-1)[0]

    return torch.stack([x_min, y_min, x_max, y_max], 1)
mask_out_stride=4, hard_target=False, num_thing_classes=80, num_stuff_classes=53, mask_assign_stride=4, ignore_label=255, thing_label_in_seg=0, cat_stuff_mask=False, **kwargs): super(ConvKernelHead, self).__init__() self.num_proposals = num_proposals self.num_cls_fcs = num_cls_fcs self.train_cfg = train_cfg self.in_channels = in_channels self.out_channels = out_channels self.num_classes = num_classes self.proposal_feats_with_obj = proposal_feats_with_obj self.sampling = False self.localization_fpn = build_neck(localization_fpn) self.semantic_fpn = semantic_fpn self.norm_cfg = norm_cfg self.num_heads = num_heads self.att_dropout = att_dropout self.mask_out_stride = mask_out_stride self.hard_target = hard_target self.conv_kernel_size = conv_kernel_size self.xavier_init_kernel = xavier_init_kernel self.kernel_init_std = kernel_init_std self.feat_downsample_stride = feat_downsample_stride self.feat_refine_stride = feat_refine_stride self.conv_normal_init = conv_normal_init self.feat_refine = feat_refine self.with_embed = with_embed self.feat_embed_only = feat_embed_only self.num_loc_convs = num_loc_convs self.num_seg_convs = num_seg_convs self.use_binary = use_binary self.num_thing_classes = num_thing_classes self.num_stuff_classes = num_stuff_classes self.mask_assign_stride = mask_assign_stride self.ignore_label = ignore_label self.thing_label_in_seg = thing_label_in_seg self.cat_stuff_mask = cat_stuff_mask if loss_mask is not None: self.loss_mask = build_loss(loss_mask) else: self.loss_mask = loss_mask if loss_dice is not None: self.loss_dice = build_loss(loss_dice) else: self.loss_dice = loss_dice if loss_seg is not None: self.loss_seg = build_loss(loss_seg) else: self.loss_seg = loss_seg if loss_cls is not None: self.loss_cls = build_loss(loss_cls) else: self.loss_cls = loss_cls if loss_rank is not None: self.loss_rank = build_loss(loss_rank) else: self.loss_rank = loss_rank if self.train_cfg: self.assigner = build_assigner(self.train_cfg.assigner) # use 
PseudoSampler when sampling is False if self.sampling and hasattr(self.train_cfg, 'sampler'): sampler_cfg = self.train_cfg.sampler else: sampler_cfg = dict(type='MaskPseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self._init_layers() def _init_layers(self): """Initialize a sparse set of proposal boxes and proposal features.""" self.init_kernels = nn.Conv2d( self.out_channels, self.num_proposals, self.conv_kernel_size, padding=int(self.conv_kernel_size // 2), bias=False) if self.semantic_fpn: if self.loss_seg.use_sigmoid: self.conv_seg = nn.Conv2d(self.out_channels, self.num_classes, 1) else: self.conv_seg = nn.Conv2d(self.out_channels, self.num_classes + 1, 1) if self.feat_downsample_stride > 1 and self.feat_refine: self.ins_downsample = ConvModule( self.in_channels, self.out_channels, 3, stride=self.feat_refine_stride, padding=1, norm_cfg=self.norm_cfg) self.seg_downsample = ConvModule( self.in_channels, self.out_channels, 3, stride=self.feat_refine_stride, padding=1, norm_cfg=self.norm_cfg) self.loc_convs = nn.ModuleList() for i in range(self.num_loc_convs): self.loc_convs.append( ConvModule( self.in_channels, self.out_channels, 1, norm_cfg=self.norm_cfg)) self.seg_convs = nn.ModuleList() for i in range(self.num_seg_convs): self.seg_convs.append( ConvModule( self.in_channels, self.out_channels, 1, norm_cfg=self.norm_cfg)) def init_weights(self): self.localization_fpn.init_weights() if self.feat_downsample_stride > 1 and self.conv_normal_init: logger = get_root_logger() logger.info('Initialize convs in KPN head by normal std 0.01') for conv in [self.loc_convs, self.seg_convs]: for m in conv.modules(): if isinstance(m, nn.Conv2d): normal_init(m, std=0.01) if self.semantic_fpn: bias_seg = bias_init_with_prob(0.01) if self.loss_seg.use_sigmoid: normal_init(self.conv_seg, std=0.01, bias=bias_seg) else: normal_init(self.conv_seg, mean=0, std=0.01) if self.xavier_init_kernel: logger = get_root_logger() logger.info('Initialize kernels by xavier 
uniform') nn.init.xavier_uniform_(self.init_kernels.weight) else: logger = get_root_logger() logger.info( f'Initialize kernels by normal std: {self.kernel_init_std}') normal_init(self.init_kernels, mean=0, std=self.kernel_init_std) def _decode_init_proposals(self, img, img_metas): num_imgs = len(img_metas) localization_feats = self.localization_fpn(img) if isinstance(localization_feats, list): loc_feats = localization_feats[0] else: loc_feats = localization_feats for conv in self.loc_convs: loc_feats = conv(loc_feats) if self.feat_downsample_stride > 1 and self.feat_refine: loc_feats = self.ins_downsample(loc_feats) mask_preds = self.init_kernels(loc_feats) if self.semantic_fpn: if isinstance(localization_feats, list): semantic_feats = localization_feats[1] else: semantic_feats = localization_feats for conv in self.seg_convs: semantic_feats = conv(semantic_feats) if self.feat_downsample_stride > 1 and self.feat_refine: semantic_feats = self.seg_downsample(semantic_feats) else: semantic_feats = None if semantic_feats is not None: seg_preds = self.conv_seg(semantic_feats) else: seg_preds = None proposal_feats = self.init_kernels.weight.clone() proposal_feats = proposal_feats[None].expand(num_imgs, *proposal_feats.size()) if semantic_feats is not None: x_feats = semantic_feats + loc_feats else: x_feats = loc_feats if self.proposal_feats_with_obj: sigmoid_masks = mask_preds.sigmoid() nonzero_inds = sigmoid_masks > 0.5 if self.use_binary: sigmoid_masks = nonzero_inds.float() else: sigmoid_masks = nonzero_inds.float() * sigmoid_masks obj_feats = torch.einsum('bnhw,bchw->bnc', sigmoid_masks, x_feats) cls_scores = None if self.proposal_feats_with_obj: proposal_feats = proposal_feats + obj_feats.view( num_imgs, self.num_proposals, self.out_channels, 1, 1) if self.cat_stuff_mask and not self.training: mask_preds = torch.cat( [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1) stuff_kernels = self.conv_seg.weight[self. 
def forward_train(self,
                  img,
                  img_metas,
                  gt_masks,
                  gt_labels,
                  gt_sem_seg=None,
                  gt_sem_cls=None):
    """Forward function in training stage.

    Decodes the initial proposals, assigns them to ground truth, computes
    the losses of this stage and returns the inputs of the next stage.

    Returns:
        tuple: ``(losses, proposal_feats, x_feats, mask_preds, cls_scores)``.
        When ``cat_stuff_mask`` is set in training mode, ``mask_preds`` and
        ``proposal_feats`` are extended with the stuff-class predictions and
        kernels.
    """
    num_imgs = len(img_metas)
    results = self._decode_init_proposals(img, img_metas)
    (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = results

    if self.feat_downsample_stride > 1:
        scaled_mask_preds = F.interpolate(
            mask_preds,
            scale_factor=self.feat_downsample_stride,
            mode='bilinear',
            align_corners=False)
        if seg_preds is not None:
            scaled_seg_preds = F.interpolate(
                seg_preds,
                scale_factor=self.feat_downsample_stride,
                mode='bilinear',
                align_corners=False)
        else:
            # Previously left unbound, which raised UnboundLocalError when
            # the head runs without a semantic branch.
            scaled_seg_preds = None
    else:
        scaled_mask_preds = mask_preds
        scaled_seg_preds = seg_preds

    if self.hard_target:
        # Binarise the ground-truth masks.
        gt_masks = [x.bool().float() for x in gt_masks]

    if cls_scores is None:
        detached_cls_scores = [None] * num_imgs
    else:
        detached_cls_scores = cls_scores.detach()

    sampling_results = []
    for i in range(num_imgs):
        assign_result = self.assigner.assign(scaled_mask_preds[i].detach(),
                                             detached_cls_scores[i],
                                             gt_masks[i], gt_labels[i],
                                             img_metas[i])
        sampling_result = self.sampler.sample(assign_result,
                                              scaled_mask_preds[i],
                                              gt_masks[i])
        sampling_results.append(sampling_result)

    mask_targets = self.get_targets(
        sampling_results,
        gt_masks,
        self.train_cfg,
        True,
        gt_sem_seg=gt_sem_seg,
        gt_sem_cls=gt_sem_cls)

    losses = self.loss(scaled_mask_preds, cls_scores, scaled_seg_preds,
                       proposal_feats, *mask_targets)

    if self.cat_stuff_mask and self.training:
        mask_preds = torch.cat(
            [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1)
        stuff_kernels = self.conv_seg.weight[self.
                                             num_thing_classes:].clone()
        stuff_kernels = stuff_kernels[None].expand(num_imgs,
                                                   *stuff_kernels.size())
        proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

    return losses, proposal_feats, x_feats, mask_preds, cls_scores


def loss(self,
         mask_pred,
         cls_scores,
         seg_preds,
         proposal_feats,
         labels,
         label_weights,
         mask_targets,
         mask_weights,
         seg_targets,
         reduction_override=None,
         **kwargs):
    """Compute the losses of the kernel-proposal stage.

    Returns:
        dict: Always ``loss_rpn_mask`` and ``loss_rpn_dice``; additionally
        ``loss_rpn_cls`` / ``rpn_pos_acc`` when ``cls_scores`` is given,
        ``loss_rpn_rank`` when a rank loss is configured and
        ``loss_rpn_seg`` when ``seg_preds`` is given. The key set does not
        depend on whether the batch contains positive samples.
    """
    losses = dict()
    bg_class_ind = self.num_classes
    # note in sparse rcnn num_gt == num_pos
    pos_inds = (labels >= 0) & (labels < bg_class_ind)
    num_preds = mask_pred.shape[0] * mask_pred.shape[1]

    if cls_scores is not None:
        num_pos = pos_inds.sum().float()
        avg_factor = reduce_mean(num_pos)
        assert mask_pred.shape[0] == cls_scores.shape[0]
        assert mask_pred.shape[1] == cls_scores.shape[1]
        losses['loss_rpn_cls'] = self.loss_cls(
            cls_scores.view(num_preds, -1),
            labels,
            label_weights,
            avg_factor=avg_factor,
            reduction_override=reduction_override)
        losses['rpn_pos_acc'] = accuracy(
            cls_scores.view(num_preds, -1)[pos_inds], labels[pos_inds])

    bool_pos_inds = pos_inds.type(torch.bool)
    # 0~self.num_classes-1 are FG, self.num_classes is BG
    H, W = mask_pred.shape[-2:]
    if pos_inds.any():
        pos_mask_pred = mask_pred.reshape(num_preds, H, W)[bool_pos_inds]
        pos_mask_targets = mask_targets[bool_pos_inds]
        losses['loss_rpn_mask'] = self.loss_mask(pos_mask_pred,
                                                 pos_mask_targets)
        losses['loss_rpn_dice'] = self.loss_dice(pos_mask_pred,
                                                 pos_mask_targets)

        if self.loss_rank is not None:
            batch_size = mask_pred.size(0)
            # Per-pixel index of the positive proposal covering it;
            # uncovered pixels keep the ignore label.
            rank_target = mask_targets.new_full((batch_size, H, W),
                                                self.ignore_label,
                                                dtype=torch.long)
            rank_inds = pos_inds.view(batch_size,
                                      -1).nonzero(as_tuple=False)
            batch_mask_targets = mask_targets.view(batch_size, -1, H,
                                                   W).bool()
            for i in range(batch_size):
                curr_inds = (rank_inds[:, 0] == i)
                curr_rank = rank_inds[:, 1][curr_inds]
                for j in curr_rank:
                    rank_target[i][batch_mask_targets[i][j]] = j
            losses['loss_rpn_rank'] = self.loss_rank(
                mask_pred, rank_target, ignore_index=self.ignore_label)
    else:
        # Zero-valued placeholders keep the graph connected and the key set
        # identical to the branch above.
        losses['loss_rpn_mask'] = mask_pred.sum() * 0
        losses['loss_rpn_dice'] = mask_pred.sum() * 0
        if self.loss_rank is not None:
            # Was stored under 'loss_rank', which did not match the key used
            # when positives exist.
            losses['loss_rpn_rank'] = mask_pred.sum() * 0

    if seg_preds is not None:
        cls_channel = seg_preds.shape[1]
        flatten_seg = seg_preds.view(-1, cls_channel, H * W).permute(
            0, 2, 1).reshape(-1, cls_channel)
        flatten_seg_target = seg_targets.view(-1)
        if self.loss_seg.use_sigmoid:
            num_dense_pos = (flatten_seg_target >= 0) & (
                flatten_seg_target < bg_class_ind)
            num_dense_pos = num_dense_pos.sum().float().clamp(min=1.0)
            losses['loss_rpn_seg'] = self.loss_seg(
                flatten_seg, flatten_seg_target, avg_factor=num_dense_pos)
        else:
            losses['loss_rpn_seg'] = self.loss_seg(flatten_seg,
                                                   flatten_seg_target)

    return losses
def get_targets(self,
                sampling_results,
                gt_mask,
                rpn_train_cfg,
                concat=True,
                gt_sem_seg=None,
                gt_sem_cls=None):
    """Gather the training targets of all images in the batch.

    Args:
        sampling_results (list): Per-image sampling results exposing
            ``pos_inds``, ``neg_inds``, ``pos_masks``, ``neg_masks``,
            ``pos_gt_masks`` and ``pos_gt_labels``.
        gt_mask: Not read by this method.
        rpn_train_cfg: Passed to ``_get_target_single`` as ``cfg``.
        concat (bool): Concatenate labels, weights and mask targets along
            dim 0 and stack the segmentation targets.
        gt_sem_seg (list | None): Per-image semantic masks. When None, both
            semantic inputs are replaced by per-image ``None``.
        gt_sem_cls (list | None): Per-image semantic class ids.

    Returns:
        tuple: ``(labels, label_weights, mask_targets, mask_weights,
        seg_targets)``.
    """
    field_names = ('pos_inds', 'neg_inds', 'pos_masks', 'neg_masks',
                   'pos_gt_masks', 'pos_gt_labels')
    per_field = [[getattr(res, name) for res in sampling_results]
                 for name in field_names]

    if gt_sem_seg is None:
        no_semantics = len(sampling_results)
        gt_sem_seg = [None] * no_semantics
        gt_sem_cls = [None] * no_semantics

    labels, label_weights, mask_targets, mask_weights, seg_targets = \
        multi_apply(
            self._get_target_single,
            *per_field,
            gt_sem_seg,
            gt_sem_cls,
            cfg=rpn_train_cfg)

    if concat:
        labels = torch.cat(labels, 0)
        label_weights = torch.cat(label_weights, 0)
        mask_targets = torch.cat(mask_targets, 0)
        mask_weights = torch.cat(mask_weights, 0)
        # One (H, W) map per image, hence stack rather than cat.
        seg_targets = torch.stack(seg_targets, 0)
    return labels, label_weights, mask_targets, mask_weights, seg_targets


def simple_test_rpn(self, img, img_metas):
    """Forward function in testing stage."""
    proposals = self._decode_init_proposals(img, img_metas)
    return proposals


def forward_dummy(self, img, img_metas):
    """Dummy forward function.

    Used in flops calculation.
    """
    proposals = self._decode_init_proposals(img, img_metas)
    return proposals
def init_mask_head(self, mask_roi_extractor, mask_head):
    """Build the per-stage mask heads.

    Args:
        mask_roi_extractor: Not used by this method.
        mask_head (dict | list[dict]): A single head config that is reused
            for every stage, or a list with one config per stage.
    """
    self.mask_head = nn.ModuleList()

    # A single config is shared by all stages.
    if isinstance(mask_head, list):
        head_cfgs = mask_head
    else:
        head_cfgs = [mask_head] * self.num_stages
    assert len(head_cfgs) == self.num_stages

    self.mask_head.extend(build_head(head_cfg) for head_cfg in head_cfgs)

    # In recursive mode every stage points at the first head, so all stages
    # share one set of weights.
    if self.recursive:
        shared_head = self.mask_head[0]
        for stage in range(self.num_stages):
            self.mask_head[stage] = shared_head
def simple_test(self,
                x,
                proposal_feats,
                mask_preds,
                cls_score,
                img_metas,
                imgs_whwh=None,
                rescale=False):
    """Run every refinement stage and decode the per-image results.

    Args:
        x: Feature map handed to each stage through ``_mask_forward``.
        proposal_feats: Initial kernels / object features.
        mask_preds: Initial mask predictions.
        cls_score: Initial class scores; overwritten by each stage output.
        img_metas (list[dict]): One meta dict per image.
        imgs_whwh: Not read by this method.
        rescale: Not read by this method.

    Returns:
        list: One entry per image. In panoptic mode the entry is whatever
        ``get_panoptic`` returns, otherwise whatever the last head's
        ``get_seg_masks`` returns.
    """
    num_imgs = len(img_metas)

    object_feats = proposal_feats
    for stage in range(self.num_stages):
        mask_results = self._mask_forward(stage, x, object_feats, mask_preds,
                                          img_metas)
        object_feats = mask_results['object_feats']
        cls_score = mask_results['cls_score']
        mask_preds = mask_results['mask_preds']
        scaled_mask_preds = mask_results['scaled_mask_preds']

    last_head = self.mask_head[-1]
    num_classes = last_head.num_classes

    if last_head.loss_cls.use_sigmoid:
        cls_score = cls_score.sigmoid()
    else:
        # Drop the trailing (background) column of the softmax output.
        cls_score = cls_score.softmax(-1)[..., :-1]

    if self.do_panoptic:
        return [
            self.get_panoptic(cls_score[img_id], scaled_mask_preds[img_id],
                              self.test_cfg, img_metas[img_id])
            for img_id in range(num_imgs)
        ]

    results = []
    for img_id in range(num_imgs):
        # Every (proposal, class) pair is a candidate, so one proposal may
        # yield several results with different labels.
        flat_scores = cls_score[img_id].flatten(0, 1)
        # ``topk`` raises when k exceeds the number of candidates; clamp so
        # a large ``max_per_img`` simply returns every candidate.
        num_keep = min(self.test_cfg.max_per_img, flat_scores.numel())
        scores_per_img, topk_indices = flat_scores.topk(num_keep, sorted=True)
        # Flat index -> (proposal index, class index).
        mask_indices = topk_indices // num_classes
        labels_per_img = topk_indices % num_classes
        masks_per_img = scaled_mask_preds[img_id][mask_indices]
        results.append(
            last_head.get_seg_masks(masks_per_img, labels_per_img,
                                    scores_per_img, self.test_cfg,
                                    img_metas[img_id]))
    return results
def merge_stuff_thing(self,
                      total_masks,
                      total_labels,
                      total_scores,
                      merge_cfg=None):
    """Fuse scored thing/stuff masks into a single panoptic id map.

    Every pixel is claimed by the segment with the largest
    ``score * mask`` value. Segments are then visited from the highest to
    the lowest score and written to the output unless they are filtered.

    Args:
        total_masks (Tensor): Masks stacked along dim 0; the last two
            dimensions give the output size.
        total_labels (Tensor): Class label of each mask.
        total_scores (Tensor): Score of each mask.
        merge_cfg: Must provide ``instance_score_thr`` and ``overlap_thr``.

    Returns:
        numpy.ndarray: Map of ``label + segment_id * INSTANCE_OFFSET``;
        pixels that no kept segment claims hold ``self.num_classes``.
    """
    height, width = total_masks.shape[-2:]
    panoptic_seg = total_masks.new_full((height, width),
                                        self.num_classes,
                                        dtype=torch.long)

    # Per-pixel winner among all segments.
    weighted_masks = total_scores.view(-1, 1, 1) * total_masks
    winner_ids = weighted_masks.argmax(0)

    segment_id = 0
    # Visit segments in descending score order.
    for seg_idx in torch.argsort(-total_scores):
        label = total_labels[seg_idx]
        is_thing = label.item() < self.num_thing_classes
        # Only thing segments are filtered by score.
        if is_thing and total_scores[seg_idx] < merge_cfg.instance_score_thr:
            continue

        region = winner_ids == seg_idx
        kept_area = region.sum().item()
        full_area = (total_masks[seg_idx] >= 0.5).sum().item()
        if kept_area <= 0 or full_area <= 0:
            continue
        # Skip segments that lost most of their own mask to other segments.
        if kept_area / full_area < merge_cfg.overlap_thr:
            continue

        panoptic_seg[region] = label + segment_id * INSTANCE_OFFSET
        segment_id += 1

    return panoptic_seg.cpu().numpy()
def __init__(self,
             num_classes=80,
             num_ffn_fcs=2,
             num_heads=8,
             num_cls_fcs=1,
             num_mask_fcs=3,
             feedforward_channels=2048,
             in_channels=256,
             out_channels=256,
             dropout=0.0,
             mask_thr=0.5,
             act_cfg=dict(type='ReLU', inplace=True),
             ffn_act_cfg=dict(type='ReLU', inplace=True),
             conv_kernel_size=3,
             feat_transform_cfg=None,
             hard_mask_thr=0.5,
             kernel_init=False,
             with_ffn=True,
             mask_out_stride=4,
             relative_coors=False,
             relative_coors_off=False,
             feat_gather_stride=1,
             mask_transform_stride=1,
             mask_upsample_stride=1,
             num_thing_classes=80,
             num_stuff_classes=53,
             mask_assign_stride=4,
             ignore_label=255,
             thing_label_in_seg=0,
             kernel_updator_cfg=dict(
                 type='DynamicConv',
                 in_channels=256,
                 feat_channels=64,
                 out_channels=256,
                 input_feat_shape=1,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN')),
             loss_rank=None,
             loss_mask=dict(
                 type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
             loss_dice=dict(type='DiceLoss', loss_weight=3.0),
             loss_cls=dict(
                 type='FocalLoss',
                 use_sigmoid=True,
                 gamma=2.0,
                 alpha=0.25,
                 loss_weight=2.0)):
    """Build the losses and sub-modules of one kernel update stage.

    Args:
        num_classes (int): Number of foreground classes. The classifier has
            ``num_classes`` outputs with a sigmoid loss and
            ``num_classes + 1`` outputs otherwise.
        num_ffn_fcs (int): Number of FC layers passed to the FFN.
        num_heads (int): Number of attention heads.
        num_cls_fcs (int): Number of Linear-LN-activation groups in the
            classification branch.
        num_mask_fcs (int): Number of Linear-LN-activation groups in the
            mask branch.
        feedforward_channels (int): Hidden width passed to the FFN.
        in_channels (int): Channels of each kernel / object feature.
        out_channels (int): Output width of the final mask FC layer.
        dropout (float): Dropout used by the attention and the FFN.
        act_cfg (dict): Activation of the classification / mask branches.
        ffn_act_cfg (dict): Activation of the FFN.
        conv_kernel_size (int): Spatial size K of each kernel; the attention
            operates on ``in_channels * K**2`` features.
        feat_transform_cfg (dict | None): Extra ``ConvModule`` arguments for
            an optional feature transform. An optional ``kernel_size`` key
            (default 1) is read from it. The dict is not modified.
        kernel_init (bool): Checked in ``init_weights`` to re-initialise the
            mask FC layer.
        with_ffn (bool): Whether to create the FFN and its norm layer.
        feat_gather_stride (int): Stride of the optional feature transform.
        kernel_updator_cfg (dict): Config of the kernel update layer.
        loss_rank (dict | None): Optional rank loss config.
        loss_mask (dict): Mask loss config.
        loss_dice (dict): Dice loss config.
        loss_cls (dict): Classification loss config.

    The remaining arguments are stored unchanged as attributes of the same
    name.
    """
    super(KernelUpdateHead, self).__init__()
    self.num_classes = num_classes
    self.loss_cls = build_loss(loss_cls)
    self.loss_mask = build_loss(loss_mask)
    self.loss_dice = build_loss(loss_dice)
    if loss_rank is not None:
        self.loss_rank = build_loss(loss_rank)
    else:
        self.loss_rank = loss_rank

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.mask_thr = mask_thr
    self.fp16_enabled = False
    self.dropout = dropout

    self.num_heads = num_heads
    self.hard_mask_thr = hard_mask_thr
    self.kernel_init = kernel_init
    self.with_ffn = with_ffn
    self.mask_out_stride = mask_out_stride
    self.relative_coors = relative_coors
    self.relative_coors_off = relative_coors_off
    self.conv_kernel_size = conv_kernel_size
    self.feat_gather_stride = feat_gather_stride
    self.mask_transform_stride = mask_transform_stride
    self.mask_upsample_stride = mask_upsample_stride

    self.num_thing_classes = num_thing_classes
    self.num_stuff_classes = num_stuff_classes
    self.mask_assign_stride = mask_assign_stride
    self.ignore_label = ignore_label
    self.thing_label_in_seg = thing_label_in_seg

    # Self-attention over flattened K*K*C kernels.
    self.attention = MultiheadAttention(in_channels * conv_kernel_size**2,
                                        num_heads, dropout)
    self.attention_norm = build_norm_layer(
        dict(type='LN'), in_channels * conv_kernel_size**2)[1]

    self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg)

    if feat_transform_cfg is not None:
        # Work on a copy: popping from the caller's dict would silently drop
        # ``kernel_size`` for every other head built from the same config
        # object (for example one config shared across stages).
        feat_transform_cfg = dict(feat_transform_cfg)
        kernel_size = feat_transform_cfg.pop('kernel_size', 1)
        self.feat_transform = ConvModule(
            in_channels,
            in_channels,
            kernel_size,
            stride=feat_gather_stride,
            padding=int(feat_gather_stride // 2),
            **feat_transform_cfg)
    else:
        self.feat_transform = None

    if self.with_ffn:
        self.ffn = FFN(
            in_channels,
            feedforward_channels,
            num_ffn_fcs,
            act_cfg=ffn_act_cfg,
            ffn_drop=dropout)
        self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]

    # Classification branch: num_cls_fcs x (Linear -> LN -> activation).
    self.cls_fcs = nn.ModuleList()
    for _ in range(num_cls_fcs):
        self.cls_fcs.append(
            nn.Linear(in_channels, in_channels, bias=False))
        self.cls_fcs.append(
            build_norm_layer(dict(type='LN'), in_channels)[1])
        self.cls_fcs.append(build_activation_layer(act_cfg))

    # A non-sigmoid classifier needs one extra output column.
    if self.loss_cls.use_sigmoid:
        self.fc_cls = nn.Linear(in_channels, self.num_classes)
    else:
        self.fc_cls = nn.Linear(in_channels, self.num_classes + 1)

    # Mask branch: num_mask_fcs x (Linear -> LN -> activation).
    self.mask_fcs = nn.ModuleList()
    for _ in range(num_mask_fcs):
        self.mask_fcs.append(
            nn.Linear(in_channels, in_channels, bias=False))
        self.mask_fcs.append(
            build_norm_layer(dict(type='LN'), in_channels)[1])
        self.mask_fcs.append(build_activation_layer(act_cfg))

    self.fc_mask = nn.Linear(in_channels, out_channels)
def forward(self,
            x,
            proposal_feat,
            mask_preds,
            prev_cls_score=None,
            mask_shape=None,
            img_metas=None):
    """Update the kernels with mask-pooled features and predict new masks.

    Args:
        x (Tensor): Feature map; its last three dimensions are read as
            (C, H, W).
        proposal_feat (Tensor): Kernels whose first two dimensions are
            (batch, num_proposals); reshaped to
            (batch, num_proposals, in_channels, K*K).
        mask_preds (Tensor): Mask logits from the previous stage.
        prev_cls_score: Not read by this method.
        mask_shape (tuple | None): If given and its height differs from the
            feature height, the new masks are resized to this shape.
        img_metas: Not read by this method.

    Returns:
        tuple: ``(cls_score, new_mask_preds, kernels)`` where ``kernels`` has
        shape (batch, num_proposals, in_channels, K, K).
    """
    batch_size, num_proposals = proposal_feat.shape[:2]
    if self.feat_transform is not None:
        x = self.feat_transform(x)
    channels, feat_h, feat_w = x.shape[-3:]

    # Bring the previous masks to the feature resolution when they differ.
    if tuple(mask_preds.shape[-2:]) != (feat_h, feat_w):
        gather_mask = F.interpolate(
            mask_preds, (feat_h, feat_w),
            align_corners=False,
            mode='bilinear')
    else:
        gather_mask = mask_preds

    # Binarise the masks and pool the features inside each of them.
    hard_masks = (gather_mask.sigmoid() > self.hard_mask_thr).float()
    # einsum is faster than bmm by 30%
    x_feat = torch.einsum('bnhw,bchw->bnc', hard_masks, x)

    # [B, N, C, K, K] -> [B, N, C, K*K] -> [B, N, K*K, C]
    kernels = proposal_feat.reshape(batch_size, num_proposals,
                                    self.in_channels, -1).permute(0, 1, 3, 2)
    obj_feat = self.kernel_update_conv(x_feat, kernels)

    # [B, N, K*K, C] -> [B, N, K*K*C] -> [N, B, K*K*C]
    obj_feat = obj_feat.reshape(batch_size, num_proposals,
                                -1).permute(1, 0, 2)
    obj_feat = self.attention_norm(self.attention(obj_feat))
    # [N, B, K*K*C] -> [B, N, K*K*C] -> [B, N, K*K, C]
    obj_feat = obj_feat.permute(1, 0, 2).reshape(batch_size, num_proposals,
                                                 -1, self.in_channels)

    if self.with_ffn:
        obj_feat = self.ffn_norm(self.ffn(obj_feat))

    # Classification branch works on the kernel summed over its K*K axis.
    cls_feat = obj_feat.sum(-2)
    for layer in self.cls_fcs:
        cls_feat = layer(cls_feat)
    cls_score = self.fc_cls(cls_feat).view(batch_size, num_proposals, -1)

    mask_feat = obj_feat
    for layer in self.mask_fcs:
        mask_feat = layer(mask_feat)
    # [B, N, K*K, C] -> [B, N, C, K*K]
    mask_feat = self.fc_mask(mask_feat).permute(0, 1, 3, 2)

    if self.mask_transform_stride == 2 and self.feat_gather_stride == 1:
        # This configuration is explicitly unsupported.
        mask_x = F.interpolate(
            x, scale_factor=0.5, mode='bilinear', align_corners=False)
        feat_h, feat_w = mask_x.shape[-2:]
        raise NotImplementedError
    mask_x = x

    # Each image is convolved with its own set of kernels.
    # [B, N, C, K*K] -> [B, N, C, K, K]
    ksize = self.conv_kernel_size
    mask_feat = mask_feat.reshape(batch_size, num_proposals, channels, ksize,
                                  ksize)
    per_image_masks = [
        F.conv2d(
            mask_x[img_idx:img_idx + 1],
            mask_feat[img_idx],
            padding=int(ksize // 2)) for img_idx in range(batch_size)
    ]
    new_mask_preds = torch.cat(per_image_masks, dim=0).reshape(
        batch_size, num_proposals, feat_h, feat_w)

    if self.mask_transform_stride == 2:
        new_mask_preds = F.interpolate(
            new_mask_preds,
            scale_factor=2,
            mode='bilinear',
            align_corners=False)

    if mask_shape is not None and mask_shape[0] != feat_h:
        new_mask_preds = F.interpolate(
            new_mask_preds,
            mask_shape,
            align_corners=False,
            mode='bilinear')

    # [B, N, K*K, C] -> [B, N, C, K, K]
    updated_kernels = obj_feat.permute(0, 1, 3, 2).reshape(
        batch_size, num_proposals, self.in_channels, ksize, ksize)
    return cls_score, new_mask_preds, updated_kernels
def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                       pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                       cfg):
    """Compute the classification and mask targets of a single image.

    Args:
        pos_inds (Tensor): Indices of the positive samples.
        neg_inds (Tensor): Indices of the negative samples.
        pos_mask (Tensor): Positive predictions; dim 0 gives the number of
            positives, the last two dims give (H, W), and new tensors take
            its dtype / device.
        neg_mask (Tensor): Negative predictions; only dim 0 is read.
        pos_gt_mask (Tensor): Ground-truth masks matched to the positives.
        pos_gt_labels (Tensor): Ground-truth labels matched to the
            positives.
        gt_sem_seg (Tensor | None): Stuff masks, one per entry of
            ``gt_sem_cls``.
        gt_sem_cls (Tensor | None): Stuff labels; ``label -
            num_thing_classes`` selects the stuff slot that is filled.
        cfg: Must provide ``pos_weight``; values <= 0 mean a weight of 1.

    Returns:
        tuple: ``(labels, label_weights, mask_targets, mask_weights)``. When
        both semantic inputs are given, ``num_stuff_classes`` extra rows are
        appended to every output.
    """
    num_pos = pos_mask.size(0)
    num_neg = neg_mask.size(0)
    num_samples = num_pos + num_neg
    H, W = pos_mask.shape[-2:]

    # Background uses label ``num_classes``; foreground labels are
    # [0, num_classes - 1].
    labels = pos_mask.new_full((num_samples, ),
                               self.num_classes,
                               dtype=torch.long)
    label_weights = pos_mask.new_zeros((num_samples, self.num_classes))
    mask_targets = pos_mask.new_zeros(num_samples, H, W)
    mask_weights = pos_mask.new_zeros(num_samples, H, W)

    if num_pos > 0:
        labels[pos_inds] = pos_gt_labels
        pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
        label_weights[pos_inds] = pos_weight
        mask_targets[pos_inds, ...] = pos_gt_mask
        mask_weights[pos_inds, ...] = 1

    if num_neg > 0:
        label_weights[neg_inds] = 1.0

    if gt_sem_cls is not None and gt_sem_seg is not None:
        num_stuff = self.num_stuff_classes
        sem_labels = pos_mask.new_full((num_stuff, ),
                                       self.num_classes,
                                       dtype=torch.long)
        sem_targets = pos_mask.new_zeros(num_stuff, H, W)
        sem_weights = pos_mask.new_zeros(num_stuff, H, W)

        # Stuff slot i is only supervised on its own stuff class column.
        # The identity matrix follows the dtype of ``pos_mask`` so that the
        # concatenation below never mixes dtypes.
        sem_stuff_weights = torch.eye(
            num_stuff, dtype=pos_mask.dtype, device=pos_mask.device)
        sem_thing_weights = pos_mask.new_zeros(
            (num_stuff, self.num_thing_classes))
        sem_label_weights = torch.cat(
            [sem_thing_weights, sem_stuff_weights], dim=-1)

        # Fill the slots of the stuff classes present in this image.
        if len(gt_sem_cls) > 0:
            sem_inds = (gt_sem_cls - self.num_thing_classes).long()
            sem_labels[sem_inds] = gt_sem_cls.long()
            sem_targets[sem_inds] = gt_sem_seg
            sem_weights[sem_inds] = 1

        # Instance samples are not supervised on the stuff class columns.
        label_weights[:, self.num_thing_classes:] = 0

        labels = torch.cat([labels, sem_labels])
        label_weights = torch.cat([label_weights, sem_label_weights])
        mask_targets = torch.cat([mask_targets, sem_targets])
        mask_weights = torch.cat([mask_weights, sem_weights])

    return labels, label_weights, mask_targets, mask_weights
def segm2result(self, mask_preds, det_labels, cls_scores):
    """Group instance masks and scores by class.

    Args:
        mask_preds (Tensor): One mask per instance along dim 0.
        det_labels (Tensor): Class index of each instance.
        cls_scores (Tensor): Score of each instance.

    Returns:
        tuple: ``(bbox_result, segm_result)``. ``bbox_result`` holds one
        float32 array of shape (n_i, 5) per class whose first four columns
        are zero (placeholder boxes) and whose last column is the score.
        ``segm_result`` holds one list of numpy masks per class.
    """
    masks_np = mask_preds.cpu().numpy()
    labels_np = det_labels.cpu().numpy()
    scores_np = cls_scores.cpu().numpy()
    class_ids = range(self.num_classes)

    # No boxes are predicted; only the score column carries information.
    fake_bboxes = np.zeros((masks_np.shape[0], 5), dtype=np.float32)
    fake_bboxes[:, -1] = scores_np
    bbox_result = [fake_bboxes[labels_np == cls_id, :] for cls_id in class_ids]

    segm_result = [[] for _ in class_ids]
    for label, mask in zip(labels_np, masks_np):
        segm_result[label].append(mask)

    return bbox_result, segm_result
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes=None,
                  gt_labels=None,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None,
                  gt_semantic_seg=None,
                  **kwargs):
    """Compute the training losses of the RPN head and the RoI head.

    Ground-truth masks are padded to the batch input shape and resized to
    ``1 / mask_assign_stride`` of it before being handed to both heads.

    Args:
        img (Tensor): Input images.
        img_metas (list[dict]): Image metas; ``batch_input_shape`` of the
            first entry is used for every image.
        gt_bboxes: Forwarded to the RoI head unchanged.
        gt_labels (list[Tensor]): Per-image labels; the device of the first
            entry is used for all mask tensors.
        gt_bboxes_ignore: Forwarded to the RoI head unchanged.
        gt_masks (list): Per-image mask containers providing ``to_tensor``,
            ``width`` and ``height``. Required.
        proposals: Must be None.
        gt_semantic_seg (list | None): Per-image semantic maps, converted
            with ``sem2ins_masks`` when given.

    Returns:
        dict: RoI head losses updated with the RPN head losses.
    """
    super(TwoStageDetector, self).forward_train(img, img_metas)
    assert proposals is None, 'KNet does not support external proposals'
    assert gt_masks is not None

    # gt_masks and gt_semantic_seg are not padded when forming the batch.
    # batch_input_shape should be the same across images.
    pad_H, pad_W = img_metas[0]['batch_input_shape']
    assign_H = pad_H // self.mask_assign_stride
    assign_W = pad_W // self.mask_assign_stride
    assign_size = (assign_H, assign_W)

    def _to_assign_size(masks):
        # Bilinear resize of a (num, H, W) mask stack.
        return F.interpolate(
            masks[None], assign_size, mode='bilinear',
            align_corners=False)[0]

    resized_gt_masks = []
    gt_sem_seg = []
    gt_sem_cls = []
    for img_idx, gt_mask in enumerate(gt_masks):
        mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device)
        if gt_mask.width != pad_W or gt_mask.height != pad_H:
            # Zero-pad on the right / bottom up to the batch shape.
            padding = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height)
            mask_tensor = F.pad(mask_tensor, padding, value=0)

        if gt_semantic_seg is not None:
            # gt_semantic_seg is padded by 255 and zero indicates the first
            # class.
            sem_labels, sem_seg = sem2ins_masks(
                gt_semantic_seg[img_idx],
                num_thing_classes=self.num_thing_classes)
            if sem_seg.shape[0] == 0:
                # NOTE(review): the placeholder has as many rows as there
                # are instance masks, not zero - confirm this is intended.
                gt_sem_seg.append(
                    mask_tensor.new_zeros(
                        (mask_tensor.size(0), assign_H, assign_W)))
            else:
                gt_sem_seg.append(_to_assign_size(sem_seg))
            gt_sem_cls.append(sem_labels)

        if mask_tensor.shape[0] == 0:
            resized_gt_masks.append(
                mask_tensor.new_zeros(
                    (mask_tensor.size(0), assign_H, assign_W)))
        else:
            resized_gt_masks.append(_to_assign_size(mask_tensor))

    # Without semantic annotations the heads receive None instead of lists
    # (only once at least one image has been processed).
    if gt_semantic_seg is None and len(resized_gt_masks) > 0:
        gt_sem_seg = None
        gt_sem_cls = None
    gt_masks = resized_gt_masks

    x = self.extract_feat(img)
    (rpn_losses, proposal_feats, x_feats, mask_preds,
     cls_scores) = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                               gt_labels, gt_sem_seg,
                                               gt_sem_cls)

    losses = self.roi_head.forward_train(
        x_feats,
        proposal_feats,
        mask_preds,
        cls_scores,
        img_metas,
        gt_masks,
        gt_labels,
        gt_bboxes_ignore=gt_bboxes_ignore,
        gt_bboxes=gt_bboxes,
        gt_sem_seg=gt_sem_seg,
        gt_sem_cls=gt_sem_cls,
        imgs_whwh=None)
    losses.update(rpn_losses)
    return losses
@MATCH_COST.register_module()
class DiceCost(object):
    """Matching cost based on the Dice coefficient between masks.

    Args:
        weight (int | float, optional): Factor applied to the cost.
        pred_act (bool): Whether to activate the predictions before
            computing the cost.
        act_mode (str): ``'sigmoid'`` applies a sigmoid; any other value
            applies a softmax over dim 0. Only used when ``pred_act`` is
            True.
        eps (float): Value added to both squared sums in the denominator.
    """

    def __init__(self,
                 weight=1.,
                 pred_act=False,
                 act_mode='sigmoid',
                 eps=1e-3):
        self.weight = weight
        self.pred_act = pred_act
        self.act_mode = act_mode
        self.eps = eps

    def dice_loss(self, input, target, eps=1e-3):
        """Return the negated pairwise Dice coefficient.

        Args:
            input (Tensor): Predictions; flattened to [num_pred, -1].
            target (Tensor): Targets; flattened to [num_gt, -1] and cast to
                float.
            eps (float): Value added to both squared sums.

        Returns:
            Tensor: Matrix of shape [num_pred, num_gt] holding
            ``-2 * <p, t> / (|p|^2 + |t|^2)``.
        """
        input = input.reshape(input.size()[0], -1)
        target = target.reshape(target.size()[0], -1).float()
        # einsum saves 10x memory compared with
        # torch.sum(input[:, None] * target[None, ...], -1)
        a = torch.einsum('nh,mh->nm', input, target)
        b = torch.sum(input * input, 1) + eps
        c = torch.sum(target * target, 1) + eps
        d = (2 * a) / (b[:, None] + c[None, ...])
        # The constant 1 of (1 - dice) does not affect the matching, so it
        # is omitted.
        return -d

    def __call__(self, mask_preds, gt_masks):
        """Compute the weighted Dice cost.

        Args:
            mask_preds (Tensor): Predicted masks, one per dim-0 entry.
            gt_masks (Tensor): Ground-truth masks, one per dim-0 entry.

        Returns:
            Tensor: Cost matrix of shape [num_pred, num_gt], scaled by
            ``weight``.
        """
        if self.pred_act and self.act_mode == 'sigmoid':
            mask_preds = mask_preds.sigmoid()
        elif self.pred_act:
            mask_preds = mask_preds.softmax(dim=0)
        dice_cost = self.dice_loss(mask_preds, gt_masks, self.eps)
        return dice_cost * self.weight
The targets don't include the no_object, so generally there are more predictions than targets. After the one-to-one matching, the un-matched are treated as backgrounds. Thus each query prediction will be assigned with `0` or a positive integer indicating the ground truth index: - 0: negative sample, no assigned gt - positive integer: positive sample, index (1-based) of assigned gt Args: cls_weight (int | float, optional): The scale factor for classification cost. Default 1.0. bbox_weight (int | float, optional): The scale factor for regression L1 cost. Default 1.0. iou_weight (int | float, optional): The scale factor for regression iou cost. Default 1.0. iou_calculator (dict | optional): The config for the iou calculation. Default type `BboxOverlaps2D`. iou_mode (str | optional): "iou" (intersection over union), "iof" (intersection over foreground), or "giou" (generalized intersection over union). Default "giou". """ def __init__(self, cls_cost=dict(type='ClassificationCost', weight=1.), mask_cost=dict(type='SigmoidCost', weight=1.0), dice_cost=dict(), boundary_cost=None, topk=1): self.cls_cost = build_match_cost(cls_cost) self.mask_cost = build_match_cost(mask_cost) self.dice_cost = build_match_cost(dice_cost) if boundary_cost is not None: self.boundary_cost = build_match_cost(boundary_cost) else: self.boundary_cost = None self.topk = topk def assign(self, bbox_pred, cls_pred, gt_bboxes, gt_labels, img_meta=None, gt_bboxes_ignore=None, eps=1e-7): """Computes one-to-one matching based on the weighted costs. This method assign each query prediction to a ground truth or background. The `assigned_gt_inds` with -1 means don't care, 0 means negative sample, and positive number is the index (1-based) of assigned gt. The assignment is done in the following steps, the order matters. 1. assign every prediction to -1 2. compute the weighted costs 3. do Hungarian matching on CPU based on the costs 4. 
assign all to 0 (background) first, then for each matched pair between predictions and gts, treat this prediction as foreground and assign the corresponding gt index (plus 1) to it. Args: bbox_pred (Tensor): Predicted boxes with normalized coordinates (cx, cy, w, h), which are all in range [0, 1]. Shape [num_query, 4]. cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. gt_bboxes (Tensor): Ground truth boxes with unnormalized coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). img_meta (dict): Meta information for current image. gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are labelled as `ignored`. Default None. eps (int | float, optional): A value added to the denominator for numerical stability. Default 1e-7. Returns: :obj:`AssignResult`: The assigned result. """ assert gt_bboxes_ignore is None, \ 'Only case when gt_bboxes_ignore is None is supported.' num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) # 1. assign -1 by default assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long) assigned_labels = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long) if num_gts == 0 or num_bboxes == 0: # No ground truth or boxes, return empty assignment if num_gts == 0: # No ground truth, assign all to background assigned_gt_inds[:] = 0 return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels) # 2. compute the weighted costs # classification and bboxcost. 
if self.cls_cost.weight != 0 and cls_pred is not None: cls_cost = self.cls_cost(cls_pred, gt_labels) else: cls_cost = 0 if self.mask_cost.weight != 0: reg_cost = self.mask_cost(bbox_pred, gt_bboxes) else: reg_cost = 0 if self.dice_cost.weight != 0: dice_cost = self.dice_cost(bbox_pred, gt_bboxes) else: dice_cost = 0 if self.boundary_cost is not None and self.boundary_cost.weight != 0: b_cost = self.boundary_cost(bbox_pred, gt_bboxes) else: b_cost = 0 cost = cls_cost + reg_cost + dice_cost + b_cost # 3. do Hungarian matching on CPU using linear_sum_assignment cost = cost.detach().cpu() if linear_sum_assignment is None: raise ImportError('Please run "pip install scipy" ' 'to install scipy first.') if self.topk == 1: matched_row_inds, matched_col_inds = linear_sum_assignment(cost) else: topk_matched_row_inds = [] topk_matched_col_inds = [] for i in range(self.topk): matched_row_inds, matched_col_inds = linear_sum_assignment( cost) topk_matched_row_inds.append(matched_row_inds) topk_matched_col_inds.append(matched_col_inds) cost[matched_row_inds] = 1e10 matched_row_inds = np.concatenate(topk_matched_row_inds) matched_col_inds = np.concatenate(topk_matched_col_inds) matched_row_inds = torch.from_numpy(matched_row_inds).to( bbox_pred.device) matched_col_inds = torch.from_numpy(matched_col_inds).to( bbox_pred.device) # 4. 
assign backgrounds and foregrounds # assign all indices to backgrounds first assigned_gt_inds[:] = 0 # assign foregrounds based on matching results assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels) ================================================ FILE: knet_vis/det/mask_pseudo_sampler.py ================================================ import torch from mmdet.core.bbox import BaseSampler, SamplingResult from mmdet.core.bbox.builder import BBOX_SAMPLERS class MaskSamplingResult(SamplingResult): """Bbox sampling result. Example: >>> # xdoctest: +IGNORE_WANT >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA >>> self = SamplingResult.random(rng=10) >>> print(f'self = {self}') self = """ def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result, gt_flags): self.pos_inds = pos_inds self.neg_inds = neg_inds self.pos_masks = masks[pos_inds] self.neg_masks = masks[neg_inds] self.pos_is_gt = gt_flags[pos_inds] self.num_gts = gt_masks.shape[0] self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 if gt_masks.numel() == 0: # hack for index error case assert self.pos_assigned_gt_inds.numel() == 0 self.pos_gt_masks = torch.empty_like(gt_masks) else: self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :] if assign_result.labels is not None: self.pos_gt_labels = assign_result.labels[pos_inds] else: self.pos_gt_labels = None @property def masks(self): """torch.Tensor: concatenated positive and negative boxes""" return torch.cat([self.pos_masks, self.neg_masks]) def __nice__(self): data = self.info.copy() data['pos_masks'] = data.pop('pos_masks').shape data['neg_masks'] = data.pop('neg_masks').shape parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] body = ' ' + ',\n '.join(parts) return '{\n' + body + '\n}' @property def info(self): """Returns a dictionary of info about the object.""" 
@BBOX_SAMPLERS.register_module()
class MaskPseudoSampler(BaseSampler):
    """Sampler stand-in that keeps every assigned mask (no sub-sampling)."""

    def __init__(self, **kwargs):
        # BaseSampler.__init__ is not called; all keyword arguments are
        # accepted and ignored.
        pass

    def _sample_pos(self, **kwargs):
        """Positive sampling is not implemented for this sampler."""
        raise NotImplementedError

    def _sample_neg(self, **kwargs):
        """Negative sampling is not implemented for this sampler."""
        raise NotImplementedError

    def sample(self, assign_result, masks, gt_masks, **kwargs):
        """Wrap the assignment into a sampling result without sampling.

        Args:
            assign_result (:obj:`AssignResult`): Assignment whose
                ``gt_inds`` is > 0 for positives and == 0 for negatives.
            masks (torch.Tensor): Predicted masks; dim 0 indexes the
                predictions.
            gt_masks (torch.Tensor): Ground-truth masks.

        Returns:
            :obj:`MaskSamplingResult`: Result holding all positive and
            negative indices.
        """
        gt_inds = assign_result.gt_inds
        positive = (gt_inds > 0).nonzero(as_tuple=False).squeeze(-1).unique()
        negative = (gt_inds == 0).nonzero(as_tuple=False).squeeze(-1).unique()
        # All-zero flags, one per prediction; MaskSamplingResult stores their
        # positive subset as ``pos_is_gt``.
        gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8)
        return MaskSamplingResult(positive, negative, masks, gt_masks,
                                  assign_result, gt_flags)
fuse_by_cat (bool, optional): [description]. Defaults to False. conv_cfg ([type], optional): [description]. Defaults to None. norm_cfg ([type], optional): [description]. Defaults to None. """ def __init__(self, in_channels, feat_channels, out_channels, start_level, end_level, cat_coors=False, positional_encoding=None, cat_coors_level=3, fuse_by_cat=False, return_list=False, upsample_times=3, with_pred=True, num_aux_convs=0, act_cfg=dict(type='ReLU', inplace=True), out_act_cfg=dict(type='ReLU'), conv_cfg=None, norm_cfg=None): super(SemanticFPNWrapper, self).__init__() self.in_channels = in_channels self.feat_channels = feat_channels self.start_level = start_level self.end_level = end_level assert start_level >= 0 and end_level >= start_level self.out_channels = out_channels self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.cat_coors = cat_coors self.cat_coors_level = cat_coors_level self.fuse_by_cat = fuse_by_cat self.return_list = return_list self.upsample_times = upsample_times self.with_pred = with_pred if positional_encoding is not None: self.positional_encoding = build_positional_encoding( positional_encoding) else: self.positional_encoding = None self.convs_all_levels = nn.ModuleList() for i in range(self.start_level, self.end_level + 1): convs_per_level = nn.Sequential() if i == 0: if i == self.cat_coors_level and self.cat_coors: chn = self.in_channels + 2 else: chn = self.in_channels if upsample_times == self.end_level - i: one_conv = ConvModule( chn, self.feat_channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, inplace=False) convs_per_level.add_module('conv' + str(i), one_conv) else: for i in range(self.end_level - upsample_times): one_conv = ConvModule( chn, self.feat_channels, 3, padding=1, stride=2, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, inplace=False) convs_per_level.add_module('conv' + str(i), one_conv) self.convs_all_levels.append(convs_per_level) 
continue for j in range(i): if j == 0: if i == self.cat_coors_level and self.cat_coors: chn = self.in_channels + 2 else: chn = self.in_channels one_conv = ConvModule( chn, self.feat_channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, inplace=False) convs_per_level.add_module('conv' + str(j), one_conv) if j < upsample_times - (self.end_level - i): one_upsample = nn.Upsample( scale_factor=2, mode='bilinear', align_corners=False) convs_per_level.add_module('upsample' + str(j), one_upsample) continue one_conv = ConvModule( self.feat_channels, self.feat_channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, inplace=False) convs_per_level.add_module('conv' + str(j), one_conv) if j < upsample_times - (self.end_level - i): one_upsample = nn.Upsample( scale_factor=2, mode='bilinear', align_corners=False) convs_per_level.add_module('upsample' + str(j), one_upsample) self.convs_all_levels.append(convs_per_level) if fuse_by_cat: in_channels = self.feat_channels * len(self.convs_all_levels) else: in_channels = self.feat_channels if self.with_pred: self.conv_pred = ConvModule( in_channels, self.out_channels, 1, padding=0, conv_cfg=self.conv_cfg, act_cfg=out_act_cfg, norm_cfg=self.norm_cfg) self.num_aux_convs = num_aux_convs self.aux_convs = nn.ModuleList() for i in range(num_aux_convs): self.aux_convs.append( ConvModule( in_channels, self.out_channels, 1, padding=0, conv_cfg=self.conv_cfg, act_cfg=out_act_cfg, norm_cfg=self.norm_cfg)) def init_weights(self): logger = get_root_logger() logger.info('Use normal intialization for semantic FPN') for m in self.modules(): if isinstance(m, nn.Conv2d): normal_init(m, std=0.01) def generate_coord(self, input_feat): x_range = torch.linspace( -1, 1, input_feat.shape[-1], device=input_feat.device) y_range = torch.linspace( -1, 1, input_feat.shape[-2], device=input_feat.device) y, x = torch.meshgrid(y_range, x_range) y = y.expand([input_feat.shape[0], 1, -1, -1]) 
def sem2ins_masks(gt_sem_seg, num_thing_classes=80):
    """Convert a semantic segmentation map to binary masks of stuff classes.

    Args:
        gt_sem_seg (torch.Tensor): Semantic map of shape (H, W) or
            (1, H, W). Class ids in [0, num_thing_classes - 1] are thing
            classes, ids >= num_thing_classes are stuff classes and 255 is
            the ignore label.
        num_thing_classes (int, optional): Number of thing classes.
            Defaults to 80.

    Returns:
        tuple[torch.Tensor]: (mask_labels, bin_masks). ``mask_labels`` is a
        long tensor with the sorted stuff class ids present in the map and
        ``bin_masks`` is a float tensor of shape (num_labels, H, W) holding
        one binary mask per label.
    """
    # gt_sem_seg is zero-started, where zero indicates the first class
    # since mmdet>=2.17.0, see more discussion in
    # https://mmdetection.readthedocs.io/en/latest/conventions.html#coco-panoptic-dataset  # noqa
    height, width = gt_sem_seg.shape[-2:]
    classes = torch.unique(gt_sem_seg)
    # Skip the ignore label 255 and the thing classes.
    labels = classes[(classes != 255) & (classes >= num_thing_classes)]

    if labels.numel() == 0:
        masks = gt_sem_seg.new_zeros(size=[0, height, width])
    else:
        # Compare against all kept classes at once. Folding any leading dims
        # into one axis keeps (1, H, W) inputs unchanged and makes a plain
        # (H, W) map yield (num_labels, H, W) instead of (num_labels * H, W).
        masks = gt_sem_seg.reshape(1, -1, height, width) == labels.view(
            -1, 1, 1, 1)
        masks = masks.reshape(-1, height, width)
    return labels.long(), masks.float()
self.num_params_in + self.num_params_out) self.input_layer = nn.Linear(self.in_channels, self.num_params_in + self.num_params_out, 1) self.input_gate = nn.Linear(self.in_channels, self.feat_channels, 1) self.update_gate = nn.Linear(self.in_channels, self.feat_channels, 1) if self.gate_norm_act: self.gate_norm = build_norm_layer(norm_cfg, self.feat_channels)[1] self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1] self.norm_out = build_norm_layer(norm_cfg, self.feat_channels)[1] self.input_norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1] self.input_norm_out = build_norm_layer(norm_cfg, self.feat_channels)[1] self.activation = build_activation_layer(act_cfg) self.fc_layer = nn.Linear(self.feat_channels, self.out_channels, 1) self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1] def forward(self, update_feature, input_feature): update_feature = update_feature.reshape(-1, self.in_channels) num_proposals = update_feature.size(0) parameters = self.dynamic_layer(update_feature) param_in = parameters[:, :self.num_params_in].view( -1, self.feat_channels) param_out = parameters[:, -self.num_params_out:].view( -1, self.feat_channels) input_feats = self.input_layer( input_feature.reshape(num_proposals, -1, self.feat_channels)) input_in = input_feats[..., :self.num_params_in] input_out = input_feats[..., -self.num_params_out:] gate_feats = input_in * param_in.unsqueeze(-2) if self.gate_norm_act: gate_feats = self.activation(self.gate_norm(gate_feats)) input_gate = self.input_norm_in(self.input_gate(gate_feats)) update_gate = self.norm_in(self.update_gate(gate_feats)) if self.gate_sigmoid: input_gate = input_gate.sigmoid() update_gate = update_gate.sigmoid() param_out = self.norm_out(param_out) input_out = self.input_norm_out(input_out) if self.activate_out: param_out = self.activation(param_out) input_out = self.activation(input_out) # param_out has shape (batch_size, feat_channels, out_channels) features = update_gate * param_out.unsqueeze( -2) 
+ input_gate * input_out features = self.fc_layer(features) features = self.fc_norm(features) features = self.activation(features) return features ================================================ FILE: knet_vis/tracker/__init__.py ================================================ ================================================ FILE: knet_vis/tracker/kernel_frame_head.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import (ConvModule, bias_init_with_prob, normal_init) from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean from mmdet.models.builder import HEADS, build_loss, build_neck from mmdet.models.losses import accuracy from mmdet.utils import get_root_logger @HEADS.register_module() class ConvKernelHeadVolume(nn.Module): def __init__(self, num_proposals=100, in_channels=256, out_channels=256, num_heads=8, num_cls_fcs=1, num_seg_convs=1, num_loc_convs=1, att_dropout=False, localization_fpn=None, conv_kernel_size=1, norm_cfg=dict(type='GN', num_groups=32), semantic_fpn=True, train_cfg=None, num_classes=80, xavier_init_kernel=False, kernel_init_std=0.01, use_binary=False, proposal_feats_with_obj=False, loss_mask=None, loss_seg=None, loss_cls=None, loss_dice=None, loss_rank=None, feat_downsample_stride=1, feat_refine_stride=1, feat_refine=True, with_embed=False, feat_embed_only=False, conv_normal_init=False, mask_out_stride=4, hard_target=False, num_thing_classes=80, num_stuff_classes=53, mask_assign_stride=4, ignore_label=255, thing_label_in_seg=0, cat_stuff_mask=False, **kwargs): super().__init__() self.num_proposals = num_proposals self.num_cls_fcs = num_cls_fcs self.train_cfg = train_cfg self.in_channels = in_channels self.out_channels = out_channels self.num_classes = num_classes self.proposal_feats_with_obj = proposal_feats_with_obj self.sampling = False self.localization_fpn = build_neck(localization_fpn) self.semantic_fpn = semantic_fpn self.norm_cfg 
= norm_cfg self.num_heads = num_heads self.att_dropout = att_dropout self.mask_out_stride = mask_out_stride self.hard_target = hard_target self.conv_kernel_size = conv_kernel_size self.xavier_init_kernel = xavier_init_kernel self.kernel_init_std = kernel_init_std self.feat_downsample_stride = feat_downsample_stride self.feat_refine_stride = feat_refine_stride self.conv_normal_init = conv_normal_init self.feat_refine = feat_refine self.with_embed = with_embed self.feat_embed_only = feat_embed_only self.num_loc_convs = num_loc_convs self.num_seg_convs = num_seg_convs self.use_binary = use_binary self.num_thing_classes = num_thing_classes self.num_stuff_classes = num_stuff_classes self.mask_assign_stride = mask_assign_stride self.ignore_label = ignore_label self.thing_label_in_seg = thing_label_in_seg self.cat_stuff_mask = cat_stuff_mask if loss_mask is not None: self.loss_mask = build_loss(loss_mask) else: self.loss_mask = loss_mask if loss_dice is not None: self.loss_dice = build_loss(loss_dice) else: self.loss_dice = loss_dice if loss_seg is not None: self.loss_seg = build_loss(loss_seg) else: self.loss_seg = loss_seg if loss_cls is not None: self.loss_cls = build_loss(loss_cls) else: self.loss_cls = loss_cls if loss_rank is not None: self.loss_rank = build_loss(loss_rank) else: self.loss_rank = loss_rank if self.train_cfg: self.assigner = build_assigner(self.train_cfg.assigner) # use PseudoSampler when sampling is False if self.sampling and hasattr(self.train_cfg, 'sampler'): sampler_cfg = self.train_cfg.sampler else: sampler_cfg = dict(type='MaskPseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self._init_layers() def _init_layers(self): """Initialize a sparse set of proposal boxes and proposal features.""" self.init_kernels = nn.Conv2d( self.out_channels, self.num_proposals, self.conv_kernel_size, padding=int(self.conv_kernel_size // 2), bias=False) if self.semantic_fpn: if self.loss_seg.use_sigmoid: self.conv_seg = 
    def _decode_init_proposals(self, img, img_metas, ref_img_metas):
        """Produce initial kernels, features and mask/seg predictions.

        Args:
            img: Input passed straight to ``self.localization_fpn``.
            img_metas (list): One entry per clip; only its length is used.
            ref_img_metas (list): ``len(ref_img_metas[0])`` is taken as the
                number of frames per clip.

        Returns:
            tuple: ``(proposal_feats, x_feats, mask_preds, cls_scores,
            seg_preds)``. ``cls_scores`` is always None and ``seg_preds`` is
            None when ``self.semantic_fpn`` is falsy.
        """
        num_imgs = len(img_metas)
        num_frames = len(ref_img_metas[0])
        # Necks whose class name ends with '3D' additionally receive the
        # clip layout.
        if self.localization_fpn.__class__.__name__.endswith('3D'):
            localization_feats = self.localization_fpn(img, num_imgs, num_frames)
        else:
            localization_feats = self.localization_fpn(img)
        # A list output carries [localization feats, semantic feats];
        # otherwise the same tensor serves both branches.
        if isinstance(localization_feats, list):
            loc_feats = localization_feats[0]
        else:
            loc_feats = localization_feats
        for conv in self.loc_convs:
            loc_feats = conv(loc_feats)
        if self.feat_downsample_stride > 1 and self.feat_refine:
            loc_feats = self.ins_downsample(loc_feats)
        # One output channel (mask logit map) per initial kernel.
        mask_preds = self.init_kernels(loc_feats)

        if self.semantic_fpn:
            if isinstance(localization_feats, list):
                semantic_feats = localization_feats[1]
            else:
                semantic_feats = localization_feats
            for conv in self.seg_convs:
                semantic_feats = conv(semantic_feats)
            if self.feat_downsample_stride > 1 and self.feat_refine:
                semantic_feats = self.seg_downsample(semantic_feats)
        else:
            semantic_feats = None

        if semantic_feats is not None:
            seg_preds = self.conv_seg(semantic_feats)
        else:
            seg_preds = None

        # The conv weights themselves are the initial proposal kernels; a new
        # leading dim of size num_imgs * num_frames is added by expand.
        proposal_feats = self.init_kernels.weight.clone()
        proposal_feats = proposal_feats[None].expand(num_imgs * num_frames,
                                                     *proposal_feats.size())

        if semantic_feats is not None:
            x_feats = semantic_feats + loc_feats
        else:
            x_feats = loc_feats

        if self.proposal_feats_with_obj:
            # Pool x_feats under each predicted mask: pixels with sigmoid
            # score <= 0.5 are zeroed, the rest are weighted by 1
            # (use_binary) or by their score.
            sigmoid_masks = mask_preds.sigmoid()
            nonzero_inds = sigmoid_masks > 0.5
            if self.use_binary:
                sigmoid_masks = nonzero_inds.float()
            else:
                sigmoid_masks = nonzero_inds.float() * sigmoid_masks
            obj_feats = torch.einsum('bnhw,bchw->bnc', sigmoid_masks, x_feats)

        # This head produces no classification scores.
        cls_scores = None

        if self.proposal_feats_with_obj:
            proposal_feats = proposal_feats + obj_feats.view(
                num_imgs * num_frames, self.num_proposals, self.out_channels, 1, 1)

        if self.cat_stuff_mask and not self.training:
            # Inference only: append the seg channels from index
            # num_thing_classes onward as extra masks and the matching
            # conv_seg weights as extra kernels.
            # NOTE(review): requires seg_preds / conv_seg, i.e. presumably
            # semantic_fpn enabled - confirm.
            mask_preds = torch.cat(
                [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1)
            stuff_kernels = self.conv_seg.weight[self.
                                                 num_thing_classes:].clone()
            stuff_kernels = stuff_kernels[None].expand(num_imgs * num_frames,
                                                       *stuff_kernels.size())
            proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

        return proposal_feats, x_feats, mask_preds, cls_scores, seg_preds
scaled_mask_preds[i]).reshape((num_bboxes, -1, w)) sampling_result = self.sampler.sample(assign_result, pred_masks_match, gt_masks_match) sampling_results.append(sampling_result) pred_masks_concat.append(pred_masks_match) pred_masks_concat = torch.stack(pred_masks_concat) mask_targets = self.get_targets( sampling_results, self.train_cfg, True, gt_sem_seg=gt_sem_seg, gt_sem_cls=gt_sem_cls) losses = self.loss(pred_masks_concat, cls_scores, scaled_seg_preds, None, *mask_targets) if self.cat_stuff_mask and self.training: mask_preds = torch.cat([mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1) stuff_kernels = self.conv_seg.weight[self.num_thing_classes:].clone() stuff_kernels = stuff_kernels[None].expand(num_imgs * num_frames, *stuff_kernels.size()) proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1) return losses, proposal_feats, x_feats, mask_preds, cls_scores def loss(self, mask_pred, cls_scores, seg_preds, proposal_feats, labels, label_weights, mask_targets, mask_weights, seg_targets, reduction_override=None, **kwargs): losses = dict() bg_class_ind = self.num_classes # note in spare rcnn num_gt == num_pos pos_inds = (labels >= 0) & (labels < bg_class_ind) num_preds = mask_pred.shape[0] * mask_pred.shape[1] if cls_scores is not None: num_pos = pos_inds.sum().float() avg_factor = reduce_mean(num_pos) assert mask_pred.shape[0] == cls_scores.shape[0] assert mask_pred.shape[1] == cls_scores.shape[1] losses['loss_rpn_cls'] = self.loss_cls( cls_scores.view(num_preds, -1), labels, label_weights, avg_factor=avg_factor, reduction_override=reduction_override) losses['rpn_pos_acc'] = accuracy( cls_scores.view(num_preds, -1)[pos_inds], labels[pos_inds]) bool_pos_inds = pos_inds.type(torch.bool) # 0~self.num_classes-1 are FG, self.num_classes is BG # do not perform bounding box regression for BG anymore. 
H, W = mask_pred.shape[-2:] if pos_inds.any(): pos_mask_pred = mask_pred.reshape(num_preds, H, W)[bool_pos_inds] pos_mask_targets = mask_targets[bool_pos_inds] losses['loss_rpn_mask'] = self.loss_mask(pos_mask_pred, pos_mask_targets) losses['loss_rpn_dice'] = self.loss_dice(pos_mask_pred, pos_mask_targets) if self.loss_rank is not None: batch_size = mask_pred.size(0) rank_target = mask_targets.new_full((batch_size, H, W), self.ignore_label, dtype=torch.long) rank_inds = pos_inds.view(batch_size, -1).nonzero(as_tuple=False) batch_mask_targets = mask_targets.view(batch_size, -1, H, W).bool() for i in range(batch_size): curr_inds = (rank_inds[:, 0] == i) curr_rank = rank_inds[:, 1][curr_inds] for j in curr_rank: rank_target[i][batch_mask_targets[i][j]] = j losses['loss_rpn_rank'] = self.loss_rank( mask_pred, rank_target, ignore_index=self.ignore_label) else: losses['loss_rpn_mask'] = mask_pred.sum() * 0 losses['loss_rpn_dice'] = mask_pred.sum() * 0 if self.loss_rank is not None: losses['loss_rank'] = mask_pred.sum() * 0 if seg_preds is not None: if self.loss_seg.use_sigmoid: cls_channel = seg_preds.shape[1] flatten_seg = seg_preds.view( -1, cls_channel, H * W).permute(0, 2, 1).reshape(-1, cls_channel) flatten_seg_target = seg_targets.view(-1) num_dense_pos = (flatten_seg_target >= 0) & ( flatten_seg_target < bg_class_ind) num_dense_pos = num_dense_pos.sum().float().clamp(min=1.0) losses['loss_rpn_seg'] = self.loss_seg( flatten_seg, flatten_seg_target, avg_factor=num_dense_pos) else: cls_channel = seg_preds.shape[1] flatten_seg = seg_preds.view(-1, cls_channel, H * W).permute( 0, 2, 1).reshape(-1, cls_channel) flatten_seg_target = seg_targets.view(-1) losses['loss_rpn_seg'] = self.loss_seg(flatten_seg, flatten_seg_target) return losses def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask, pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls, cfg): num_pos = pos_mask.size(0) num_neg = neg_mask.size(0) num_samples = num_pos + num_neg H, W = 
pos_mask.shape[-2:] # original implementation uses new_zeros since BG are set to be 0 # now use empty & fill because BG cat_id = num_classes, # FG cat_id = [0, num_classes-1] labels = pos_mask.new_full((num_samples, ), self.num_classes, dtype=torch.long) label_weights = pos_mask.new_zeros(num_samples) mask_targets = pos_mask.new_zeros(num_samples, H, W) mask_weights = pos_mask.new_zeros(num_samples, H, W) seg_targets = pos_mask.new_full((H, W), self.num_classes, dtype=torch.long) if gt_sem_cls is not None and gt_sem_seg is not None: gt_sem_seg = gt_sem_seg.bool() for sem_mask, sem_cls in zip(gt_sem_seg, gt_sem_cls): seg_targets[sem_mask] = sem_cls.long() if num_pos > 0: labels[pos_inds] = pos_gt_labels pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight label_weights[pos_inds] = pos_weight mask_targets[pos_inds, ...] = pos_gt_mask mask_weights[pos_inds, ...] = 1 for i in range(num_pos): seg_targets[pos_gt_mask[i].bool()] = pos_gt_labels[i] if num_neg > 0: label_weights[neg_inds] = 1.0 return labels, label_weights, mask_targets, mask_weights, seg_targets def get_targets(self, sampling_results, rpn_train_cfg, concat=True, gt_sem_seg=None, gt_sem_cls=None): num_imgs = len(sampling_results) pos_inds_list = [res.pos_inds for res in sampling_results] neg_inds_list = [res.neg_inds for res in sampling_results] pos_mask_list = [res.pos_masks for res in sampling_results] neg_mask_list = [res.neg_masks for res in sampling_results] pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results] pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results] if gt_sem_seg is None: gt_sem_seg = [None] * num_imgs gt_sem_cls = [None] * num_imgs results = multi_apply( self._get_target_single, pos_inds_list, neg_inds_list, pos_mask_list, neg_mask_list, pos_gt_mask_list, pos_gt_labels_list, gt_sem_seg, gt_sem_cls, cfg=rpn_train_cfg) (labels, label_weights, mask_targets, mask_weights, seg_targets) = results if concat: labels = torch.cat(labels, 0) label_weights = 
def __init__(self,
             mask_head=None,
             with_mask_init=False,
             num_stages=3,
             stage_loss_weights=(1, 1, 1),
             proposal_feature_channel=256,
             assign_stages=5,
             num_proposals=100,
             num_thing_classes=80,
             num_stuff_classes=53,
             query_merge_method='mean',
             train_cfg=None,
             test_cfg=None,
             pretrained=None,
             init_cfg=None,
             **kwargs):
    """Set up the video tracker head and its optional query-merge modules.

    Args:
        mask_head: Config (or list of configs) forwarded to the base class.
        with_mask_init: When true, a linear layer ``fc_mask`` is created.
        num_stages: Number of stages; must equal
            ``len(stage_loss_weights)``.
        stage_loss_weights: One loss weight per stage.
        proposal_feature_channel: Channel width of the query features.
        assign_stages: Stored as ``self.assign_stages``.
        num_proposals: Number of queries.
        num_thing_classes: Stored as ``self.num_thing_classes``.
        num_stuff_classes: Stored as ``self.num_stuff_classes``.
        query_merge_method: ``'mean'``, ``'attention'`` or
            ``'attention_pos'``; the two attention variants create extra
            learnable modules.
        train_cfg: Forwarded to the base class.
        test_cfg: Forwarded to the base class.
        pretrained: Accepted but not used in this constructor.
        init_cfg: Forwarded to the base class.
        **kwargs: Forwarded to the base class.
    """
    assert len(stage_loss_weights) == num_stages

    # Plain attributes are stored before the base constructor runs --
    # presumably because it calls the ``init_*`` hooks that read them;
    # confirm against ``BaseRoIHead``.
    self.num_stages = num_stages
    self.stage_loss_weights = stage_loss_weights
    self.assign_stages = assign_stages
    self.num_proposals = num_proposals
    self.num_thing_classes = num_thing_classes
    self.num_stuff_classes = num_stuff_classes
    self.query_merge_method = query_merge_method
    self.proposal_feature_channel = proposal_feature_channel
    super().__init__(
        mask_head=mask_head,
        train_cfg=train_cfg,
        test_cfg=test_cfg,
        init_cfg=init_cfg,
        **kwargs)

    merge_method = self.query_merge_method
    if merge_method in ('attention', 'attention_pos'):
        channels = self.proposal_feature_channel
        # Module creation order is kept: init_query, (query_pos), attn,
        # norm, ffn, ffn_norm.
        self.init_query = nn.Embedding(self.num_proposals, channels)
        if merge_method == 'attention_pos':
            self.query_pos = nn.Embedding(self.num_proposals, channels)
        head_count, attn_drop = 8, 0.
        self.query_merge_attn = MultiheadAttention(
            channels, head_count, attn_drop, batch_first=True)
        self.query_merge_norm = build_norm_layer(dict(type='LN'),
                                                 channels)[1]
        self.query_merge_ffn = FFN(
            channels,
            channels * 8,
            num_ffn_fcs=2,
            act_cfg=dict(type='ReLU', inplace=True),
            ffn_drop=0.)
        self.query_merge_ffn_norm = build_norm_layer(dict(type='LN'),
                                                     channels)[1]

    self.with_mask_init = with_mask_init
    if self.with_mask_init:
        self.fc_mask = nn.Linear(proposal_feature_channel,
                                 proposal_feature_channel)

    self.logger = get_root_logger()
""" raise NotImplementedError def _mask_forward(self, stage, x, object_feats, mask_preds): mask_head = self.mask_head[stage] cls_score, mask_preds, object_feats = mask_head( x, object_feats, mask_preds, img_metas=None, pos=self.query_pos.weight if self.query_merge_method == 'attention_pos' else None) if mask_head.mask_upsample_stride > 1 and (stage == self.num_stages - 1 or self.training): scaled_mask_preds = [ F.interpolate( mask_preds[i], scale_factor=mask_head.mask_upsample_stride, align_corners=False, mode='bilinear' ) for i in range(mask_preds.size(0)) ] scaled_mask_preds = torch.stack(scaled_mask_preds) else: scaled_mask_preds = mask_preds mask_results = dict( cls_score=cls_score, mask_preds=mask_preds, scaled_mask_preds=scaled_mask_preds, object_feats=object_feats ) return mask_results def _query_fusion(self, obj_feats, num_imgs, num_frames): if self.query_merge_method == 'mean': object_feats = obj_feats.mean(1) elif self.query_merge_method == 'attention': assert obj_feats.size()[-2:] == (1,1), "Only supporting kernel size = 1" obj_feats = obj_feats.reshape((num_imgs, num_frames * self.num_proposals, self.proposal_feature_channel)) init_query = self.init_query.weight.expand(num_imgs, *self.init_query.weight.size()) obj_feats = self.query_merge_attn(query=init_query, key=obj_feats, value=obj_feats) obj_feats = self.query_merge_norm(obj_feats) object_feats = self.query_merge_ffn_norm(self.query_merge_ffn(obj_feats)) object_feats = object_feats[..., None, None] elif self.query_merge_method == 'attention_pos': assert obj_feats.size()[-2:] == (1, 1), "Only supporting kernel size = 1" obj_feats = obj_feats.reshape((num_imgs, num_frames * self.num_proposals, self.proposal_feature_channel)) init_query = self.init_query.weight.expand(num_imgs, *self.init_query.weight.size()) query_pos = self.query_pos.weight.repeat(num_imgs, 1, 1) key_pos = query_pos.repeat(1, num_frames, 1) obj_feats = self.query_merge_attn(query=init_query, key=obj_feats, value=obj_feats, 
def _mask_init(self, object_feats, x_feats, num_imgs):
    """Predict initial masks by applying the queries as 1x1 conv kernels.

    Args:
        object_feats: Query features whose last two dims must be ``(1, 1)``.
        x_feats: Feature maps, indexed per image along dim 0.
        num_imgs: Number of images to process.

    Returns:
        Tensor: Per-image conv outputs stacked along a new leading dim.
    """
    assert object_feats.size()[-2:] == (1, 1), "Only supporting kernel size = 1"
    # Collapse the trailing (C, 1, 1) dims, project, then restore the 1x1
    # kernel dims so each query can be used as a conv weight.
    flat_queries = object_feats.flatten(-3, -1)
    kernels = self.fc_mask(flat_queries)[..., None, None]
    per_image = [
        F.conv2d(x_feats[idx], kernels[idx], padding=0)
        for idx in range(num_imgs)
    ]
    return torch.stack(per_image, dim=0)
scaled_mask_preds[i]).reshape((num_bboxes, -1, w)) sampling_result = self.mask_sampler[0].sample( assign_result, pred_masks_match, gt_masks_match) _sampling_results.append(sampling_result) _pred_masks_concat.append(pred_masks_match) pred_masks_concat = torch.stack(_pred_masks_concat) mask_targets = self.mask_head[0].get_targets( _sampling_results, self.train_cfg, True, gt_sem_seg=None, gt_sem_cls=None ) single_stage_loss = self.mask_head[0].loss( object_feats, None, pred_masks_concat, *mask_targets) for key, value in single_stage_loss.items(): all_stage_loss[f'tracker_init_{key}'] = value * self.stage_loss_weights[0] else: mask_preds = masks assign_results = [] for stage in range(self.num_stages): if stage == self.assign_stages: object_feats = object_feats[:, None].repeat(1, num_frames, 1, 1, 1, 1) mask_results = self._mask_forward(stage, x, object_feats, mask_preds) mask_preds = mask_results['mask_preds'] scaled_mask_preds = mask_results['scaled_mask_preds'] cls_score = mask_results['cls_score'] object_feats = mask_results['object_feats'] prev_mask_preds = scaled_mask_preds.detach() prev_cls_score = cls_score.detach() if cls_score is not None else None sampling_results = [] pred_masks_concat = [] if stage < self.assign_stages: assign_results = [] gt_masks_matches = [] for i in range(num_imgs): if stage < self.assign_stages: mask_for_assign = prev_mask_preds[i][:, :self.num_proposals] if prev_cls_score is not None: cls_for_assign = prev_cls_score[i][:self.num_proposals, :self.num_thing_classes] else: cls_for_assign = None assign_result, gt_masks_match = self.mask_assigner[stage].assign( mask_for_assign, cls_for_assign, ref_gt_masks[i], ref_gt_labels[i], ref_gt_instance_ids[i]) gt_masks_matches.append(gt_masks_match) assign_results.append(assign_result) num_bboxes = scaled_mask_preds.size(2) h, w = scaled_mask_preds.shape[-2:] pred_masks_match = torch.einsum('fqhw->qfhw', scaled_mask_preds[i]).reshape((num_bboxes, -1, w)) sampling_result = 
self.mask_sampler[stage].sample( assign_results[i], pred_masks_match, gt_masks_matches[i]) sampling_results.append(sampling_result) pred_masks_concat.append(pred_masks_match) pred_masks_concat = torch.stack(pred_masks_concat) mask_targets = self.mask_head[stage].get_targets( sampling_results, self.train_cfg, True, gt_sem_seg=None, gt_sem_cls=None ) single_stage_loss = self.mask_head[stage].loss( object_feats, cls_score, pred_masks_concat, *mask_targets) for key, value in single_stage_loss.items(): all_stage_loss[f'tracker_s{stage}_{key}'] = value * self.stage_loss_weights[stage] features = { "obj_feats": object_feats, "x_feats": x, "cls_scores": cls_score, "masks": mask_preds, } return all_stage_loss, features def simple_test(self, x, img_metas, ref_img_metas, cls_scores, masks, obj_feats, **kwargs): num_imgs = len(ref_img_metas) num_frames = len(ref_img_metas[0]) if len(obj_feats.size()) == 6: object_feats = self._query_fusion(obj_feats, num_imgs, num_frames) else: object_feats = obj_feats if self.with_mask_init: mask_preds = self._mask_init(object_feats, x, num_imgs) else: mask_preds = masks cls_score = None for stage in range(self.num_stages): if stage == self.assign_stages: object_feats = object_feats[:, None].repeat(1, num_frames, 1, 1, 1, 1) mask_results = self._mask_forward(stage, x, object_feats, mask_preds) mask_preds = mask_results['mask_preds'] scaled_mask_preds = mask_results['scaled_mask_preds'] cls_score = mask_results['cls_score'] if mask_results['cls_score'] is not None else cls_score object_feats = mask_results['object_feats'] num_classes = self.mask_head[-1].num_classes results = [] if self.mask_head[-1].loss_cls.use_sigmoid: cls_score = cls_score.sigmoid() else: cls_score = cls_score.softmax(-1)[..., :-1] for img_id in range(num_imgs): result = [] cls_score_per_img = cls_score[img_id] # h, quite tricky here, a bounding box can predict multiple results with different labels scores_per_img, topk_indices = cls_score_per_img.flatten(0, 1).topk( 
def init_weights(self):
    """Initialise the tracker, optionally from a prefixed checkpoint.

    When ``self.init_cfg`` is a ``Pretrained`` config with a non-``None``
    ``prefix``, the weights are loaded through ``mmcv.cnn.initialize``.
    Every other configuration defers to the parent class.

    A ``Pretrained`` config without a ``prefix`` key used to raise
    ``KeyError``; it now falls back to the parent initialiser.
    """
    init_cfg = self.init_cfg
    load_prefixed_checkpoint = (
        isinstance(init_cfg, dict)
        and init_cfg.get('type') == 'Pretrained'
        and init_cfg.get('prefix') is not None)

    if load_prefixed_checkpoint:
        # Imported lazily, exactly where it is needed.
        from mmcv.cnn import initialize
        self.logger.info("Customized loading the tracker.")
        initialize(self, init_cfg)
    else:
        super().init_weights()
loss_rank=None, feat_downsample_stride=1, feat_refine_stride=1, feat_refine=True, with_embed=False, feat_embed_only=False, conv_normal_init=False, mask_out_stride=4, hard_target=False, num_thing_classes=80, num_stuff_classes=53, mask_assign_stride=4, ignore_label=255, thing_label_in_seg=0, cat_stuff_mask=False, **kwargs): super().__init__() self.num_proposals = num_proposals self.num_cls_fcs = num_cls_fcs self.train_cfg = train_cfg self.in_channels = in_channels self.out_channels = out_channels self.num_classes = num_classes self.proposal_feats_with_obj = proposal_feats_with_obj self.sampling = False self.localization_fpn = build_neck(localization_fpn) self.semantic_fpn = semantic_fpn self.norm_cfg = norm_cfg self.num_heads = num_heads self.att_dropout = att_dropout self.mask_out_stride = mask_out_stride self.hard_target = hard_target self.conv_kernel_size = conv_kernel_size self.xavier_init_kernel = xavier_init_kernel self.kernel_init_std = kernel_init_std self.feat_downsample_stride = feat_downsample_stride self.feat_refine_stride = feat_refine_stride self.conv_normal_init = conv_normal_init self.feat_refine = feat_refine self.with_embed = with_embed self.feat_embed_only = feat_embed_only self.num_loc_convs = num_loc_convs self.num_seg_convs = num_seg_convs self.use_binary = use_binary self.num_thing_classes = num_thing_classes self.num_stuff_classes = num_stuff_classes self.mask_assign_stride = mask_assign_stride self.ignore_label = ignore_label self.thing_label_in_seg = thing_label_in_seg self.cat_stuff_mask = cat_stuff_mask if loss_mask is not None: self.loss_mask = build_loss(loss_mask) else: self.loss_mask = loss_mask if loss_dice is not None: self.loss_dice = build_loss(loss_dice) else: self.loss_dice = loss_dice if loss_seg is not None: self.loss_seg = build_loss(loss_seg) else: self.loss_seg = loss_seg if loss_cls is not None: self.loss_cls = build_loss(loss_cls) else: self.loss_cls = loss_cls if loss_rank is not None: self.loss_rank = 
def _init_layers(self):
    """Create the kernel conv, the semantic conv and the feature convs.

    Layers are created in the same order as before: ``init_kernels``,
    ``conv_seg``, the two optional downsample convs, ``loc_convs`` and
    finally ``seg_convs``.
    """
    kernel_size = self.conv_kernel_size
    self.init_kernels = nn.Conv2d(
        self.out_channels,
        self.num_proposals,
        kernel_size,
        padding=int(kernel_size // 2),
        bias=False)

    if self.semantic_fpn:
        # Without sigmoid the loss gets one extra output channel.
        seg_channels = (
            self.num_classes
            if self.loss_seg.use_sigmoid else self.num_classes + 1)
        self.conv_seg = nn.Conv2d(self.out_channels, seg_channels, 1)

    if self.feat_downsample_stride > 1 and self.feat_refine:
        refine_kwargs = dict(
            stride=self.feat_refine_stride,
            padding=1,
            norm_cfg=self.norm_cfg)
        self.ins_downsample = ConvModule(self.in_channels,
                                         self.out_channels, 3,
                                         **refine_kwargs)
        self.seg_downsample = ConvModule(self.in_channels,
                                         self.out_channels, 3,
                                         **refine_kwargs)

    self.loc_convs = nn.ModuleList([
        ConvModule(
            self.in_channels, self.out_channels, 1, norm_cfg=self.norm_cfg)
        for _ in range(self.num_loc_convs)
    ])
    self.seg_convs = nn.ModuleList([
        ConvModule(
            self.in_channels, self.out_channels, 1, norm_cfg=self.norm_cfg)
        for _ in range(self.num_seg_convs)
    ])
def _decode_init_proposals(self, img, img_metas, ref_img_metas):
    """Produce initial kernels, features and mask/semantic predictions.

    Args:
        img: Input passed straight to ``self.localization_fpn``.
        img_metas: Per-image meta info; only its length is used.
        ref_img_metas: Per-image lists of frame metas; only the length of
            the first entry is used (frames per clip).

    Returns:
        tuple: ``(proposal_feats, x_feats, mask_preds, cls_scores,
        seg_preds)``. ``cls_scores`` is always ``None``; ``seg_preds`` is
        ``None`` when no semantic features are available.
    """
    num_imgs = len(img_metas)
    num_frames = len(ref_img_metas[0])
    clip_batch = num_imgs * num_frames

    fpn = self.localization_fpn
    # Necks whose class name ends in '3D' also receive the clip layout.
    if fpn.__class__.__name__.endswith('3D'):
        fpn_out = fpn(img, num_imgs, num_frames)
    else:
        fpn_out = fpn(img)
    has_two_branches = isinstance(fpn_out, list)
    refine = self.feat_downsample_stride > 1 and self.feat_refine

    # Localization branch -> initial mask predictions.
    loc_feats = fpn_out[0] if has_two_branches else fpn_out
    for layer in self.loc_convs:
        loc_feats = layer(loc_feats)
    if refine:
        loc_feats = self.ins_downsample(loc_feats)
    mask_preds = self.init_kernels(loc_feats)

    # Optional semantic branch.
    semantic_feats = None
    if self.semantic_fpn:
        semantic_feats = fpn_out[1] if has_two_branches else fpn_out
        for layer in self.seg_convs:
            semantic_feats = layer(semantic_feats)
        if refine:
            semantic_feats = self.seg_downsample(semantic_feats)

    if semantic_feats is None:
        seg_preds = None
        x_feats = loc_feats
    else:
        seg_preds = self.conv_seg(semantic_feats)
        x_feats = semantic_feats + loc_feats

    # Every clip frame starts from the same learned kernels.
    kernels = self.init_kernels.weight.clone()
    proposal_feats = kernels[None].expand(clip_batch, *kernels.size())

    if self.proposal_feats_with_obj:
        soft_masks = mask_preds.sigmoid()
        hard_masks = (soft_masks > 0.5).float()
        pool_weights = (
            hard_masks if self.use_binary else hard_masks * soft_masks)
        obj_feats = torch.einsum('bnhw,bchw->bnc', pool_weights, x_feats)
        proposal_feats = proposal_feats + obj_feats.view(
            clip_batch, self.num_proposals, self.out_channels, 1, 1)

    cls_scores = None

    if self.cat_stuff_mask and not self.training:
        first_stuff = self.num_thing_classes
        mask_preds = torch.cat([mask_preds, seg_preds[:, first_stuff:]],
                               dim=1)
        stuff_kernels = self.conv_seg.weight[first_stuff:].clone()
        stuff_kernels = stuff_kernels[None].expand(clip_batch,
                                                   *stuff_kernels.size())
        proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

    return proposal_feats, x_feats, mask_preds, cls_scores, seg_preds
def loss(self,
         mask_pred,
         cls_scores,
         seg_preds,
         proposal_feats,
         labels,
         label_weights,
         mask_targets,
         mask_weights,
         seg_targets,
         reduction_override=None,
         **kwargs):
    """Compute the losses of the initial kernel (proposal) predictions.

    Args:
        mask_pred: Mask logits. They are reshaped to ``(B * N, H, W)``
            internally, so the first two dims are batch and proposals.
        cls_scores: Optional classification scores whose first two dims
            match ``mask_pred``; ``None`` skips the classification terms.
        seg_preds: Optional semantic logits with classes on dim 1;
            ``None`` skips the semantic term.
        proposal_feats: Not used here; kept for interface compatibility.
        labels: Flat per-proposal labels; ``self.num_classes`` marks
            background.
        label_weights: Per-proposal weights for the classification loss.
        mask_targets: Flat per-proposal target masks.
        mask_weights: Not used here; kept for interface compatibility.
        seg_targets: Semantic target map(s); flattened before use.
        reduction_override: Forwarded to ``self.loss_cls``.

    Returns:
        dict: Loss terms. The same keys are produced whether or not the
        batch contains positives, so values can be logged and reduced
        consistently across iterations and processes.
    """
    losses = dict()
    bg_class_ind = self.num_classes
    # Foreground labels are 0 .. num_classes - 1; num_classes is background.
    pos_inds = (labels >= 0) & (labels < bg_class_ind)
    num_preds = mask_pred.shape[0] * mask_pred.shape[1]
    H, W = mask_pred.shape[-2:]

    if cls_scores is not None:
        num_pos = pos_inds.sum().float()
        avg_factor = reduce_mean(num_pos)
        assert mask_pred.shape[0] == cls_scores.shape[0]
        assert mask_pred.shape[1] == cls_scores.shape[1]
        flat_scores = cls_scores.view(num_preds, -1)
        losses['loss_rpn_cls'] = self.loss_cls(
            flat_scores,
            labels,
            label_weights,
            avg_factor=avg_factor,
            reduction_override=reduction_override)
        losses['rpn_pos_acc'] = accuracy(flat_scores[pos_inds],
                                         labels[pos_inds])

    if pos_inds.any():
        pos_mask_pred = mask_pred.reshape(num_preds, H, W)[pos_inds]
        pos_mask_targets = mask_targets[pos_inds]
        losses['loss_rpn_mask'] = self.loss_mask(pos_mask_pred,
                                                 pos_mask_targets)
        losses['loss_rpn_dice'] = self.loss_dice(pos_mask_pred,
                                                 pos_mask_targets)

        if self.loss_rank is not None:
            batch_size = mask_pred.size(0)
            # Each pixel gets the index of the positive proposal covering
            # it; uncovered pixels keep the ignore label.
            rank_target = mask_targets.new_full((batch_size, H, W),
                                                self.ignore_label,
                                                dtype=torch.long)
            rank_inds = pos_inds.view(batch_size,
                                      -1).nonzero(as_tuple=False)
            batch_mask_targets = mask_targets.view(batch_size, -1, H,
                                                   W).bool()
            for i in range(batch_size):
                curr_rank = rank_inds[:, 1][rank_inds[:, 0] == i]
                for j in curr_rank:
                    rank_target[i][batch_mask_targets[i][j]] = j
            losses['loss_rpn_rank'] = self.loss_rank(
                mask_pred, rank_target, ignore_index=self.ignore_label)
    else:
        # Zero-valued placeholders keep the graph connected and the key
        # set stable when a batch has no positives.
        losses['loss_rpn_mask'] = mask_pred.sum() * 0
        losses['loss_rpn_dice'] = mask_pred.sum() * 0
        if self.loss_rank is not None:
            # Must use the same key as the positive branch above.
            losses['loss_rpn_rank'] = mask_pred.sum() * 0

    if seg_preds is not None:
        cls_channel = seg_preds.shape[1]
        # (B, C, H, W) -> (B * H * W, C)
        flatten_seg = seg_preds.view(-1, cls_channel, H * W).permute(
            0, 2, 1).reshape(-1, cls_channel)
        flatten_seg_target = seg_targets.view(-1)
        if self.loss_seg.use_sigmoid:
            dense_pos = (flatten_seg_target >= 0) & (
                flatten_seg_target < bg_class_ind)
            num_dense_pos = dense_pos.sum().float().clamp(min=1.0)
            losses['loss_rpn_seg'] = self.loss_seg(
                flatten_seg, flatten_seg_target, avg_factor=num_dense_pos)
        else:
            losses['loss_rpn_seg'] = self.loss_seg(flatten_seg,
                                                   flatten_seg_target)

    return losses
def get_targets(self,
                sampling_results,
                rpn_train_cfg,
                concat=True,
                gt_sem_seg=None,
                gt_sem_cls=None):
    """Collect the training targets of all sampling results.

    Args:
        sampling_results: One sampling result per sample; each provides
            ``pos_inds``, ``neg_inds``, ``pos_masks``, ``neg_masks``,
            ``pos_gt_masks`` and ``pos_gt_labels``.
        rpn_train_cfg: Passed to ``_get_target_single`` as ``cfg``.
        concat: When true, per-sample targets are merged into tensors.
        gt_sem_seg: Optional per-sample semantic masks. When ``None``,
            both semantic inputs are replaced by ``None`` placeholders.
        gt_sem_cls: Optional per-sample semantic class ids.

    Returns:
        tuple: ``(labels, label_weights, mask_targets, mask_weights,
        seg_targets)``. With ``concat`` the first four are concatenated
        and ``seg_targets`` is stacked along dim 0; otherwise each entry
        is the per-sample collection returned by ``multi_apply``.
    """
    sample_count = len(sampling_results)
    field_names = ('pos_inds', 'neg_inds', 'pos_masks', 'neg_masks',
                   'pos_gt_masks', 'pos_gt_labels')
    per_field = [[getattr(result, name) for result in sampling_results]
                 for name in field_names]

    if gt_sem_seg is None:
        gt_sem_seg = [None] * sample_count
        gt_sem_cls = [None] * sample_count

    targets = multi_apply(
        self._get_target_single,
        *per_field,
        gt_sem_seg,
        gt_sem_cls,
        cfg=rpn_train_cfg)
    labels, label_weights, mask_targets, mask_weights, seg_targets = targets

    if not concat:
        return (labels, label_weights, mask_targets, mask_weights,
                seg_targets)

    return (torch.cat(labels, 0), torch.cat(label_weights, 0),
            torch.cat(mask_targets, 0), torch.cat(mask_weights, 0),
            torch.stack(seg_targets, 0))
def init_assigner_sampler(self):
    """Build one mask assigner and one mask sampler per training stage.

    Both lists stay empty when ``self.train_cfg`` is ``None``.
    """
    self.mask_assigner, self.mask_sampler = [], []
    if self.train_cfg is None:
        return

    for stage_idx, stage_cfg in enumerate(self.train_cfg):
        assigner = build_assigner(stage_cfg.assigner)
        self.mask_assigner.append(assigner)
        # The sampler is built with this head as context, so the stage
        # index is published on ``self`` right before building it.
        self.current_stage = stage_idx
        sampler = build_sampler(stage_cfg.sampler, context=self)
        self.mask_sampler.append(sampler)
""" self.mask_head = nn.ModuleList() if not isinstance(mask_head, list): mask_head = [mask_head for _ in range(self.num_stages)] assert len(mask_head) == self.num_stages for head in mask_head: self.mask_head.append(build_head(head)) if self.recursive: for i in range(self.num_stages): self.mask_head[i] = self.mask_head[0] def _mask_forward(self, stage, x, object_feats, mask_preds, img_metas=None): mask_head = self.mask_head[stage] cls_score, mask_preds, object_feats = mask_head(x, object_feats, mask_preds, img_metas=img_metas) if mask_head.mask_upsample_stride > 1 and (stage == self.num_stages - 1 or self.training): scaled_mask_preds = F.interpolate( mask_preds, scale_factor=mask_head.mask_upsample_stride, align_corners=False, mode='bilinear' ) else: scaled_mask_preds = mask_preds mask_results = dict( cls_score=cls_score, mask_preds=mask_preds, scaled_mask_preds=scaled_mask_preds, object_feats=object_feats ) return mask_results def forward_train(self, x, proposal_feats, mask_preds, cls_score, ref_img_metas, gt_masks, gt_labels, gt_bboxes_ignore=None, imgs_whwh=None, gt_bboxes=None, gt_sem_seg=None, gt_sem_cls=None): num_imgs = len(ref_img_metas) num_frames = len(ref_img_metas[0]) if self.mask_head[0].mask_upsample_stride > 1: prev_mask_preds = F.interpolate( mask_preds.detach(), scale_factor=self.mask_head[0].mask_upsample_stride, mode='bilinear', align_corners=False) else: prev_mask_preds = mask_preds.detach() if cls_score is not None: prev_cls_score = cls_score.detach() else: prev_cls_score = None if self.hard_target: gt_masks = [x.bool().float() for x in gt_masks] else: gt_masks = gt_masks object_feats = proposal_feats all_stage_loss = {} all_stage_mask_results = [] assign_results = [] for stage in range(self.num_stages): mask_results = self._mask_forward(stage, x, object_feats, mask_preds, img_metas=None) all_stage_mask_results.append(mask_results) mask_preds = mask_results['mask_preds'] scaled_mask_preds = mask_results['scaled_mask_preds'] cls_score = 
mask_results['cls_score'] object_feats = mask_results['object_feats'] if self.post_assign: prev_mask_preds = scaled_mask_preds.detach() prev_cls_score = cls_score.detach() sampling_results = [] if stage < self.assign_stages: assign_results = [] for i in range(num_imgs): for j in range(num_frames): if stage < self.assign_stages: mask_for_assign = prev_mask_preds[i * num_frames + j][:self.num_proposals] if prev_cls_score is not None: cls_for_assign = prev_cls_score[i * num_frames + j][:self.num_proposals, :self.num_thing_classes] else: cls_for_assign = None assign_result = self.mask_assigner[stage].assign( mask_for_assign, cls_for_assign, gt_masks[i][j], gt_labels[i][:,1][gt_labels[i][:,0]==j], img_meta=None) assign_results.append(assign_result) sampling_result = self.mask_sampler[stage].sample( assign_results[i * num_frames + j], scaled_mask_preds[i * num_frames + j], gt_masks[i][j]) sampling_results.append(sampling_result) mask_targets = self.mask_head[stage].get_targets( sampling_results, self.train_cfg[stage], True, gt_sem_seg=gt_sem_seg, gt_sem_cls=gt_sem_cls) single_stage_loss = self.mask_head[stage].loss( object_feats, cls_score, scaled_mask_preds, *mask_targets, imgs_whwh=imgs_whwh) for key, value in single_stage_loss.items(): all_stage_loss[f's{stage}_{key}'] = value * \ self.stage_loss_weights[stage] if not self.post_assign: prev_mask_preds = scaled_mask_preds.detach() prev_cls_score = cls_score.detach() bs_nf, num_query, c, ks1, ks2 = object_feats.size() bs_nf2, c2, h, w = x.size() assert ks1 == ks2 assert bs_nf == bs_nf2 assert bs_nf == num_frames * num_imgs assert c == c2 features = { "obj_feats" : object_feats.reshape((num_imgs, num_frames, num_query, c, ks1, ks2)), # "x_feats":self.mask_head[-1].feat_transform(x).reshape((num_imgs, num_frames, c, h, w)), "x_feats": x.reshape((num_imgs, num_frames, c, h, w)), "cls_scores": cls_score.reshape((num_imgs, num_frames, num_query, self.num_classes)), "masks": mask_preds.reshape((num_imgs, num_frames, 
def simple_test(self,
                x,
                proposal_feats,
                mask_preds,
                cls_score,
                img_metas,
                ref_img_metas,
                imgs_whwh=None,
                rescale=False):
    """Run every refinement stage and decode per-frame instance results.

    Args:
        x (Tensor): Feature map; unpacked below as
            ``(num_imgs * num_frames, C, H, W)``.
        proposal_feats (Tensor): Initial kernels fed to the first stage.
        mask_preds (Tensor): Initial mask predictions fed to the first stage.
        cls_score (Tensor): Overwritten by the output of each stage.
        img_metas (list): Indexed per image and forwarded to
            ``get_seg_masks`` of the last mask head.
        ref_img_metas (list[list]): Only its lengths are read: the outer
            length is the number of clips, the inner one the clip length.
        imgs_whwh: Unused.
        rescale: Unused.

    Returns:
        tuple: ``(results, features)``. ``results`` has one entry per
        (image, frame) pair in row-major order; ``features`` is a dict with
        the last-stage tensors reshaped to a leading
        ``(num_imgs, num_frames)`` layout.

    Raises:
        NotImplementedError: If ``self.do_panoptic`` is set.
    """
    batch_size = len(ref_img_metas)
    clip_len = len(ref_img_metas[0])

    # Iteratively refine kernels and masks; only the last stage's outputs
    # are kept.
    kernels = proposal_feats
    for stage_idx in range(self.num_stages):
        stage_out = self._mask_forward(stage_idx, x, kernels, mask_preds)
        kernels = stage_out['object_feats']
        cls_score = stage_out['cls_score']
        mask_preds = stage_out['mask_preds']
        scaled_mask_preds = stage_out['scaled_mask_preds']

    final_head = self.mask_head[-1]
    num_classes = final_head.num_classes

    if final_head.loss_cls.use_sigmoid:
        cls_score = cls_score.sigmoid()
    else:
        # Softmax heads carry an extra trailing column that is dropped here.
        cls_score = cls_score.softmax(-1)[..., :-1]

    bs_nf, num_query, c, ks1, ks2 = kernels.size()
    bs_nf2, c2, h, w = x.size()
    assert ks1 == ks2
    assert bs_nf == bs_nf2
    assert bs_nf == clip_len * batch_size
    assert c == c2

    features = dict(
        obj_feats=kernels.reshape(
            (batch_size, clip_len, num_query, c, ks1, ks2)),
        x_feats=x.reshape((batch_size, clip_len, c, h, w)),
        cls_scores=cls_score.reshape(
            (batch_size, clip_len, num_query, self.num_classes)),
        masks=mask_preds.reshape((batch_size, clip_len, num_query, h, w)),
    )

    if self.do_panoptic:
        raise NotImplementedError

    results = []
    max_dets = self.test_cfg.max_per_img
    for flat_idx in range(batch_size * clip_len):
        # Frames are stored image-major, so the owning image is the
        # integer quotient of the flat index.
        img_id = flat_idx // clip_len

        # Top-k runs over the flattened (query, class) grid, so one query
        # may yield several detections with different labels.
        flat_scores = cls_score[flat_idx].flatten(0, 1)
        top_scores, top_inds = flat_scores.topk(max_dets, sorted=True)
        query_inds = top_inds // num_classes
        top_labels = top_inds % num_classes
        top_masks = scaled_mask_preds[flat_idx][query_inds]

        results.append(
            final_head.get_seg_masks(top_masks, top_labels, top_scores,
                                     self.test_cfg, img_metas[img_id]))
    return results, features
@HEADS.register_module()
class KernelUpdateHeadVideo(nn.Module):
    """Kernel update head operating on clips of frames.

    ``forward`` accepts two layouts of ``proposal_feat``:

    * a 6-D tensor: kernels are updated independently for every frame and
      no classification score is produced (requires ``with_cls=False``);
    * any other rank: one kernel per proposal is shared by all frames of a
      clip, the per-frame mask-pooled features are merged according to
      ``query_merge_method`` and a classification score is produced
      (requires ``with_cls=True``).

    Args:
        with_cls (bool): Build and use the classification branch.
        num_proposals (int): Expected number of proposals; checked in
            ``forward``.
        num_classes (int): Number of foreground classes; also used as the
            background label id.
        num_ffn_fcs (int): Number of FC layers in the FFN.
        num_heads (int): Heads of the kernel self-attention.
        num_cls_fcs (int): Number of Linear-LN-activation groups before
            ``fc_cls``.
        num_mask_fcs (int): Number of Linear-LN-activation groups before
            ``fc_mask``.
        feedforward_channels (int): Hidden width of the FFN.
        in_channels (int): Channel width of kernels and features.
        out_channels (int): Output width of ``fc_mask``.
        dropout (float): Dropout used by the attention and the FFN.
        mask_thr (float): Stored only; not read inside this class.
        act_cfg (dict): Activation config for the cls/mask FC stacks.
        ffn_act_cfg (dict): Activation config for the FFN.
        conv_kernel_size (int): Spatial size K of each dynamic kernel.
        feat_transform_cfg (dict | None): Extra ``ConvModule`` kwargs for an
            optional feature transform. An optional ``kernel_size`` entry
            (default 1) is read from it; the caller's dict is not modified.
        hard_mask_thr (float): Threshold applied to ``sigmoid(mask)`` to
            obtain the binary pooling masks.
        kernel_init (bool): Re-initialise ``fc_mask.weight`` with
            ``N(0, 0.01)`` in ``init_weights``.
        with_ffn (bool): Apply the FFN after the self-attention.
        feat_gather_stride (int): Stride of the optional feature transform.
        mask_transform_stride (int): Only ``1`` is supported; ``2`` makes
            ``forward`` raise ``NotImplementedError``.
        mask_upsample_stride (int): Stored for use by the owning iterative
            head.
        num_thing_classes (int): Number of thing classes.
        num_stuff_classes (int): Number of stuff classes.
        ignore_label (int): Ignore index for the optional rank loss.
        query_merge_method (str): ``'mean'``, ``'attention'`` or
            ``'attention_pos'``.
        kernel_updator_cfg (dict): Config of the kernel update module.
        loss_rank (dict | None): Optional rank loss config.
        loss_mask (dict): Mask loss config.
        loss_dice (dict): Dice loss config.
        loss_cls (dict): Classification loss config.
        mask_out_stride, relative_coors, relative_coors_off,
        mask_assign_stride, thing_label_in_seg: Stored as attributes only.
    """

    def __init__(self,
                 with_cls=True,
                 num_proposals=100,
                 num_classes=80,
                 num_ffn_fcs=2,
                 num_heads=8,
                 num_cls_fcs=1,
                 num_mask_fcs=3,
                 feedforward_channels=2048,
                 in_channels=256,
                 out_channels=256,
                 dropout=0.0,
                 mask_thr=0.5,
                 act_cfg=dict(type='ReLU', inplace=True),
                 ffn_act_cfg=dict(type='ReLU', inplace=True),
                 conv_kernel_size=3,
                 feat_transform_cfg=None,
                 hard_mask_thr=0.5,
                 kernel_init=False,
                 with_ffn=True,
                 mask_out_stride=4,
                 relative_coors=False,
                 relative_coors_off=False,
                 feat_gather_stride=1,
                 mask_transform_stride=1,
                 mask_upsample_stride=1,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 # query fusion
                 query_merge_method='mean',
                 kernel_updator_cfg=dict(
                     type='DynamicConv',
                     in_channels=256,
                     feat_channels=64,
                     out_channels=256,
                     input_feat_shape=1,
                     act_cfg=dict(type='ReLU', inplace=True),
                     norm_cfg=dict(type='LN')),
                 loss_rank=None,
                 loss_mask=dict(
                     type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
                 loss_dice=dict(type='DiceLoss', loss_weight=3.0),
                 loss_cls=dict(
                     type='FocalLoss',
                     use_sigmoid=True,
                     gamma=2.0,
                     alpha=0.25,
                     loss_weight=2.0)):
        super().__init__()
        self.num_proposals = num_proposals
        self.num_classes = num_classes
        self.loss_cls = build_loss(loss_cls)
        self.loss_mask = build_loss(loss_mask)
        self.loss_dice = build_loss(loss_dice)
        if loss_rank is not None:
            self.loss_rank = build_loss(loss_rank)
        else:
            self.loss_rank = loss_rank

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.mask_thr = mask_thr
        self.fp16_enabled = False
        self.dropout = dropout

        self.num_heads = num_heads
        self.hard_mask_thr = hard_mask_thr
        self.kernel_init = kernel_init
        self.with_ffn = with_ffn
        self.mask_out_stride = mask_out_stride
        self.relative_coors = relative_coors
        self.relative_coors_off = relative_coors_off
        self.conv_kernel_size = conv_kernel_size
        self.feat_gather_stride = feat_gather_stride
        self.mask_transform_stride = mask_transform_stride
        self.mask_upsample_stride = mask_upsample_stride

        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.ignore_label = ignore_label
        self.thing_label_in_seg = thing_label_in_seg

        # Self-attention over the flattened K*K*C kernels.
        self.attention = MultiheadAttention(in_channels * conv_kernel_size**2,
                                            num_heads, dropout)
        self.attention_norm = build_norm_layer(
            dict(type='LN'), in_channels * conv_kernel_size**2)[1]

        self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg)

        if feat_transform_cfg is not None:
            # Work on a copy: the same config object may be handed to
            # several heads, and popping from it would silently change the
            # kernel size seen by every head built after the first one.
            feat_transform_cfg = dict(feat_transform_cfg)
            kernel_size = feat_transform_cfg.pop('kernel_size', 1)
            self.feat_transform = ConvModule(
                in_channels,
                in_channels,
                kernel_size,
                stride=feat_gather_stride,
                padding=int(feat_gather_stride // 2),
                **feat_transform_cfg)
        else:
            self.feat_transform = None

        if self.with_ffn:
            self.ffn = FFN(
                in_channels,
                feedforward_channels,
                num_ffn_fcs,
                act_cfg=ffn_act_cfg,
                ffn_drop=dropout)
            self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]

        self.with_cls = with_cls
        if self.with_cls:
            self.cls_fcs = nn.ModuleList()
            for _ in range(num_cls_fcs):
                self.cls_fcs.append(
                    nn.Linear(in_channels, in_channels, bias=False))
                self.cls_fcs.append(
                    build_norm_layer(dict(type='LN'), in_channels)[1])
                self.cls_fcs.append(build_activation_layer(act_cfg))

            # A softmax classifier needs one extra (background) logit.
            if self.loss_cls.use_sigmoid:
                self.fc_cls = nn.Linear(in_channels, self.num_classes)
            else:
                self.fc_cls = nn.Linear(in_channels, self.num_classes + 1)

        # query fusion: both attention variants use the same modules; they
        # differ only in whether positional terms are passed in forward().
        self.query_merge_method = query_merge_method
        if (self.query_merge_method in ('attention', 'attention_pos')
                and self.with_cls):
            _num_head = 8
            _drop_out = 0.
            self.query_merge_attn = MultiheadAttention(
                self.in_channels, _num_head, _drop_out, batch_first=True)
            self.query_merge_norm = build_norm_layer(
                dict(type='LN'), self.in_channels)[1]
            self.query_merge_ffn = FFN(
                self.in_channels,
                self.in_channels * 8,
                num_ffn_fcs=2,
                act_cfg=dict(type='ReLU', inplace=True),
                ffn_drop=0.)
            self.query_merge_ffn_norm = build_norm_layer(
                dict(type='LN'), self.in_channels)[1]

        self.mask_fcs = nn.ModuleList()
        for _ in range(num_mask_fcs):
            self.mask_fcs.append(
                nn.Linear(in_channels, in_channels, bias=False))
            self.mask_fcs.append(
                build_norm_layer(dict(type='LN'), in_channels)[1])
            self.mask_fcs.append(build_activation_layer(act_cfg))

        self.fc_mask = nn.Linear(in_channels, out_channels)

    def init_weights(self):
        """Xavier-initialise all weight matrices and set special biases.

        Parameters with at most one dimension (biases, LayerNorm
        parameters) keep their default initialisation. When the
        classification loss is sigmoid based, the bias of ``fc_cls`` is set
        from a prior probability of 0.01.
        """
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # ``fc_cls`` only exists when the classification branch was built,
        # so guard against heads created with ``with_cls=False``.
        fc_cls = getattr(self, 'fc_cls', None)
        if self.loss_cls.use_sigmoid and fc_cls is not None:
            bias_init = bias_init_with_prob(0.01)
            nn.init.constant_(fc_cls.bias, bias_init)
        if self.kernel_init:
            logger = get_root_logger()
            logger.info(
                'mask kernel in mask head is normal initialized by std 0.01')
            nn.init.normal_(self.fc_mask.weight, mean=0, std=0.01)

    def forward(self,
                x,
                proposal_feat,
                mask_preds,
                prev_cls_score=None,
                mask_shape=None,
                img_metas=None,
                pos=None):
        """Update the kernels from mask-pooled features and predict masks.

        Args:
            x (Tensor): Clip features, ``(B, num_frames, C, H, W)``.
            proposal_feat (Tensor): Kernels. A 6-D tensor selects per-frame
                mode; any other rank selects shared-kernel mode, where the
                tensor is reshaped to ``(B, num_proposals, in_channels, -1)``.
            mask_preds (Tensor): Previous mask logits,
                ``(B, num_frames, num_proposals, h, w)``.
            prev_cls_score: Unused.
            mask_shape (tuple | None): Must be ``None`` or have
                ``mask_shape[0] == H``; resizing is not implemented.
            img_metas: Unused.
            pos (Tensor | None): Positional term for the queries; required
                when ``query_merge_method == 'attention_pos'``.

        Returns:
            tuple: ``(cls_score, new_mask_preds, obj_feat)``.
            ``cls_score`` is ``(B, num_proposals, -1)`` in shared-kernel
            mode and ``None`` in per-frame mode. ``new_mask_preds`` is
            ``(B, num_frames, num_proposals, H, W)``. ``obj_feat`` is
            ``(B, num_proposals, in_channels, K, K)`` in shared-kernel mode
            and ``(B, num_frames, num_proposals, in_channels, K, K)`` in
            per-frame mode.

        Raises:
            NotImplementedError: For ``mask_transform_stride == 2``, for a
                ``mask_shape`` that would require resizing, or for an
                unknown ``query_merge_method``.
        """
        if len(proposal_feat.size()) == 6:
            assert not self.with_cls
            is_gather_query = False
            N, _, num_proposals = proposal_feat.shape[:3]
        else:
            assert self.with_cls
            is_gather_query = True
            N, num_proposals = proposal_feat.shape[:2]
        assert self.num_proposals == num_proposals
        _, num_frames, C, H, W = x.size()
        if self.feat_transform is not None:
            x = self.feat_transform(x.reshape((N * num_frames, C, H, W)))
            # A strided transform changes the spatial size, so re-read it
            # before restoring the (B, frames, ...) layout.
            H, W = x.shape[-2:]
            x = x.reshape((N, num_frames, C, H, W))

        mask_h, mask_w = mask_preds.shape[-2:]
        if mask_h != H or mask_w != W:
            # Resize the masks (not the features) to the feature resolution:
            # fold the frame axis into the batch so that interpolate sees a
            # 4-D tensor whose channels are the proposals.
            gather_mask = F.interpolate(
                mask_preds.reshape(
                    (N * num_frames, num_proposals, mask_h, mask_w)),
                (H, W),
                align_corners=False,
                mode='bilinear').reshape(
                    (N, num_frames, num_proposals, H, W))
        else:
            gather_mask = mask_preds

        # Binarise the soft masks; pooling below is a masked sum.
        sigmoid_masks = gather_mask.sigmoid()
        nonzero_inds = sigmoid_masks > self.hard_mask_thr
        sigmoid_masks = nonzero_inds.float()

        if is_gather_query and self.query_merge_method not in (
                'mean', 'attention', 'attention_pos'):
            raise NotImplementedError

        # einsum is faster than bmm by 30%
        x_feat = torch.einsum('bfnhw,bfchw->bfnc', sigmoid_masks, x)
        if is_gather_query:
            if self.query_merge_method == 'mean':
                x_feat = x_feat.mean(1)
            else:
                # Every (frame, proposal) feature becomes a key/value; the
                # detached incoming kernels act as queries.
                x_feat = x_feat.reshape(
                    (N, num_frames * num_proposals, self.in_channels))
                assert proposal_feat.size()[-2:] == (
                    1, 1), "Only supporting kernel size = 1"
                init_query = proposal_feat.reshape(
                    N, num_proposals, self.in_channels).detach()
                if self.query_merge_method == 'attention_pos':
                    query_pos = pos.repeat(N, 1, 1)
                    key_pos = query_pos.repeat(1, num_frames, 1)
                    x_feat = self.query_merge_attn(
                        query=init_query,
                        key=x_feat,
                        value=x_feat,
                        query_pos=query_pos,
                        key_pos=key_pos)
                else:
                    x_feat = self.query_merge_attn(
                        query=init_query, key=x_feat, value=x_feat)
                x_feat = self.query_merge_norm(x_feat)
                x_feat = self.query_merge_ffn_norm(
                    self.query_merge_ffn(x_feat))

        # obj_feat in shape [B, N, C, K, K] -> [B, N, C, K*K] -> [B, N, K*K, C]
        if is_gather_query:
            proposal_feat = proposal_feat.reshape(
                N, num_proposals, self.in_channels, -1).permute(0, 1, 3, 2)
            obj_feat = self.kernel_update_conv(x_feat, proposal_feat)
        else:
            proposal_feat = proposal_feat.reshape(
                N * num_frames, num_proposals, self.in_channels,
                -1).permute(0, 1, 3, 2)
            obj_feat = self.kernel_update_conv(
                x_feat.reshape(N * num_frames, num_proposals, C),
                proposal_feat)
            # From here on every frame is treated as its own batch item.
            N *= num_frames

        # [B, N, K*K, C] -> [B, N, K*K*C] -> [N, B, K*K*C]
        obj_feat = obj_feat.reshape(N, num_proposals, -1).permute(1, 0, 2)
        obj_feat = self.attention_norm(self.attention(obj_feat))
        # [N, B, K*K*C] -> [B, N, K*K*C]
        obj_feat = obj_feat.permute(1, 0, 2)

        # obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
        obj_feat = obj_feat.reshape(N, num_proposals, -1, self.in_channels)

        # FFN
        if self.with_ffn:
            obj_feat = self.ffn_norm(self.ffn(obj_feat))

        mask_feat = obj_feat

        if is_gather_query:
            cls_feat = obj_feat.sum(-2)
            for cls_layer in self.cls_fcs:
                cls_feat = cls_layer(cls_feat)
            cls_score = self.fc_cls(cls_feat).view(N, num_proposals, -1)
        else:
            cls_score = None

        for reg_layer in self.mask_fcs:
            mask_feat = reg_layer(mask_feat)
        # [B, N, K*K, C] -> [B, N, C, K*K]
        mask_feat = self.fc_mask(mask_feat).permute(0, 1, 3, 2)

        if (self.mask_transform_stride == 2 and self.feat_gather_stride == 1):
            raise NotImplementedError(
                'mask_transform_stride == 2 is not supported')
        mask_x = x

        # [B, N, C, K*K] -> [B, N, C, K, K]: one conv filter per proposal.
        mask_feat = mask_feat.reshape(N, num_proposals, C,
                                      self.conv_kernel_size,
                                      self.conv_kernel_size)
        conv_padding = int(self.conv_kernel_size // 2)
        if is_gather_query:
            # The frames of a clip form the conv batch, so a single call
            # applies the clip's kernels to all of its frames.
            new_mask_preds = []
            for i in range(N):
                new_mask_preds.append(
                    F.conv2d(mask_x[i], mask_feat[i], padding=conv_padding))
            new_mask_preds = torch.stack(new_mask_preds, dim=0)
        else:
            # Undo the folding of the frame axis performed above.
            N = N // num_frames
            new_mask_preds = []
            for i in range(N):
                for j in range(num_frames):
                    new_mask_preds.append(
                        F.conv2d(
                            mask_x[i][j][None],
                            mask_feat[i * num_frames + j],
                            padding=conv_padding))
            new_mask_preds = torch.cat(new_mask_preds, dim=0)
            new_mask_preds = new_mask_preds.reshape(N, num_frames,
                                                    num_proposals, H, W)
        assert new_mask_preds.size() == (N, num_frames, num_proposals, H, W)

        if self.mask_transform_stride == 2:
            raise NotImplementedError(
                'mask_transform_stride == 2 is not supported')

        if mask_shape is not None and mask_shape[0] != H:
            raise NotImplementedError(
                'resizing the predicted masks to `mask_shape` is not '
                'supported')

        if is_gather_query:
            return cls_score, new_mask_preds, obj_feat.permute(
                0, 1, 3, 2).reshape(N, num_proposals, self.in_channels,
                                    self.conv_kernel_size,
                                    self.conv_kernel_size)
        else:
            return None, new_mask_preds, obj_feat.permute(0, 1, 3, 2).reshape(
                N, num_frames, num_proposals, self.in_channels,
                self.conv_kernel_size, self.conv_kernel_size)

    @force_fp32(apply_to=('cls_score', 'mask_pred'))
    def loss(self,
             object_feats,
             cls_score,
             mask_pred,
             labels,
             label_weights,
             mask_targets,
             mask_weights,
             imgs_whwh=None,
             reduction_override=None,
             **kwargs):
        """Compute classification, mask, dice and optional rank losses.

        Args:
            object_feats: Unused.
            cls_score (Tensor | None): Class logits whose first two
                dimensions match ``mask_pred``; ``None`` skips the
                classification terms.
            mask_pred (Tensor): Mask logits; the first two dimensions are
                flattened into the prediction axis, the last two are H, W.
            labels (Tensor): Flat target labels; ``self.num_classes`` marks
                background.
            label_weights (Tensor): Weights forwarded to ``loss_cls``.
            mask_targets (Tensor): Flat target masks aligned with ``labels``.
            mask_weights: Unused.
            imgs_whwh: Unused.
            reduction_override (str | None): Forwarded to ``loss_cls``.

        Returns:
            dict: ``loss_mask`` and ``loss_dice`` always; ``loss_cls`` and
            ``pos_acc`` when a non-empty ``cls_score`` is given;
            ``loss_rank`` when a rank loss is configured.
        """
        losses = dict()
        bg_class_ind = self.num_classes
        # note in spare rcnn num_gt == num_pos
        pos_inds = (labels >= 0) & (labels < bg_class_ind)
        num_pos = pos_inds.sum().float()
        avg_factor = reduce_mean(num_pos).clamp_(min=1.0)

        num_preds = mask_pred.shape[0] * mask_pred.shape[1]
        if cls_score is not None:
            assert mask_pred.shape[0] == cls_score.shape[0]
            assert mask_pred.shape[1] == cls_score.shape[1]
            if cls_score.numel() > 0:
                losses['loss_cls'] = self.loss_cls(
                    cls_score.view(num_preds, -1),
                    labels,
                    label_weights,
                    avg_factor=avg_factor,
                    reduction_override=reduction_override)
                losses['pos_acc'] = accuracy(
                    cls_score.view(num_preds, -1)[pos_inds], labels[pos_inds])
        if mask_pred is not None:
            bool_pos_inds = pos_inds.type(torch.bool)
            # 0~self.num_classes-1 are FG, self.num_classes is BG
            H, W = mask_pred.shape[-2:]
            if pos_inds.any():
                pos_mask_pred = mask_pred.reshape(num_preds, H,
                                                  W)[bool_pos_inds]
                pos_mask_targets = mask_targets[bool_pos_inds]
                losses['loss_mask'] = self.loss_mask(pos_mask_pred,
                                                     pos_mask_targets)
                losses['loss_dice'] = self.loss_dice(pos_mask_pred,
                                                     pos_mask_targets)

                if self.loss_rank is not None:
                    # Per-pixel target: the index of the positive proposal
                    # whose ground-truth mask covers the pixel; all other
                    # pixels are ignored.
                    batch_size = mask_pred.size(0)
                    rank_target = mask_targets.new_full(
                        (batch_size, H, W),
                        self.ignore_label,
                        dtype=torch.long)
                    rank_inds = pos_inds.view(batch_size,
                                              -1).nonzero(as_tuple=False)
                    batch_mask_targets = mask_targets.view(
                        batch_size, -1, H, W).bool()
                    for i in range(batch_size):
                        curr_inds = (rank_inds[:, 0] == i)
                        curr_rank = rank_inds[:, 1][curr_inds]
                        for j in curr_rank:
                            rank_target[i][batch_mask_targets[i][j]] = j
                    losses['loss_rank'] = self.loss_rank(
                        mask_pred, rank_target, ignore_index=self.ignore_label)
            else:
                # Zero-valued terms that still depend on ``mask_pred`` keep
                # the loss dict and the autograd graph shape stable when a
                # batch has no positives.
                losses['loss_mask'] = mask_pred.sum() * 0
                losses['loss_dice'] = mask_pred.sum() * 0
                if self.loss_rank is not None:
                    losses['loss_rank'] = mask_pred.sum() * 0

        return losses

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                           pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                           cfg):
        """Build the training targets for one sampling result.

        Args:
            pos_inds (Tensor): Indices of positive proposals.
            neg_inds (Tensor): Indices of negative proposals.
            pos_mask (Tensor): Masks of positives; only its size, dtype and
                device are used.
            neg_mask (Tensor): Masks of negatives; only ``size(0)`` is used.
            pos_gt_mask (Tensor): Ground-truth masks of the positives.
            pos_gt_labels (Tensor): Ground-truth labels of the positives.
            gt_sem_seg (Tensor | None): Stuff masks.
            gt_sem_cls (Tensor | None): Stuff class ids.
            cfg: Training config; ``cfg.pos_weight <= 0`` means weight 1.0.

        Returns:
            tuple[Tensor]: ``labels``, ``label_weights``, ``mask_targets``
            and ``mask_weights``. When both semantic inputs are given,
            ``self.num_stuff_classes`` extra rows are appended to each.
        """
        num_pos = pos_mask.size(0)
        num_neg = neg_mask.size(0)
        num_samples = num_pos + num_neg
        H, W = pos_mask.shape[-2:]
        # original implementation uses new_zeros since BG are set to be 0
        # now use empty & fill because BG cat_id = num_classes,
        # FG cat_id = [0, num_classes-1]
        labels = pos_mask.new_full((num_samples, ),
                                   self.num_classes,
                                   dtype=torch.long)
        label_weights = pos_mask.new_zeros((num_samples, self.num_classes))
        mask_targets = pos_mask.new_zeros(num_samples, H, W)
        mask_weights = pos_mask.new_zeros(num_samples, H, W)
        if num_pos > 0:
            labels[pos_inds] = pos_gt_labels
            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
            label_weights[pos_inds] = pos_weight
            mask_targets[pos_inds, ...] = pos_gt_mask
            mask_weights[pos_inds, ...] = 1

        if num_neg > 0:
            label_weights[neg_inds] = 1.0

        if gt_sem_cls is not None and gt_sem_seg is not None:
            sem_labels = pos_mask.new_full((self.num_stuff_classes, ),
                                           self.num_classes,
                                           dtype=torch.long)
            sem_targets = pos_mask.new_zeros(self.num_stuff_classes, H, W)
            sem_weights = pos_mask.new_zeros(self.num_stuff_classes, H, W)
            # Stuff row i is only supervised on its own class column.
            sem_stuff_weights = torch.eye(
                self.num_stuff_classes, device=pos_mask.device)
            sem_thing_weights = pos_mask.new_zeros(
                (self.num_stuff_classes, self.num_thing_classes))
            sem_label_weights = torch.cat(
                [sem_thing_weights, sem_stuff_weights], dim=-1)
            if len(gt_sem_cls) > 0:
                # Stuff class ids follow the thing ids, so subtracting the
                # thing count gives the row of each stuff class.
                sem_inds = gt_sem_cls - self.num_thing_classes
                sem_inds = sem_inds.long()
                sem_labels[sem_inds] = gt_sem_cls.long()
                sem_targets[sem_inds] = gt_sem_seg
                sem_weights[sem_inds] = 1

            # Instance proposals are not supervised on stuff columns.
            label_weights[:, self.num_thing_classes:] = 0
            labels = torch.cat([labels, sem_labels])
            label_weights = torch.cat([label_weights, sem_label_weights])
            mask_targets = torch.cat([mask_targets, sem_targets])
            mask_weights = torch.cat([mask_weights, sem_weights])

        return labels, label_weights, mask_targets, mask_weights

    def get_targets(self,
                    sampling_results,
                    rcnn_train_cfg,
                    concat=True,
                    gt_sem_seg=None,
                    gt_sem_cls=None):
        """Build training targets for a list of sampling results.

        Args:
            sampling_results (list): Objects exposing ``pos_inds``,
                ``neg_inds``, ``pos_masks``, ``neg_masks``, ``pos_gt_masks``
                and ``pos_gt_labels``.
            rcnn_train_cfg: Passed to ``_get_target_single`` as ``cfg``.
            concat (bool): Concatenate the per-result targets along dim 0.
            gt_sem_seg (list | None): Per-result stuff masks.
            gt_sem_cls (list | None): Per-result stuff class ids.

        Returns:
            tuple: ``labels``, ``label_weights``, ``mask_targets`` and
            ``mask_weights``; tensors when ``concat`` is true, otherwise
            lists of per-result tensors.
        """
        num_imgs = len(sampling_results)
        pos_inds_list = [res.pos_inds for res in sampling_results]
        neg_inds_list = [res.neg_inds for res in sampling_results]
        pos_mask_list = [res.pos_masks for res in sampling_results]
        neg_mask_list = [res.neg_masks for res in sampling_results]
        pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
        if gt_sem_seg is None:
            # multi_apply zips its arguments, so supply one placeholder per
            # sampling result.
            gt_sem_seg = [None] * num_imgs
            gt_sem_cls = [None] * num_imgs

        labels, label_weights, mask_targets, mask_weights = multi_apply(
            self._get_target_single,
            pos_inds_list,
            neg_inds_list,
            pos_mask_list,
            neg_mask_list,
            pos_gt_mask_list,
            pos_gt_labels_list,
            gt_sem_seg,
            gt_sem_cls,
            cfg=rcnn_train_cfg)
        if concat:
            labels = torch.cat(labels, 0)
            label_weights = torch.cat(label_weights, 0)
            mask_targets = torch.cat(mask_targets, 0)
            mask_weights = torch.cat(mask_weights, 0)
        return labels, label_weights, mask_targets, mask_weights

    def rescale_masks(self, masks_per_img, img_meta):
        """Map mask logits of one image back to the original image size.

        The logits are passed through a sigmoid, resized to
        ``img_meta['batch_input_shape']``, cropped to the first two entries
        of ``img_meta['img_shape']`` (removing padding) and finally resized
        to the first two entries of ``img_meta['ori_shape']``.

        Args:
            masks_per_img (Tensor): Mask logits, ``(num_masks, h, w)``.
            img_meta (dict): Must contain ``img_shape``,
                ``batch_input_shape`` and ``ori_shape``.

        Returns:
            Tensor: Mask probabilities at the original resolution.
        """
        h, w, _ = img_meta['img_shape']
        masks_per_img = F.interpolate(
            masks_per_img.unsqueeze(0).sigmoid(),
            size=img_meta['batch_input_shape'],
            mode='bilinear',
            align_corners=False)

        masks_per_img = masks_per_img[:, :, :h, :w]
        ori_shape = img_meta['ori_shape']
        seg_masks = F.interpolate(
            masks_per_img,
            size=ori_shape[:2],
            mode='bilinear',
            align_corners=False).squeeze(0)
        return seg_masks

    def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img,
                      test_cfg, img_meta):
        """Rescale, binarise and group the masks of one image by class.

        Args:
            masks_per_img (Tensor): Mask logits of the kept detections.
            labels_per_img (Tensor): Class index of every detection.
            scores_per_img (Tensor): Score of every detection.
            test_cfg: Provides the binarisation threshold ``mask_thr``.
            img_meta (dict): See ``rescale_masks``.

        Returns:
            tuple: ``(bbox_result, segm_result)`` as produced by
            ``segm2result``.
        """
        # resize mask predictions back
        seg_masks = self.rescale_masks(masks_per_img, img_meta)
        seg_masks = seg_masks > test_cfg.mask_thr
        bbox_result, segm_result = self.segm2result(seg_masks, labels_per_img,
                                                    scores_per_img)
        return bbox_result, segm_result

    def segm2result(self, mask_preds, det_labels, cls_scores):
        """Group masks and placeholder boxes by class.

        Args:
            mask_preds (Tensor): Binary masks, one per detection.
            det_labels (Tensor): Class index of every detection.
            cls_scores (Tensor): Score of every detection.

        Returns:
            tuple: ``(bbox_result, segm_result)``; both are lists with
            ``self.num_classes`` entries. Each ``bbox_result`` entry is an
            ``(n, 5)`` float32 array whose coordinates are zero and whose
            last column is the score; each ``segm_result`` entry is a list
            of numpy masks.
        """
        num_classes = self.num_classes
        segm_result = [[] for _ in range(num_classes)]
        mask_preds = mask_preds.cpu().numpy()
        det_labels = det_labels.cpu().numpy()
        cls_scores = cls_scores.cpu().numpy()
        num_ins = mask_preds.shape[0]
        # fake bboxes
        bboxes = np.zeros((num_ins, 5), dtype=np.float32)
        bboxes[:, -1] = cls_scores
        bbox_result = [bboxes[det_labels == i, :] for i in range(num_classes)]
        for idx in range(num_ins):
            segm_result[det_labels[idx]].append(mask_preds[idx])
        return bbox_result, segm_result

    def get_seg_masks_tracking(self, masks_per_img, labels_per_img,
                               scores_per_img, ids_per_img, test_cfg,
                               img_meta):
        """Rescale and binarise masks and pack them as tracking results.

        Args:
            masks_per_img (Tensor): Mask logits of the kept detections.
            labels_per_img (Tensor): Class index of every detection.
            scores_per_img (Tensor): Score of every detection.
            ids_per_img (Tensor): Track id of every detection.
            test_cfg: Provides the binarisation threshold ``mask_thr``.
            img_meta (dict): See ``rescale_masks``.

        Returns:
            tuple: The ``bbox_results`` and ``mask_results`` entries of the
            dict returned by ``outs2results``.
        """
        num_ins = masks_per_img.shape[0]
        # resize mask predictions back
        seg_masks = self.rescale_masks(masks_per_img, img_meta)
        seg_masks = seg_masks > test_cfg.mask_thr
        # fake bboxes
        bboxes = torch.zeros((num_ins, 5), dtype=torch.float32)
        bboxes[:, -1] = scores_per_img
        tracks = outs2results(
            bboxes=bboxes,
            labels=labels_per_img,
            masks=seg_masks,
            ids=ids_per_img,
            num_classes=self.num_classes,
        )
        return tracks['bbox_results'], tracks['mask_results']
def assign(self,
           bbox_pred,
           cls_pred,
           gt_bboxes,
           gt_labels,
           gt_instance_ids,
           img_meta=None,
           gt_bboxes_ignore=None,
           eps=1e-7):
    """Match query mask tubes to ground-truth instance tubes.

    Despite the box-style parameter names (kept for interface
    compatibility), this method works on masks. For every ground-truth
    instance a tube is built by stacking its mask in each frame, with an
    all-zero mask for frames where the instance is absent. Predicted and
    ground-truth tubes are then matched one-to-one with the Hungarian
    algorithm on the summed classification, mask, dice and optional
    boundary costs.

    In the returned ``AssignResult``, 0 marks an unmatched query
    (background) and a positive value is the 1-based index of the matched
    ground-truth instance.

    Args:
        bbox_pred (Tensor): Predicted masks,
            ``(num_frames, num_query, h, w)``.
        cls_pred (Tensor | None): Classification predictions passed to the
            classification cost; ``None`` disables that cost.
        gt_bboxes (Sequence[Tensor]): Ground-truth masks indexed as
            ``gt_bboxes[frame_id][k]``, where ``k`` is the position of the
            instance among that frame's rows of ``gt_instance_ids``.
        gt_labels (Tensor): Two columns, ``(frame_id, label)``.
        gt_instance_ids (Tensor): Two columns, ``(frame_id, instance_id)``,
            row-aligned per frame with ``gt_labels``.
        img_meta: Unused.
        gt_bboxes_ignore: Must be ``None``.
        eps: Unused.

    Returns:
        tuple: ``(assign_result, gt_masks_match)``. ``gt_masks_match``
        holds the ground-truth tubes with the frame axis folded into the
        height axis, ``(num_gts, num_frames * h, w)``. The same 2-tuple is
        returned when there are no ground truths or no queries.

    Raises:
        ImportError: If scipy is not installed.
    """
    assert gt_bboxes_ignore is None, \
        'Only case when gt_bboxes_ignore is None is supported.'
    instances = torch.unique(gt_instance_ids[:, 1])
    num_frames = bbox_pred.size(0)
    h, w = bbox_pred.shape[-2:]

    # The ids and labels present in a frame do not depend on the instance
    # being processed, so look them up once per frame.
    frame_instance_ids = []
    frame_label_ids = []
    for frame_id in range(num_frames):
        ids_in_frame = gt_instance_ids[gt_instance_ids[:, 0] == frame_id, 1]
        labels_in_frame = gt_labels[gt_labels[:, 0] == frame_id, 1]
        # Labels are looked up by the position of the instance id, so both
        # tables must list the same number of objects for the frame.
        assert len(ids_in_frame) == len(labels_in_frame)
        frame_instance_ids.append(ids_in_frame)
        frame_label_ids.append(labels_in_frame)

    gt_masks = []
    gt_label_list = []
    for instance_id in instances:
        gt_instance_frame_ids = gt_instance_ids[
            gt_instance_ids[:, 1] == instance_id, 0]
        instance_masks = []
        gt_label_id = None
        for frame_id in range(num_frames):
            if not (frame_id in gt_instance_frame_ids):
                # Instance absent in this frame: empty mask.
                gt_mask_frame = torch.zeros(
                    (h, w),
                    device=gt_instance_frame_ids.device,
                    dtype=torch.float)
            else:
                gt_index = torch.nonzero(
                    (frame_instance_ids[frame_id] == instance_id),
                    as_tuple=True)[0].item()
                gt_mask_frame = gt_bboxes[frame_id][gt_index]
                frame_label = frame_label_ids[frame_id][gt_index].item()
                if gt_label_id is None:
                    gt_label_id = frame_label
                # An instance must keep its class across the clip.
                assert gt_label_id == frame_label
            instance_masks.append(gt_mask_frame)
        gt_masks.append(torch.stack(instance_masks))
        gt_label_list.append(gt_label_id)

    num_gts, num_bboxes = len(instances), bbox_pred.size(1)
    if num_gts > 0:
        gt_masks = torch.stack(gt_masks)
        gt_labels_tensor = torch.tensor(
            gt_label_list, device=gt_masks.device, dtype=torch.long)
        gt_masks_match = gt_masks.reshape((num_gts, -1, w))
    else:
        # torch.stack cannot handle an empty list, so build the empty
        # targets explicitly.
        gt_labels_tensor = bbox_pred.new_zeros((0, ), dtype=torch.long)
        gt_masks_match = bbox_pred.new_zeros((0, num_frames * h, w))

    # 1. assign -1 by default
    assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
                                          -1,
                                          dtype=torch.long)
    assigned_labels = bbox_pred.new_full((num_bboxes, ),
                                         -1,
                                         dtype=torch.long)
    if num_gts == 0 or num_bboxes == 0:
        # No ground truth or boxes, return empty assignment
        if num_gts == 0:
            # No ground truth, assign all to background
            assigned_gt_inds[:] = 0
        # Same 2-tuple as the regular path so callers can always unpack.
        return AssignResult(
            num_gts, assigned_gt_inds, None,
            labels=assigned_labels), gt_masks_match

    # 2. compute the weighted costs
    # Fold the frame axis into the height axis so that every tube is
    # compared as a single tall mask.
    pred_masks_match = torch.einsum('fqhw->qfhw', bbox_pred).reshape(
        (num_bboxes, -1, w))

    if self.cls_cost.weight != 0 and cls_pred is not None:
        cls_cost = self.cls_cost(cls_pred, gt_labels_tensor)
    else:
        cls_cost = 0
    if self.mask_cost.weight != 0:
        reg_cost = self.mask_cost(pred_masks_match, gt_masks_match)
    else:
        reg_cost = 0
    if self.dice_cost.weight != 0:
        dice_cost = self.dice_cost(pred_masks_match, gt_masks_match)
    else:
        dice_cost = 0
    if self.boundary_cost is not None and self.boundary_cost.weight != 0:
        b_cost = self.boundary_cost(pred_masks_match, gt_masks_match)
    else:
        b_cost = 0
    cost = cls_cost + reg_cost + dice_cost + b_cost

    # 3. do Hungarian matching on CPU using linear_sum_assignment
    cost = cost.detach().cpu()
    if linear_sum_assignment is None:
        raise ImportError('Please run "pip install scipy" '
                          'to install scipy first.')
    if self.topk == 1:
        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
    else:
        # Repeat the matching, each time making the already matched
        # queries prohibitively expensive.
        topk_matched_row_inds = []
        topk_matched_col_inds = []
        for _ in range(self.topk):
            matched_row_inds, matched_col_inds = linear_sum_assignment(
                cost)
            topk_matched_row_inds.append(matched_row_inds)
            topk_matched_col_inds.append(matched_col_inds)
            cost[matched_row_inds] = 1e10
        matched_row_inds = np.concatenate(topk_matched_row_inds)
        matched_col_inds = np.concatenate(topk_matched_col_inds)

    matched_row_inds = torch.from_numpy(matched_row_inds).to(
        bbox_pred.device)
    matched_col_inds = torch.from_numpy(matched_col_inds).to(
        bbox_pred.device)

    # 4. assign backgrounds and foregrounds
    # assign all indices to backgrounds first
    assigned_gt_inds[:] = 0
    # assign foregrounds based on matching results
    assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
    assigned_labels[matched_row_inds] = gt_labels_tensor[matched_col_inds]
    return AssignResult(
        num_gts, assigned_gt_inds, None,
        labels=assigned_labels), gt_masks_match
""" def __init__(self, num_feats=64, temperature=10000, normalize=False, scale=None): super().__init__() self.num_pos_feats = num_feats self.temperature = temperature self.normalize = normalize if scale is not None and normalize is False: raise ValueError("normalize should be True if scale is passed") if scale is None: scale = 2 * math.pi self.scale = scale def forward(self, x, mask=None): # b, t, c, h, w assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead" if mask is None: mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool) not_mask = ~mask z_embed = not_mask.cumsum(1, dtype=torch.float32) y_embed = not_mask.cumsum(2, dtype=torch.float32) x_embed = not_mask.cumsum(3, dtype=torch.float32) if self.normalize: eps = 1e-6 z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device) dim_t_z = self.temperature ** (2 * (dim_t_z // 2) / (self.num_pos_feats * 2)) pos_x = x_embed[:, :, :, :, None] / dim_t pos_y = y_embed[:, :, :, :, None] / dim_t pos_z = z_embed[:, :, :, :, None] / dim_t_z pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) pos_z = torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) # b, t, c, h, w return pos ================================================ FILE: knet_vis/tracker/semantic_fpn_wrapper3D.py 
@NECKS.register_module()
class SemanticFPNWrapper3D(nn.Module):
    """Semantic-FPN style neck that fuses several feature levels.

    Every level in ``[start_level, end_level]`` gets its own stack of 3x3
    convolutions (optionally interleaved with x2 bilinear upsampling, or
    stride-2 convolutions for level 0). The per-level outputs are summed
    or concatenated and optionally passed through a 1x1 prediction conv.
    A 3D positional encoding and/or a 2-channel coordinate map can be
    injected at ``cat_coors_level``.

    Args:
        in_channels (int): Channels of every input level.
        feat_channels (int): Channels produced by the per-level convs.
        out_channels (int): Channels of the 1x1 prediction / aux convs.
        start_level (int): First input level that is used.
        end_level (int): Last input level that is used (inclusive).
        cat_coors (bool): Concatenate a normalized (x, y) coordinate map
            to the input at ``cat_coors_level``. Defaults to False.
        positional_encoding (dict, optional): Config passed to
            ``build_positional_encoding``; the encoding is added to the
            input at ``cat_coors_level``. Its class name must end in
            ``3D``. Defaults to None.
        cat_coors_level (int): Level that receives the positional
            encoding / coordinate map. Defaults to 3.
        fuse_by_cat (bool): Concatenate instead of summing the per-level
            outputs. Defaults to False.
        return_list (bool): Wrap the single output in a list.
            Defaults to False.
        upsample_times (int): Controls how many x2 upsampling steps each
            level receives. Defaults to 3.
        with_pred (bool): Build and apply the 1x1 prediction conv.
            Defaults to True.
        num_aux_convs (int): Number of extra 1x1 convs whose outputs are
            returned alongside the main output. Defaults to 0.
        act_cfg (dict): Activation config of the 3x3 convs.
        out_act_cfg (dict): Activation config of the 1x1 convs.
        conv_cfg (dict, optional): Conv config. Defaults to None.
        norm_cfg (dict, optional): Norm config. Defaults to None.
    """

    def __init__(self,
                 in_channels,
                 feat_channels,
                 out_channels,
                 start_level,
                 end_level,
                 cat_coors=False,
                 positional_encoding=None,
                 cat_coors_level=3,
                 fuse_by_cat=False,
                 return_list=False,
                 upsample_times=3,
                 with_pred=True,
                 num_aux_convs=0,
                 act_cfg=dict(type='ReLU', inplace=True),
                 out_act_cfg=dict(type='ReLU'),
                 conv_cfg=None,
                 norm_cfg=None):
        super().__init__()

        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.start_level = start_level
        self.end_level = end_level
        assert start_level >= 0 and end_level >= start_level
        self.out_channels = out_channels
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.cat_coors = cat_coors
        self.cat_coors_level = cat_coors_level
        self.fuse_by_cat = fuse_by_cat
        self.return_list = return_list
        self.upsample_times = upsample_times
        self.with_pred = with_pred
        if positional_encoding is not None:
            self.positional_encoding = build_positional_encoding(
                positional_encoding)
        else:
            self.positional_encoding = None

        self.convs_all_levels = nn.ModuleList()
        for i in range(self.start_level, self.end_level + 1):
            convs_per_level = nn.Sequential()
            # The level that receives the coordinate map has 2 extra
            # input channels.
            if i == self.cat_coors_level and self.cat_coors:
                chn = self.in_channels + 2
            else:
                chn = self.in_channels

            if i == 0:
                if upsample_times == self.end_level - i:
                    convs_per_level.add_module('conv' + str(i),
                                               self._conv3x3(chn))
                else:
                    # Level 0 is brought down to the target resolution
                    # with stride-2 convs. A separate loop variable keeps
                    # the level index intact, and every conv after the
                    # first takes ``feat_channels`` inputs (the output
                    # width of its predecessor).
                    for k in range(self.end_level - upsample_times):
                        convs_per_level.add_module(
                            'conv' + str(k), self._conv3x3(chn, stride=2))
                        chn = self.feat_channels
                self.convs_all_levels.append(convs_per_level)
                continue

            for j in range(i):
                conv_in = chn if j == 0 else self.feat_channels
                convs_per_level.add_module('conv' + str(j),
                                           self._conv3x3(conv_in))
                if j < upsample_times - (self.end_level - i):
                    one_upsample = nn.Upsample(
                        scale_factor=2,
                        mode='bilinear',
                        align_corners=False)
                    convs_per_level.add_module('upsample' + str(j),
                                               one_upsample)

            self.convs_all_levels.append(convs_per_level)

        if fuse_by_cat:
            in_channels = self.feat_channels * len(self.convs_all_levels)
        else:
            in_channels = self.feat_channels

        if self.with_pred:
            self.conv_pred = ConvModule(
                in_channels,
                self.out_channels,
                1,
                padding=0,
                conv_cfg=self.conv_cfg,
                act_cfg=out_act_cfg,
                norm_cfg=self.norm_cfg)

        self.num_aux_convs = num_aux_convs
        self.aux_convs = nn.ModuleList()
        for _ in range(num_aux_convs):
            self.aux_convs.append(
                ConvModule(
                    in_channels,
                    self.out_channels,
                    1,
                    padding=0,
                    conv_cfg=self.conv_cfg,
                    act_cfg=out_act_cfg,
                    norm_cfg=self.norm_cfg))

    def _conv3x3(self, in_channels, stride=1):
        """Build one 3x3 ConvModule producing ``feat_channels`` channels."""
        return ConvModule(
            in_channels,
            self.feat_channels,
            3,
            padding=1,
            stride=stride,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg,
            inplace=False)

    def init_weights(self):
        """Initialize every ``nn.Conv2d`` with a normal distribution."""
        logger = get_root_logger()
        logger.info('Use normal intialization for semantic FPN')
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                normal_init(m, std=0.01)

    def generate_coord(self, input_feat):
        """Return a 2-channel (x, y) coordinate map in ``[-1, 1]``.

        The map has the batch size, height, width and device of
        ``input_feat``.
        """
        x_range = torch.linspace(
            -1, 1, input_feat.shape[-1], device=input_feat.device)
        y_range = torch.linspace(
            -1, 1, input_feat.shape[-2], device=input_feat.device)
        y, x = torch.meshgrid(y_range, x_range)
        y = y.expand([input_feat.shape[0], 1, -1, -1])
        x = x.expand([input_feat.shape[0], 1, -1, -1])
        coord_feat = torch.cat([x, y], 1)
        return coord_feat

    def forward(self, inputs, num_imgs, num_frames):
        """Fuse the selected levels of ``inputs``.

        Args:
            inputs (Sequence[Tensor]): Multi-level features, indexed by
                absolute level; each is viewed as
                ``(num_imgs * num_frames, C, H, W)``.
            num_imgs (int): Number of clips in the batch.
            num_frames (int): Number of frames per clip.

        Returns:
            Tensor | list[Tensor]: The fused (and optionally predicted)
            feature map. A list ``[out, aux_0, ...]`` is returned when
            ``num_aux_convs > 0``, and ``[out]`` when ``return_list`` is
            set.
        """
        mlvl_feats = []
        for i in range(self.start_level, self.end_level + 1):
            input_p = inputs[i]
            if i == self.cat_coors_level:
                if self.positional_encoding is not None:
                    # The 3D encoding expects a (b, t, c, h, w) layout.
                    input_p = input_p.view(num_imgs, num_frames,
                                           *input_p.size()[1:])
                    assert self.positional_encoding.__class__.__name__.endswith('3D')
                    positional_encoding = self.positional_encoding(input_p)
                    input_p = (input_p + positional_encoding).reshape(
                        num_imgs * num_frames, *input_p.size()[2:])
                if self.cat_coors:
                    coord_feat = self.generate_coord(input_p)
                    input_p = torch.cat([input_p, coord_feat], 1)

            # ``convs_all_levels`` is filled starting at ``start_level``,
            # so the branch for level ``i`` lives at ``i - start_level``.
            level_convs = self.convs_all_levels[i - self.start_level]
            mlvl_feats.append(level_convs(input_p))

        if self.fuse_by_cat:
            feature_add_all_level = torch.cat(mlvl_feats, dim=1)
        else:
            feature_add_all_level = sum(mlvl_feats)

        if self.with_pred:
            out = self.conv_pred(feature_add_all_level)
        else:
            out = feature_add_all_level

        if self.num_aux_convs > 0:
            outs = [out]
            for conv in self.aux_convs:
                outs.append(conv(feature_add_all_level))
            return outs

        if self.return_list:
            return [out]
        else:
            return out
@DETECTORS.register_module()
class KNetTrack(TwoStageDetector):
    """K-Net detector operating on clips, followed by a tracker head.

    The RPN head (and optional ROI head) run on the reference frames of a
    clip; a tracker head built from ``tracker`` then consumes their
    outputs. With ``direct_tracker=True`` the tracker is fed straight from
    the RPN head outputs and the RPN's initial kernels; otherwise it is
    fed from the features returned by the ROI head.

    Args:
        num_thing_classes (int): Number of thing classes.
        num_stuff_classes (int): Number of stuff classes.
        mask_assign_stride (int): Down-sampling factor applied to the
            ground-truth masks used for assignment.
        thing_label_in_seg (int): Stored on the instance; not read inside
            this class.
        direct_tracker (bool): Feed the tracker from the RPN head instead
            of the ROI head.
        tracker_num (int): Total number of tracker heads; the extra
            ``tracker_num - 1`` heads are chained after the first one in
            direct-tracker mode.
        tracker (dict, optional): Config of the tracker head.
        train_cfg (optional): Training config; ``train_cfg.tracker`` is
            forwarded to the tracker head.
        test_cfg (optional): Testing config; ``test_cfg.tracker`` is
            forwarded to the tracker head.
    """

    def __init__(self,
                 *args,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 thing_label_in_seg=0,
                 direct_tracker=False,
                 tracker_num=1,
                 tracker=None,
                 train_cfg=None,
                 test_cfg=None,
                 **kwargs):
        self.roi_head = None  # init roi_head with None
        super().__init__(*args, **kwargs, train_cfg=train_cfg,
                         test_cfg=test_cfg)
        assert self.with_rpn, 'KNet does not support external proposals'
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.direct_tracker = direct_tracker
        self.tracker_num = tracker_num
        if tracker is not None:
            # Both configs are optional; guard them the same way so a
            # missing test_cfg no longer raises AttributeError on None.
            tracker_train_cfg = (
                train_cfg.tracker if train_cfg is not None else None)
            tracker_test_cfg = (
                test_cfg.tracker if test_cfg is not None else None)
            tracker.update(train_cfg=tracker_train_cfg)
            tracker.update(test_cfg=tracker_test_cfg)
            self.tracker = build_head(tracker)
            if self.tracker_num > 1:
                self.tracker_extra = nn.ModuleList(
                    [build_head(tracker) for _ in range(tracker_num - 1)]
                )

        logger = get_root_logger()
        logger.info(f'Model: \n{self}')

    def _resize_gt_mask(self, gt_mask, device, pad_hw, assign_hw):
        """Pad one mask set to ``pad_hw`` and resize it to ``assign_hw``."""
        pad_H, pad_W = pad_hw
        mask_tensor = gt_mask.to_tensor(torch.float, device)
        if gt_mask.width != pad_W or gt_mask.height != pad_H:
            pad_wh = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height)
            mask_tensor = F.pad(mask_tensor, pad_wh, value=0)

        if mask_tensor.shape[0] == 0:
            # No instances: emit an empty tensor at the assign resolution.
            resized = mask_tensor.new_zeros(
                (mask_tensor.size(0), *assign_hw))
        else:
            resized = F.interpolate(
                mask_tensor[None],
                assign_hw,
                mode='bilinear',
                align_corners=False)[0]
        return mask_tensor, resized

    def gt_transform(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
        """Convert per-image ground truth to assign-resolution tensors.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_seg, gt_sem_cls)``. The last
            two are None when ``gt_semantic_seg`` is None.
        """
        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks_tensor = []
        gt_sem_seg = []
        gt_sem_cls = []
        # batch_input_shape shoud be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        assign_H = pad_H // self.mask_assign_stride
        assign_W = pad_W // self.mask_assign_stride

        for i, gt_mask in enumerate(gt_masks):
            mask_tensor, resized_mask = self._resize_gt_mask(
                gt_mask, gt_labels[0].device, (pad_H, pad_W),
                (assign_H, assign_W))

            if gt_semantic_seg is not None:
                # gt_semantic seg is padded by 255 and
                # zero indicating the first class
                sem_labels, sem_seg = sem2ins_masks(
                    gt_semantic_seg[i],
                    num_thing_classes=self.num_thing_classes)
                if sem_seg.shape[0] == 0:
                    gt_sem_seg.append(
                        mask_tensor.new_zeros(
                            (mask_tensor.size(0), assign_H, assign_W))
                    )
                else:
                    gt_sem_seg.append(
                        F.interpolate(
                            sem_seg[None], (assign_H, assign_W),
                            mode='bilinear',
                            align_corners=False)[0]
                    )
                gt_sem_cls.append(sem_labels)
            else:
                gt_sem_seg = None
                gt_sem_cls = None

            gt_masks_tensor.append(resized_mask)

        return gt_masks_tensor, gt_sem_seg, gt_sem_cls

    def ref_gt_transform(self,
                         ref_img_metas,
                         ref_gt_masks,
                         ref_gt_labels,
                         ref_gt_semantic_seg=None):
        """Convert per-clip, per-frame masks to assign-resolution tensors.

        Semantic segmentation is not supported here, so
        ``ref_gt_semantic_seg`` must be None.

        Returns:
            tuple: ``(ref_gt_masks_tensor, None, None)`` where the first
            item is a list (per clip) of lists (per frame) of tensors.
        """
        # gt_masks and gt_semantic_seg are not padded when forming batch
        ref_gt_masks_tensor = []
        assert ref_gt_semantic_seg is None
        ref_gt_sem_seg = None
        ref_gt_sem_cls = None
        # batch_input_shape shoud be the same across images
        pad_H, pad_W = ref_img_metas[0]['batch_input_shape']
        assign_H = pad_H // self.mask_assign_stride
        assign_W = pad_W // self.mask_assign_stride

        for bs_i, gt_mask_frame in enumerate(ref_gt_masks):
            device = ref_gt_labels[bs_i].device
            batch_cur_gt_masks_tensor = [
                self._resize_gt_mask(gt_mask, device, (pad_H, pad_W),
                                     (assign_H, assign_W))[1]
                for gt_mask in gt_mask_frame
            ]
            ref_gt_masks_tensor.append(batch_cur_gt_masks_tensor)

        return ref_gt_masks_tensor, ref_gt_sem_seg, ref_gt_sem_cls

    def _direct_tracker_inputs(self, mask_preds, x_feats, bs, num_frame):
        """Prepare kernels, masks and features for direct-tracker mode.

        The RPN's initial kernels are broadcast over the batch, and the
        mask predictions / features are brought to a
        ``(bs, num_frame, ...)`` layout if they arrive flattened.
        """
        proposal_feats = self.rpn_head.init_kernels.weight.clone()
        proposal_feats = proposal_feats[None].expand(
            bs, *proposal_feats.size())
        if mask_preds.shape[0] == bs * num_frame:
            mask_preds = mask_preds.reshape(
                (bs, num_frame, *mask_preds.size()[1:]))
            x_feats = x_feats.reshape((bs, num_frame, *x_feats.size()[1:]))
        else:
            assert mask_preds.size()[:2] == (bs, num_frame)
            assert x_feats.size()[:2] == (bs, num_frame)
        return proposal_feats, mask_preds, x_feats

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None,
                      gt_semantic_seg=None,
                      gt_instance_ids=None,
                      # references
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes=None,
                      ref_gt_labels=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_masks=None,
                      ref_gt_instance_ids=None,
                      **kwargs):
        """Compute the training losses on a batch of clips.

        Features are extracted from ``ref_img`` (``bs, num_frame, C, H,
        W``), and the ``ref_*`` annotations supervise the RPN head, the
        optional ROI head and the tracker head(s).

        Returns:
            dict: Losses from the RPN head, ROI head (if any) and tracker.

        Raises:
            RuntimeError: If there is no ROI head and ``direct_tracker``
                is False, since the tracker would have no input.
        """
        super(TwoStageDetector, self).forward_train(img, img_metas)
        assert proposals is None, 'KNet does not support external proposals'
        assert gt_masks is not None

        ref_gt_masks, ref_gt_sem_seg, ref_gt_sem_cls = \
            self.ref_gt_transform(img_metas, ref_gt_masks, ref_gt_labels,
                                  ref_gt_semantic_seg=None)

        bs, num_frame, chn, h, w = ref_img.size()
        x = self.extract_feat(ref_img.reshape(bs * num_frame, chn, h, w))

        losses = dict()
        rpn_losses, proposal_feats, x_feats, mask_preds, cls_scores = \
            self.rpn_head.forward_train(x, img_metas, ref_img_metas,
                                        ref_gt_masks, ref_gt_labels,
                                        ref_gt_instance_ids,
                                        ref_gt_sem_seg, ref_gt_sem_cls)
        losses.update(rpn_losses)

        features = None
        if self.roi_head is not None:
            roi_losses, features = self.roi_head.forward_train(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                ref_img_metas,
                ref_gt_masks,
                ref_gt_labels,
                gt_bboxes_ignore=ref_gt_bboxes_ignore,
                gt_bboxes=ref_gt_bboxes,
                gt_sem_seg=ref_gt_sem_seg,
                gt_sem_cls=ref_gt_sem_cls,
                imgs_whwh=None)
            losses.update(roi_losses)

        if self.direct_tracker:
            proposal_feats, mask_preds, x_feats = \
                self._direct_tracker_inputs(mask_preds, x_feats, bs,
                                            num_frame)
            tracker_losses, features = self.tracker.forward_train(
                x=x_feats,
                ref_img_metas=ref_img_metas,
                cls_scores=None,
                masks=mask_preds,
                obj_feats=proposal_feats,
                ref_gt_masks=ref_gt_masks,
                ref_gt_labels=ref_gt_labels,
                ref_gt_instance_ids=ref_gt_instance_ids,
            )
            if self.tracker_num > 1:
                # Each extra tracker refines the previous tracker's output.
                for i in range(self.tracker_num - 1):
                    extra_losses, features = \
                        self.tracker_extra[i].forward_train(
                            x=features['x_feats'],
                            ref_img_metas=ref_img_metas,
                            cls_scores=None,
                            masks=features['masks'],
                            obj_feats=features['obj_feats'],
                            ref_gt_masks=ref_gt_masks,
                            ref_gt_labels=ref_gt_labels,
                            ref_gt_instance_ids=ref_gt_instance_ids,
                        )
                    for key, value in extra_losses.items():
                        tracker_losses[f'extra_m{i}_{key}'] = value
        else:
            if features is None:
                raise RuntimeError(
                    'KNetTrack needs a roi_head when direct_tracker is '
                    'False: the tracker has no input features.')
            tracker_losses, _ = self.tracker.forward_train(
                x=features['x_feats'],
                ref_img_metas=ref_img_metas,
                cls_scores=features['cls_scores'],
                masks=features['masks'],
                obj_feats=features['obj_feats'],
                ref_gt_masks=ref_gt_masks,
                ref_gt_labels=ref_gt_labels,
                ref_gt_instance_ids=ref_gt_instance_ids,
            )

        losses.update(tracker_losses)
        return losses

    def forward_test(self, imgs, img_metas, **kwargs):
        """
        Args:
            imgs (List[Tensor]): the outer list indicates test-time
                augmentations and inner Tensor should have a shape NxCxHxW,
                which contains all images in the batch.
            img_metas (List[List[dict]]): the outer list indicates test-time
                augs (multiscale, flip, etc.) and the inner list indicates
                images in a batch.
        """
        for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
            if not isinstance(var, list):
                raise TypeError(f'{name} must be a list, but got {type(var)}')

        num_augs = len(imgs)
        if num_augs != len(img_metas):
            raise ValueError(f'num of augmentations ({len(imgs)}) '
                             f'!= num of image meta ({len(img_metas)})')

        # NOTE the batched image size information may be useful, e.g.
        # in DETR, this is needed for the construction of masks, which is
        # then used for the transformer_head.
        for img, img_meta in zip(imgs, img_metas):
            batch_size = len(img_meta)
            for img_id in range(batch_size):
                img_meta[img_id]['batch_input_shape'] = tuple(img.size()[-2:])

        if num_augs == 1:
            # proposals (List[List[Tensor]]): the outer list indicates
            # test-time augs (multiscale, flip, etc.) and the inner list
            # indicates images in a batch.
            # The Tensor should have a shape Px4, where P is the number of
            # proposals.
            if 'proposals' in kwargs:
                kwargs['proposals'] = kwargs['proposals'][0]
            # Reference inputs carry the same outer augmentation list.
            kwargs['ref_img_metas'] = kwargs['ref_img_metas'][0]
            kwargs['ref_img'] = kwargs['ref_img'][0]
            return self.simple_test(imgs[0], img_metas[0], **kwargs)
        else:
            assert imgs[0].size(0) == 1, 'aug test does not support ' \
                                         'inference with batch size ' \
                                         f'{imgs[0].size(0)}'
            # TODO: support test augmentation for predefined proposals
            assert 'proposals' not in kwargs
            return self.aug_test(imgs, img_metas, **kwargs)

    def simple_test(self, imgs, img_metas, **kwargs):
        """Run inference on one batch of clips without augmentation.

        ``kwargs`` must contain ``ref_img`` (``bs, num_frame, C, H, W``)
        and ``ref_img_metas``.

        Returns:
            The segmentation results of the last tracker head that ran.

        Raises:
            RuntimeError: If there is no ROI head and ``direct_tracker``
                is False, since the tracker would have no input.
        """
        ref_img = kwargs['ref_img']
        ref_img_metas = kwargs['ref_img_metas']
        # Step 1 extract features and get masks
        bs, num_frame, chn, h, w = ref_img.size()
        x = self.extract_feat(ref_img.reshape(bs * num_frame, chn, h, w))

        proposal_feats, x_feats, mask_preds, cls_scores, seg_preds = \
            self.rpn_head.simple_test_rpn(x, img_metas, ref_img_metas)

        features = None
        if self.roi_head is not None:
            segm_results_single_frame, features = self.roi_head.simple_test(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                ref_img_metas,
                imgs_whwh=None,
                rescale=True
            )

        if self.direct_tracker:
            proposal_feats, mask_preds, x_feats = \
                self._direct_tracker_inputs(mask_preds, x_feats, bs,
                                            num_frame)
            segm_results, features = self.tracker.simple_test(
                x=x_feats,
                img_metas=img_metas,
                ref_img_metas=ref_img_metas,
                cls_scores=None,
                masks=mask_preds,
                obj_feats=proposal_feats,
            )
            if self.tracker_num > 1:
                for i in range(self.tracker_num - 1):
                    segm_results, features = \
                        self.tracker_extra[i].simple_test(
                            x=features['x_feats'],
                            img_metas=img_metas,
                            ref_img_metas=ref_img_metas,
                            cls_scores=None,
                            masks=features['masks'],
                            obj_feats=features['obj_feats'],
                        )
        else:
            if features is None:
                raise RuntimeError(
                    'KNetTrack needs a roi_head when direct_tracker is '
                    'False: the tracker has no input features.')
            segm_results, _ = self.tracker.simple_test(
                x=features['x_feats'],
                img_metas=img_metas,
                ref_img_metas=ref_img_metas,
                cls_scores=features['cls_scores'],
                masks=features['masks'],
                obj_feats=features['obj_feats'],
            )

        return segm_results

    def forward_dummy(self, img):
        """Used for computing network flops.

        See `mmdetection/tools/get_flops.py`

        NOTE(review): this path calls ``self.roi_head`` unconditionally
        and passes two arguments to ``simple_test_rpn`` (the other call
        sites pass three) -- confirm it is still usable for this model.
        """
        # backbone
        x = self.extract_feat(img)
        # rpn
        num_imgs = len(img)
        dummy_img_metas = [
            dict(img_shape=(800, 1333, 3)) for _ in range(num_imgs)
        ]
        rpn_results = self.rpn_head.simple_test_rpn(x, dummy_img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results
        # roi_head
        roi_outs = self.roi_head.forward_dummy(x_feats, proposal_feats,
                                               dummy_img_metas)
        return roi_outs

    def init_weights(self):
        """Initialize weights, reusing a pretrained checkpoint for trackers.

        When the detector is initialized from a ``Pretrained`` config,
        each tracker head receives a copy of that config with the prefix
        ``roi_head`` and its ``init_weights`` is called.
        """
        super().init_weights()
        if self.init_cfg is not None and self.init_cfg['type'] == 'Pretrained':
            assert self.tracker.init_cfg is None
            self.tracker.init_cfg = copy.deepcopy(self.init_cfg)
            self.tracker.init_cfg['prefix'] = 'roi_head'
            self.tracker.init_weights()
            if self.tracker_num > 1:
                for idx in range(self.tracker_num - 1):
                    extra_tracker = self.tracker_extra[idx]
                    assert extra_tracker.init_cfg is None
                    extra_tracker.init_cfg = copy.deepcopy(self.init_cfg)
                    extra_tracker.init_cfg['prefix'] = 'roi_head'
                    extra_tracker.init_weights()
def __init__(self,
             load_as_video=True,
             key_img_sampler=dict(interval=1),
             ref_img_sampler=dict(
                 frame_range=10,
                 stride=1,
                 num_ref_imgs=1,
                 filter_key_img=True,
                 method='uniform',
                 return_key_img=True),
             test_load_ann=False,
             load_all_frames=False,
             *args,
             **kwargs):
    """Store the video-sampling options and build the base dataset.

    Args:
        load_as_video (bool): Load the annotation file through CocoVID
            instead of the plain COCO loader.
        key_img_sampler (dict | None): Keyword arguments for
            ``key_img_sampling``; None disables key-frame sampling.
        ref_img_sampler (dict | None): Keyword arguments for
            ``ref_img_sampling``; None disables reference sampling.
        test_load_ann (bool): Load annotations in test mode too.
        load_all_frames (bool): Treat every video as one sample holding
            all its frames. Requires ``ref_img_sampler`` to be None.
        *args, **kwargs: Forwarded to the parent dataset constructor.
    """
    self.load_as_video = load_as_video
    # Keep private shallow copies of the sampler configs:
    # ``ref_img_sampling`` writes into ``self.ref_img_sampler``, which
    # would otherwise modify the caller's dict or the shared default
    # argument.
    self.key_img_sampler = (
        dict(key_img_sampler) if key_img_sampler is not None else None)
    self.ref_img_sampler = (
        dict(ref_img_sampler) if ref_img_sampler is not None else None)
    self.test_load_ann = test_load_ann
    self.load_all_frames = load_all_frames
    assert not (self.load_all_frames and ref_img_sampler is not None), "load all frames indicate no sampler"
    super().__init__(*args, **kwargs)
    self.logger = get_root_logger()
def ref_img_sampling(self,
                     img_info,
                     frame_range,
                     stride=1,
                     num_ref_imgs=1,
                     filter_key_img=True,
                     method='uniform',
                     return_key_img=True):
    """Sampling reference frames in the same video for key frame.

    Args:
        img_info (dict): The information of key frame.
        frame_range (List(int) | int): The sampling range of reference
            frames in the same video for key frame. An int ``r`` is
            expanded to ``[-r, r]``.
        stride (int): The sampling frame stride when sampling reference
            images. Default: 1.
        num_ref_imgs (int): The number of sampled reference images.
            Default: 1.
        filter_key_img (bool): If False, the key image will be in the
            sampling reference candidates, otherwise, it is excluded.
            Default: True.
        method (str): The sampling method. Options are 'uniform',
            'bilateral_uniform', 'test_with_adaptive_stride',
            'test_with_fix_stride'. 'uniform' denotes reference images are
            randomly sampled from the nearby frames of key frame.
            'bilateral_uniform' denotes reference images are randomly
            sampled from the two sides of the nearby frames of key frame.
            'test_with_adaptive_stride' is only used in testing, and
            denotes the sampling frame stride is equal to (video length /
            the number of reference images). test_with_fix_stride is only
            used in testing with sampling frame stride equalling to
            `stride`. Default: 'uniform'.
        return_key_img (bool): If True, the information of key frame is
            returned, otherwise, not returned. Default: True.

    Returns:
        list(dict) | None: `img_info` and the reference images information
        or only the reference images information. With the 'uniform'
        method, None is returned when the number of candidate frames
        differs from `num_ref_imgs`.
    """
    assert isinstance(img_info, dict)
    # Normalize frame_range to a two-element [left_offset, right_offset].
    if isinstance(frame_range, int):
        assert frame_range >= 0, 'frame_range can not be a negative value.'
        frame_range = [-frame_range, frame_range]
    elif isinstance(frame_range, list):
        assert len(frame_range) == 2, 'The length must be 2.'
        assert frame_range[0] <= 0 and frame_range[1] >= 0
        for i in frame_range:
            assert isinstance(i, int), 'Each element must be int.'
    else:
        raise TypeError('The type of frame_range must be int or list.')

    if 'test' in method and \
            (frame_range[1] - frame_range[0]) != num_ref_imgs:
        print_log(
            'Warning:'
            "frame_range[1] - frame_range[0] isn't equal to num_ref_imgs."
            'Set num_ref_imgs to frame_range[1] - frame_range[0].',
            logger=self.logger)
        # Only the stored sampler config is updated here; the local
        # `num_ref_imgs` used in the rest of this call is left unchanged.
        self.ref_img_sampler[
            'num_ref_imgs'] = frame_range[1] - frame_range[0]

    if (not self.load_as_video) or img_info.get('frame_id', -1) < 0 \
            or (frame_range[0] == 0 and frame_range[1] == 0):
        # No usable video context: use copies of the key frame itself as
        # the reference frames.
        ref_img_infos = []
        for i in range(num_ref_imgs):
            ref_img_infos.append(img_info.copy())
    else:
        vid_id, img_id, frame_id = img_info['video_id'], img_info[
            'id'], img_info['frame_id']
        img_ids = self.coco.get_img_ids_from_vid(vid_id)
        # Clamp the sampling window to the frames that exist in the video.
        left = max(0, frame_id + frame_range[0])
        right = min(frame_id + frame_range[1], len(img_ids) - 1)

        ref_img_ids = []
        if method == 'uniform':
            valid_ids = img_ids[left:right + 1]
            if filter_key_img and img_id in valid_ids:
                valid_ids.remove(img_id)
            # Give up on this sample unless the window holds exactly
            # `num_ref_imgs` candidates.
            if num_ref_imgs != len(valid_ids):
                return None
            num_samples = min(num_ref_imgs, len(valid_ids))
            ref_img_ids.extend(random.sample(valid_ids, num_samples))
        elif method == 'bilateral_uniform':
            assert num_ref_imgs % 2 == 0, \
                'only support load even number of ref_imgs.'
            # Draw half of the references from each side of the key frame.
            for mode in ['left', 'right']:
                if mode == 'left':
                    valid_ids = img_ids[left:frame_id + 1]
                else:
                    valid_ids = img_ids[frame_id:right + 1]
                if filter_key_img and img_id in valid_ids:
                    valid_ids.remove(img_id)
                num_samples = min(num_ref_imgs // 2, len(valid_ids))
                sampled_inds = random.sample(valid_ids, num_samples)
                ref_img_ids.extend(sampled_inds)
        elif method == 'test_with_adaptive_stride':
            # References are only chosen on the first frame of a video.
            if frame_id == 0:
                # NOTE(review): divides by (num_ref_imgs - 1), so
                # num_ref_imgs == 1 raises ZeroDivisionError here.
                stride = float(len(img_ids) - 1) / (num_ref_imgs - 1)
                for i in range(num_ref_imgs):
                    ref_id = round(i * stride)
                    ref_img_ids.append(img_ids[ref_id])
        elif method == 'test_with_fix_stride':
            if frame_id == 0:
                # The left side of the window is filled with the first
                # frame of the video.
                for i in range(frame_range[0], 1):
                    ref_img_ids.append(img_ids[0])
                for i in range(1, frame_range[1] + 1):
                    ref_id = min(round(i * stride), len(img_ids) - 1)
                    ref_img_ids.append(img_ids[ref_id])
            elif frame_id % stride == 0:
                # Later frames on the stride grid add one new reference at
                # the right edge of the window.
                ref_id = min(
                    round(frame_id + frame_range[1] * stride),
                    len(img_ids) - 1)
                ref_img_ids.append(img_ids[ref_id])
            # Record the window layout on the key frame's info dict.
            img_info['num_left_ref_imgs'] = abs(frame_range[0]) \
                if isinstance(frame_range, list) else frame_range
            img_info['frame_stride'] = stride
        else:
            raise NotImplementedError

        ref_img_infos = []
        for ref_img_id in ref_img_ids:
            ref_img_info = self.coco.load_imgs([ref_img_id])[0]
            ref_img_info['filename'] = ref_img_info['file_name']
            ref_img_infos.append(ref_img_info)
        # Return the references in temporal order.
        ref_img_infos = sorted(ref_img_infos, key=lambda i: i['frame_id'])

    if return_key_img:
        return [img_info, *ref_img_infos]
    else:
        return ref_img_infos
def _parse_ann_info(self, img_info, ann_info):
    """Parse bbox and mask annotations.

    Annotations are skipped when they are flagged ``ignore``, do not
    overlap the image, have a non-positive area or a side shorter than
    one pixel, or belong to a category outside ``self.cat_ids``. Crowd
    annotations only contribute to ``bboxes_ignore``.

    Args:
        img_info (dict): Information of image; ``width``, ``height`` and
            ``filename`` are read.
        ann_info (list[dict]): Annotation information of image.

    Returns:
        dict: A dict containing the following keys: bboxes, bboxes_ignore,
        labels, instance_ids, masks, seg_map.
        "masks" are raw annotations and not decoded into binary masks.
    """
    gt_bboxes = []
    gt_labels = []
    gt_bboxes_ignore = []
    gt_masks = []
    gt_instance_ids = []

    for ann in ann_info:
        if ann.get('ignore', False):
            continue
        x1, y1, w, h = ann['bbox']
        # Size of the part of the box that lies inside the image.
        inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
        inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
        if inter_w * inter_h == 0:
            continue
        if ann['area'] <= 0 or w < 1 or h < 1:
            continue
        if ann['category_id'] not in self.cat_ids:
            continue
        bbox = [x1, y1, x1 + w, y1 + h]
        if ann.get('iscrowd', False):
            gt_bboxes_ignore.append(bbox)
        else:
            gt_bboxes.append(bbox)
            gt_labels.append(self.cat2label[ann['category_id']])
            if 'segmentation' in ann:
                gt_masks.append(ann['segmentation'])
            if 'instance_id' in ann:
                gt_instance_ids.append(ann['instance_id'])

    if gt_bboxes:
        gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
        gt_labels = np.array(gt_labels, dtype=np.int64)
    else:
        gt_bboxes = np.zeros((0, 4), dtype=np.float32)
        gt_labels = np.array([], dtype=np.int64)

    if gt_bboxes_ignore:
        gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
    else:
        gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

    seg_map = img_info['filename'].replace('jpg', 'png')

    ann = dict(
        bboxes=gt_bboxes,
        labels=gt_labels,
        bboxes_ignore=gt_bboxes_ignore,
        masks=gt_masks,
        seg_map=seg_map)

    if self.load_as_video:
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy; the builtin gives the same dtype.
        ann['instance_ids'] = np.array(gt_instance_ids).astype(int)
    else:
        ann['instance_ids'] = np.arange(len(gt_labels))

    return ann
super_metrics = ['bbox', 'segm'] super_metrics = [_ for _ in metrics if _ in super_metrics] if super_metrics: if isinstance(results, dict): if 'bbox' in super_metrics and 'segm' in super_metrics: super_results = [] for bbox, mask in zip(results['det_bboxes'], results['det_masks']): super_results.append((bbox, mask)) else: super_results = results['det_bboxes'] elif isinstance(results, list): super_results = results else: raise TypeError('Results must be a dict or a list.') super_eval_results = super().evaluate( results=super_results, metric=super_metrics, logger=logger, **bbox_kwargs) eval_results.update(super_eval_results) return eval_results def __repr__(self): """Print the number of instance number suit for video dataset.""" dataset_type = 'Test' if self.test_mode else 'Train' result = (f'\n{self.__class__.__name__} {dataset_type} dataset ' f'with number of images {len(self)}, ' f'and instance counts: \n') if self.CLASSES is None: result += 'Category names are not provided. \n' return result instance_count = np.zeros(len(self.CLASSES) + 1).astype(int) # count the instance number in each image for idx in range(len(self)): img_info = self.data_infos[idx] label = self.get_ann_info(img_info)['labels'] unique, counts = np.unique(label, return_counts=True) if len(unique) > 0: # add the occurrence number to each class instance_count[unique] += counts else: # background is the last index instance_count[-1] += 1 # create a table with category count table_data = [['category', 'count'] * 5] row_data = [] for cls, count in enumerate(instance_count): if cls < len(self.CLASSES): row_data += [f'{cls} [{self.CLASSES[cls]}]', f'{count}'] else: # add the background number row_data += ['-1 background', f'{count}'] if len(row_data) == 10: table_data.append(row_data) row_data = [] if len(row_data) >= 2: if row_data[-1] == '0': row_data = row_data[:-2] if len(row_data) >= 2: table_data.append([]) table_data.append(row_data) table = AsciiTable(table_data) result += table.table return 
result ================================================ FILE: mmtrack/datasets/parsers/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .coco_video_parser import CocoVID __all__ = ['CocoVID'] ================================================ FILE: mmtrack/datasets/parsers/coco_video_parser.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from collections import defaultdict import numpy as np from mmdet.datasets.api_wrappers import COCO from pycocotools.coco import _isArrayLike class CocoVID(COCO): """Inherit official COCO class in order to parse the annotations of bbox- related video tasks. Args: annotation_file (str): location of annotation file. Defaults to None. load_img_as_vid (bool): If True, convert image data to video data, which means each image is converted to a video. Defaults to False. """ def __init__(self, annotation_file=None, load_img_as_vid=False): assert annotation_file, 'Annotation file must be provided.' 
self.load_img_as_vid = load_img_as_vid super(CocoVID, self).__init__(annotation_file=annotation_file) def convert_img_to_vid(self, dataset): """Convert image data to video data.""" if 'images' in self.dataset: videos = [] for i, img in enumerate(self.dataset['images']): videos.append(dict(id=img['id'], name=img['file_name'])) img['video_id'] = img['id'] img['frame_id'] = 0 dataset['videos'] = videos if 'annotations' in self.dataset: for i, ann in enumerate(self.dataset['annotations']): ann['video_id'] = ann['image_id'] ann['instance_id'] = ann['id'] return dataset def createIndex(self, use_ext=False): """Create index.""" print('creating index...') anns, cats, imgs, vids = {}, {}, {}, {} (imgToAnns, catToImgs, vidToImgs, vidToInstances, instancesToImgs) = defaultdict(list), defaultdict(list), defaultdict( list), defaultdict(list), defaultdict(list) if 'videos' not in self.dataset and self.load_img_as_vid: self.dataset = self.convert_img_to_vid(self.dataset) if 'videos' in self.dataset: for video in self.dataset['videos']: vids[video['id']] = video if 'annotations' in self.dataset: for ann in self.dataset['annotations']: imgToAnns[ann['image_id']].append(ann) anns[ann['id']] = ann if 'instance_id' in ann: instancesToImgs[ann['instance_id']].append(ann['image_id']) if 'video_id' in ann and \ ann['instance_id'] not in \ vidToInstances[ann['video_id']]: vidToInstances[ann['video_id']].append( ann['instance_id']) if 'images' in self.dataset: for img in self.dataset['images']: vidToImgs[img['video_id']].append(img) imgs[img['id']] = img if 'categories' in self.dataset: for cat in self.dataset['categories']: cats[cat['id']] = cat if 'annotations' in self.dataset and 'categories' in self.dataset: for ann in self.dataset['annotations']: catToImgs[ann['category_id']].append(ann['image_id']) print('index created!') self.anns = anns self.imgToAnns = imgToAnns self.catToImgs = catToImgs self.imgs = imgs self.cats = cats self.videos = vids self.vidToImgs = vidToImgs 
self.vidToInstances = vidToInstances self.instancesToImgs = instancesToImgs def get_vid_ids(self, vidIds=[]): """Get video ids that satisfy given filter conditions. Default return all video ids. Args: vidIds (list[int]): The given video ids. Defaults to []. Returns: list[int]: Video ids. """ vidIds = vidIds if _isArrayLike(vidIds) else [vidIds] if len(vidIds) == 0: ids = self.videos.keys() else: ids = set(vidIds) return list(ids) def get_img_ids_from_vid(self, vidId): """Get image ids from given video id. Args: vidId (int): The given video id. Returns: list[int]: Image ids of given video id. """ img_infos = self.vidToImgs[vidId] ids = list(np.zeros([len(img_infos)], dtype=np.int64)) for img_info in img_infos: ids[img_info['frame_id']] = img_info['id'] return ids def get_ins_ids_from_vid(self, vidId): """Get instance ids from given video id. Args: vidId (int): The given video id. Returns: list[int]: Instance ids of given video id. """ return self.vidToInstances[vidId] def get_img_ids_from_ins_id(self, insId): """Get image ids from given instance id. Args: insId (int): The given instance id. Returns: list[int]: Image ids of given instance id. """ return self.instancesToImgs[insId] def load_vids(self, ids=[]): """Get video information of given video ids. Default return all videos information. Args: ids (list[int]): The given video ids. Defaults to []. Returns: list[dict]: List of video information. """ if _isArrayLike(ids): return [self.videos[id] for id in ids] elif type(ids) == int: return [self.videos[ids]] ================================================ FILE: mmtrack/datasets/youtube_vis_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import os.path import os.path as osp import tempfile import zipfile import mmcv import numpy as np from mmcv.utils import print_log from mmdet.datasets import DATASETS from .coco_video_dataset import CocoVideoDataset def results2outs(bbox_results=None, mask_results=None, mask_shape=None, **kwargs): """Restore the results (list of results of each category) into the results of the model forward. Args: bbox_results (list[np.ndarray]): Each list denotes bboxes of one category. mask_results (list[list[np.ndarray]]): Each outer list denotes masks of one category. Each inner list denotes one mask belonging to the category. Each mask has shape (h, w). mask_shape (tuple[int]): The shape (h, w) of mask. Returns: tuple: tracking results of each class. It may contain keys as belows: - bboxes (np.ndarray): shape (n, 5) - labels (np.ndarray): shape (n, ) - masks (np.ndarray): shape (n, h, w) - ids (np.ndarray): shape (n, ) """ outputs = dict() if bbox_results is not None: labels = [] for i, bbox in enumerate(bbox_results): labels.extend([i] * bbox.shape[0]) labels = np.array(labels, dtype=np.int64) outputs['labels'] = labels bboxes = np.concatenate(bbox_results, axis=0).astype(np.float32) if bboxes.shape[1] == 5: outputs['bboxes'] = bboxes elif bboxes.shape[1] == 6: ids = bboxes[:, 0].astype(np.int64) bboxes = bboxes[:, 1:] outputs['bboxes'] = bboxes outputs['ids'] = ids else: raise NotImplementedError( f'Not supported bbox shape: (N, {bboxes.shape[1]})') if mask_results is not None: assert mask_shape is not None mask_height, mask_width = mask_shape mask_results = mmcv.concat_list(mask_results) if len(mask_results) == 0: masks = np.zeros((0, mask_height, mask_width)).astype(bool) else: masks = np.stack(mask_results, axis=0) outputs['masks'] = masks return outputs @DATASETS.register_module() class YouTubeVISDataset(CocoVideoDataset): """YouTube VIS dataset for video instance segmentation.""" CLASSES_2019_version = ('person', 'giant_panda', 'lizard', 'parrot', 'skateboard', 
'sedan', 'ape', 'dog', 'snake', 'monkey', 'hand', 'rabbit', 'duck', 'cat', 'cow', 'fish', 'train', 'horse', 'turtle', 'bear', 'motorbike', 'giraffe', 'leopard', 'fox', 'deer', 'owl', 'surfboard', 'airplane', 'truck', 'zebra', 'tiger', 'elephant', 'snowboard', 'boat', 'shark', 'mouse', 'frog', 'eagle', 'earless_seal', 'tennis_racket') CLASSES_2021_version = ('airplane', 'bear', 'bird', 'boat', 'car', 'cat', 'cow', 'deer', 'dog', 'duck', 'earless_seal', 'elephant', 'fish', 'flying_disc', 'fox', 'frog', 'giant_panda', 'giraffe', 'horse', 'leopard', 'lizard', 'monkey', 'motorbike', 'mouse', 'parrot', 'person', 'rabbit', 'shark', 'skateboard', 'snake', 'snowboard', 'squirrel', 'surfboard', 'tennis_racket', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra') def __init__(self, dataset_version, *args, **kwargs): self.set_dataset_classes(dataset_version) super().__init__(*args, **kwargs) @classmethod def set_dataset_classes(cls, dataset_version): if dataset_version == '2019': cls.CLASSES = cls.CLASSES_2019_version elif dataset_version == '2021': cls.CLASSES = cls.CLASSES_2021_version else: raise NotImplementedError('Not supported YouTubeVIS dataset' f'version: {dataset_version}') def format_results(self, _results, resfile_path=None, metrics=['track_segm']): """Format the results to a zip file (standard format for YouTube-VIS Challenge). Args: results (dict(list[ndarray])): Testing results of the dataset. resfile_path (str, optional): Path to save the formatted results. Defaults to None. metrics (list[str], optional): The results of the specific metrics will be formatted. Defaults to ['track_segm']. Returns: tuple: (resfiles, tmp_dir), resfiles is the path of the result json file, tmp_dir is the temporal directory created for saving files. 
""" results = { 'track_bboxes':[item[0] for item in _results], 'track_masks':[item[1] for item in _results] } data_infos = [] for item in self.data_infos: data_infos.extend(item[1:]) assert isinstance(results, dict), 'results must be a dict.' if isinstance(metrics, str): metrics = [metrics] assert 'track_segm' in metrics if resfile_path is None: tmp_dir = tempfile.TemporaryDirectory() resfile_path = tmp_dir.name else: tmp_dir = None if not os.path.exists(resfile_path): os.makedirs(resfile_path) resfiles = osp.join(resfile_path, 'results.json') inds = [i for i, _ in enumerate(data_infos) if _['frame_id'] == 0] num_vids = len(inds) assert num_vids == len(self.vid_ids) inds.append(len(data_infos)) vid_infos = self.coco.load_vids(self.vid_ids) json_results = [] for i in range(num_vids): video_id = vid_infos[i]['id'] # collect data for each instances in a video. collect_data = dict() for frame_id, (bbox_res, mask_res) in enumerate( zip(results['track_bboxes'][inds[i]:inds[i + 1]], results['track_masks'][inds[i]:inds[i + 1]])): outs_track = results2outs(bbox_results=bbox_res) bboxes = outs_track['bboxes'] labels = outs_track['labels'] ids = outs_track['ids'] masks = mmcv.concat_list(mask_res) assert len(masks) == len(bboxes) for j, id in enumerate(ids): if id not in collect_data: collect_data[id] = dict( category_ids=[], scores=[], segmentations=dict()) collect_data[id]['category_ids'].append(labels[j]) collect_data[id]['scores'].append(bboxes[j][4]) if isinstance(masks[j]['counts'], bytes): masks[j]['counts'] = masks[j]['counts'].decode() collect_data[id]['segmentations'][frame_id] = masks[j] # transform the collected data into official format for id, id_data in collect_data.items(): output = dict() output['video_id'] = video_id output['score'] = np.array(id_data['scores']).mean().item() # majority voting for sequence category output['category_id'] = np.bincount( np.array(id_data['category_ids'])).argmax().item() + 1 output['segmentations'] = [] for frame_id in 
range(inds[i + 1] - inds[i]): if frame_id in id_data['segmentations']: output['segmentations'].append( id_data['segmentations'][frame_id]) else: output['segmentations'].append(None) json_results.append(output) mmcv.dump(json_results, resfiles) # zip the json file in order to submit to the test server. zip_file_name = osp.join(resfile_path, 'submission_file.zip') zf = zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) print_log(f"zip the 'results.json' into '{zip_file_name}', " 'please submmit the zip file to the test server') zf.write(resfiles, 'results.json') zf.close() return resfiles, tmp_dir ================================================ FILE: mmtrack/pipelines/__init__.py ================================================ from .formatting import * from .loading import * from .test_time_aug import * from .transforms import * ================================================ FILE: mmtrack/pipelines/formatting.py ================================================ import numpy as np import torch from mmcv.parallel import DataContainer as DC from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import to_tensor @PIPELINES.register_module() class ConcatVideoReferences(object): """Concat video references. If the input list contains at least two dicts, concat the input list of dict to one dict from 2-nd dict of the input list. Args: results (list[dict]): List of dict that contain keys such as 'img', 'img_metas', 'gt_masks','proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels','gt_semantic_seg', 'gt_instance_ids'. Returns: list[dict]: The first dict of outputs is the same as the first dict of `results`. The second dict of outputs concats the dicts in `results[1:]`. 
""" def __call__(self, results): assert (isinstance(results, list)), 'results must be list' outs = results[:1] for i, result in enumerate(results[1:], 1): if 'img' in result: img = result['img'] if len(img.shape) < 3: img = np.expand_dims(img, -1) if i == 1: result['img'] = np.expand_dims(img, -1) else: outs[1]['img'] = np.concatenate( (outs[1]['img'], np.expand_dims(img, -1)), axis=-1) for key in ['img_metas', 'gt_masks']: if key in result: if i == 1: result[key] = [result[key]] else: outs[1][key].append(result[key]) for key in [ 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', 'gt_instance_ids', ]: if key not in result: continue value = result[key] if value.ndim == 1: value = value[:, None] N = value.shape[0] value = np.concatenate((np.full( (N, 1), i - 1, dtype=int if key in ['gt_labels', 'gt_instance_ids'] else np.float32 ), value), axis=1) if i == 1: result[key] = value else: outs[1][key] = np.concatenate((outs[1][key], value), axis=0) if 'gt_semantic_seg' in result: if i == 1: result['gt_semantic_seg'] = result['gt_semantic_seg'][..., None, None] else: outs[1]['gt_semantic_seg'] = np.concatenate( (outs[1]['gt_semantic_seg'], result['gt_semantic_seg'][..., None, None]), axis=-1) if i == 1: outs.append(result) return outs @PIPELINES.register_module() class ConcatVideos(object): """Concat video references. If the input list contains at least two dicts, concat the input list of dict to one dict from 2-nd dict of the input list. Args: results (list[dict]): List of dict that contain keys such as 'img', 'img_metas', 'gt_masks','proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels','gt_semantic_seg', 'gt_instance_ids'. Returns: list[dict]: The first dict of outputs is the same as the first dict of `results`. The second dict of outputs concats the dicts in `results[1:]`. 
""" def __call__(self, results): assert (isinstance(results, list)), 'results must be list' outs = results[:1] # outs = [] for i, result in enumerate(results[0:], 1): if 'img' in result: img = result['img'] if len(img.shape) < 3: img = np.expand_dims(img, -1) if i == 1: result['img'] = np.expand_dims(img, -1) else: outs[1]['img'] = np.concatenate( (outs[1]['img'], np.expand_dims(img, -1)), axis=-1) for key in ['img_metas', 'gt_masks']: if key in result: if i == 1: result[key] = [result[key]] else: outs[1][key].append(result[key]) for key in [ 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', 'gt_instance_ids' ]: if key not in result: continue value = result[key] if value.ndim == 1: value = value[:, None] N = value.shape[0] value = np.concatenate((np.full( (N, 1), i - 1, dtype=int if key in ['gt_labels', 'gt_instance_ids'] else np.float32 ), value), axis=1) if i == 1: result[key] = value else: outs[1][key] = np.concatenate((outs[1][key], value), axis=0) if 'gt_semantic_seg' in result: if i == 1: result['gt_semantic_seg'] = result['gt_semantic_seg'][..., None, None] else: outs[1]['gt_semantic_seg'] = np.concatenate( (outs[1]['gt_semantic_seg'], result['gt_semantic_seg'][..., None, None]), axis=-1) if i == 1: outs.append(result) res = [] res.append(outs[1]) return res @PIPELINES.register_module() class MultiImagesToTensor(object): """Multi images to tensor. 1. Transpose and convert image/multi-images to Tensor. 2. Add prefix to every key in the second dict of the inputs. Then, add these keys and corresponding values into the outputs. Args: ref_prefix (str): The prefix of key added to the second dict of inputs. Defaults to 'ref'. """ def __init__(self, ref_prefix='ref'): self.ref_prefix = ref_prefix def __call__(self, results): """Multi images to tensor. 1. Transpose and convert image/multi-images to Tensor. 2. Add prefix to every key in the second dict of the inputs. Then, add these keys and corresponding values into the output dict. 
Args: results (list[dict]): List of two dicts. Returns: dict: Each key in the first dict of `results` remains unchanged. Each key in the second dict of `results` adds `self.ref_prefix` as prefix. """ outs = [] for _results in results: _results = self.images_to_tensor(_results) outs.append(_results) data = {} data.update(outs[0]) if len(outs) == 2: for k, v in outs[1].items(): data[f'{self.ref_prefix}_{k}'] = v return data def images_to_tensor(self, results): """Transpose and convert images/multi-images to Tensor.""" if 'img' in results: img = results['img'] if len(img.shape) == 3: # (H, W, 3) to (3, H, W) img = np.ascontiguousarray(img.transpose(2, 0, 1)) else: # (H, W, 3, N) to (N, 3, H, W) img = np.ascontiguousarray(img.transpose(3, 2, 0, 1)) results['img'] = to_tensor(img) if 'proposals' in results: results['proposals'] = to_tensor(results['proposals']) if 'img_metas' in results: results['img_metas'] = DC(results['img_metas'], cpu_only=True) return results @PIPELINES.register_module() class SeqDefaultFormatBundle(object): """Sequence Default formatting bundle. It simplifies the pipeline of formatting common fields, including "img", "img_metas", "proposals", "gt_bboxes", "gt_instance_ids", "gt_match_indices", "gt_bboxes_ignore", "gt_labels", "gt_masks" and "gt_semantic_seg". These fields are formatted as follows. - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True) - img_metas: (1) to DataContainer (cpu_only=True) - proposals: (1) to tensor, (2) to DataContainer - gt_bboxes: (1) to tensor, (2) to DataContainer - gt_instance_ids: (1) to tensor, (2) to DataContainer - gt_match_indices: (1) to tensor, (2) to DataContainer - gt_bboxes_ignore: (1) to tensor, (2) to DataContainer - gt_labels: (1) to tensor, (2) to DataContainer - gt_masks: (1) to DataContainer (cpu_only=True) - gt_semantic_seg: (1) unsqueeze dim-0 (2) to tensor, \ (3) to DataContainer (stack=True) Args: ref_prefix (str): The prefix of key added to the second dict of input list. 
Defaults to 'ref'. """ def __init__(self, ref_prefix='ref'): self.ref_prefix = ref_prefix def __call__(self, results): """Sequence Default formatting bundle call function. Args: results (list[dict]): List of two dicts. Returns: dict: The result dict contains the data that is formatted with default bundle. Each key in the second dict of the input list adds `self.ref_prefix` as prefix. """ outs = [] for _results in results: _results = self.default_format_bundle(_results) outs.append(_results) data = {} if self.ref_prefix == 'ref': # origin frames data.update(outs[0]) # reference frames if len(outs) == 1: # for k in outs[0]: # data[f'{self.ref_prefix}_{k}'] = None pass else: for k, v in outs[1].items(): data[f'{self.ref_prefix}_{k}'] = v elif self.ref_prefix is None: # origin frames data.update(outs[0]) return data def default_format_bundle(self, results): """Transform and format common fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with default bundle. """ if 'img' in results: img = results['img'] if len(img.shape) == 3: img = np.ascontiguousarray(img.transpose(2, 0, 1)) else: img = np.ascontiguousarray(img.transpose(3, 2, 0, 1)) results['img'] = DC(to_tensor(img), stack=True) for key in [ 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', 'gt_instance_ids', 'gt_match_indices', ]: if key not in results: continue results[key] = DC(to_tensor(results[key])) for key in ['img_metas', 'gt_masks']: if key in results: results[key] = DC(results[key], cpu_only=True) if 'gt_semantic_seg' in results: semantic_seg = results['gt_semantic_seg'] if len(semantic_seg.shape) == 2: semantic_seg = semantic_seg[None, ...] 
else: semantic_seg = np.ascontiguousarray( semantic_seg.transpose(3, 2, 0, 1)) results['gt_semantic_seg'] = DC( to_tensor(semantic_seg), stack=True) return results def __repr__(self): return self.__class__.__name__ @PIPELINES.register_module() class VideoCollect(object): """Collect data from the loader relevant to the specific task. Args: keys (Sequence[str]): Keys of results to be collected in ``data``. meta_keys (Sequence[str]): Meta keys to be converted to ``mmcv.DataContainer`` and collected in ``data[img_metas]``. Defaults to None. default_meta_keys (tuple): Default meta keys. Defaults to ('filename', 'ori_filename', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg', 'frame_id', 'is_video_data'). """ def __init__(self, keys, meta_keys=None, reject_empty=False, num_ref_imgs=0, # no_obj_class is added for handling non-0 no-obj class default_meta_keys=('filename', 'ori_filename', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg', 'video_id', 'frame_id', 'is_video_data', 'no_obj_class')): self.keys = keys self.meta_keys = default_meta_keys if meta_keys is not None: if isinstance(meta_keys, str): meta_keys = (meta_keys,) else: assert isinstance(meta_keys, tuple), \ 'meta_keys must be str or tuple' self.meta_keys += meta_keys self.reject_empty = reject_empty self.num_ref_imgs = num_ref_imgs def __call__(self, results): """Call function to collect keys in results. The keys in ``meta_keys`` and ``default_meta_keys`` will be converted to :obj:mmcv.DataContainer. Args: results (list[dict] | dict): List of dict or dict which contains the data to collect. 
Returns: list[dict] | dict: List of dict or dict that contains the following keys: - keys in ``self.keys`` - ``img_metas`` """ results_is_dict = isinstance(results, dict) if results_is_dict: results = [results] outs = [] for _results in results: _results = self._add_default_meta_keys(_results) _results = self._collect_meta_keys(_results) outs.append(_results) if results_is_dict: outs[0]['img_metas'] = DC(outs[0]['img_metas'], cpu_only=True) if self.reject_empty: if len(results[0]['gt_labels']) == 0: return None if self.num_ref_imgs > 0: if len(results) != self.num_ref_imgs + 1: raise NotImplementedError return outs[0] if results_is_dict else outs def _collect_meta_keys(self, results): """Collect `self.keys` and `self.meta_keys` from `results` (dict).""" data = {} img_meta = {} for key in self.meta_keys: if key in results: img_meta[key] = results[key] elif key in results['img_info']: img_meta[key] = results['img_info'][key] data['img_metas'] = img_meta for key in self.keys: data[key] = results[key] return data def _add_default_meta_keys(self, results): """Add default meta keys. We set default meta keys including `pad_shape`, `scale_factor` and `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and `Pad` are implemented during the whole pipeline. Args: results (dict): Result dict contains the data to convert. Returns: results (dict): Updated result dict contains the data to convert. """ img = results['img'] results.setdefault('pad_shape', img.shape) results.setdefault('scale_factor', 1.0) num_channels = 1 if len(img.shape) < 3 else img.shape[2] results.setdefault( 'img_norm_cfg', dict( mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False)) return results @PIPELINES.register_module() class ToList(object): """Use list to warp each value of the input dict. Args: results (dict): Result dict contains the data to convert. Returns: dict: Updated result dict contains the data to convert. 
""" def __call__(self, results): out = {} for k, v in results.items(): out[k] = [v] return out @PIPELINES.register_module() class ReIDFormatBundle(object): """ReID formatting bundle. It first concatenates common fields, then simplifies the pipeline of formatting common fields, including "img", and "gt_label". These fields are formatted as follows. - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True) - gt_labels: (1) to tensor, (2) to DataContainer """ def __init__(self, *args, **kwargs): super().__init__() def __call__(self, results): """ReID formatting bundle call function. Args: results (list[dict] or dict): List of dicts or dict. Returns: dict: The result dict contains the data that is formatted with ReID bundle. """ inputs = dict() if isinstance(results, list): assert len(results) > 1, \ 'the \'results\' only have one item, ' \ 'please directly use normal pipeline not \'Seq\' pipeline.' inputs['img'] = np.stack([_results['img'] for _results in results], axis=3) inputs['gt_label'] = np.stack( [_results['gt_label'] for _results in results], axis=0) elif isinstance(results, dict): inputs['img'] = results['img'] inputs['gt_label'] = results['gt_label'] else: raise TypeError('results must be a list or a dict.') outs = self.reid_format_bundle(inputs) return outs def reid_format_bundle(self, results): """Transform and format gt_label fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with ReID bundle. 
""" for key in results: if key == 'img': img = results[key] if img.ndim == 3: img = np.ascontiguousarray(img.transpose(2, 0, 1)) else: img = np.ascontiguousarray(img.transpose(3, 2, 0, 1)) results['img'] = DC(to_tensor(img), stack=True) elif key == 'gt_label': results[key] = DC( to_tensor(results[key]), stack=True, pad_dims=None) else: raise KeyError(f'key {key} is not supported') return results @PIPELINES.register_module() class ImageToTensorWithRef(object): def __init__(self, keys): self.keys = keys def __call__(self, results): for key in self.keys: if key in ['ref_img']: if isinstance(results[key], list): img_ref = [] for img in results[key]: img = np.ascontiguousarray(img.transpose(2, 0, 1)) img_ref.append(img) img_ref = np.array(img_ref) results[key] = to_tensor(img_ref) else: img = np.ascontiguousarray(results[key].transpose(2, 0, 1)) results[key] = to_tensor(img) else: results[key] = to_tensor(results[key].transpose(2, 0, 1)) return results def __repr__(self): return self.__class__.__name__ + '(keys={})'.format(self.keys) @PIPELINES.register_module() class LabelConsistentChecker: """This module is to make the annotations are consistent in each video. """ def __init__(self, num_frames=5): self.num_frames = num_frames def __call__(self, results): ref_gt_instance_ids = results['ref_gt_instance_ids'].data ins_mul_nframe = ref_gt_instance_ids.size(0) if ins_mul_nframe % self.num_frames != 0: return None num_ins = ins_mul_nframe // self.num_frames ins_id_bucket = torch.zeros((num_ins,), dtype=torch.float) for i in range(ins_mul_nframe): frame_cur = i // num_ins ins_cur = i % num_ins if ref_gt_instance_ids[i][0] != frame_cur: return None if frame_cur == 0: ins_id_bucket[ins_cur] = ref_gt_instance_ids[i][1] else: if ref_gt_instance_ids[i][1] != ins_id_bucket[ins_cur]: return None return results @PIPELINES.register_module() class MM2CLIP: """This module is to make the annotations are consistent in each video. 
""" def __init__(self, num_frames=5): self.num_frames = num_frames def __call__(self, results): ins_ids = np.unique(results[1]['gt_instance_ids'][:,1]) num_ins = len(ins_ids) num_frames = len(results[1]['img_metas']) ins_id_bucket = np.zeros((num_ins,), dtype=float) for i in range(num_ins * num_frames): frame_cur = i // num_ins ins_cur = i % num_ins if results[1]['gt_instance_ids'][i][0] != frame_cur: return None if frame_cur == 0: ins_id_bucket[ins_cur] = results[1]['gt_instance_ids'][i][1] else: if results[1]['gt_instance_ids'][i][1] != ins_id_bucket[ins_cur]: return None return results ================================================ FILE: mmtrack/pipelines/loading.py ================================================ import os.path as osp import numpy as np import mmcv from mmdet.core import BitmapMasks from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile @PIPELINES.register_module() class LoadMultiImagesFromFile(LoadImageFromFile): """Load multi images from file. Please refer to `mmdet.datasets.pipelines.loading.py:LoadImageFromFile` for detailed docstring. """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def __call__(self, results): """Call function. For each dict in `results`, call the call function of `LoadImageFromFile` to load image. Args: results (list[dict]): List of dict from :obj:`mmtrack.CocoVideoDataset`. Returns: list[dict]: List of dict that contains loaded image. """ outs = [] for _results in results: _results = super().__call__(_results) outs.append(_results) return outs @PIPELINES.register_module() class SeqLoadAnnotations(LoadAnnotations): """Sequence load annotations. Please refer to `mmdet.datasets.pipelines.loading.py:LoadAnnotations` for detailed docstring. Args: with_track (bool): If True, load instance ids of bboxes. 
""" def __init__(self, with_track=False, *args, **kwargs): super().__init__(*args, **kwargs) self.with_track = with_track def _load_track(self, results): """Private function to load label annotations. Args: results (dict): Result dict from :obj:`mmtrack.CocoVideoDataset`. Returns: dict: The dict contains loaded label annotations. """ results['gt_instance_ids'] = results['ann_info']['instance_ids'].copy() return results def __call__(self, results): """Call function. For each dict in results, call the call function of `LoadAnnotations` to load annotation. Args: results (list[dict]): List of dict that from :obj:`mmtrack.CocoVideoDataset`. Returns: list[dict]: List of dict that contains loaded annotations, such as bounding boxes, labels, instance ids, masks and semantic segmentation annotations. """ outs = [] for _results in results: _results = super().__call__(_results) if self.with_track: _results = self._load_track(_results) outs.append(_results) return outs @PIPELINES.register_module() class LoadRefImageFromFile(object): """ Code reading reference frame information. Specific to Cityscapes-VPS, Cityscapes, and VIPER datasets. """ def __init__(self, sample=True, to_float32=False): self.to_float32 = to_float32 self.sample = sample def __call__(self, results): # requires dirname for ref images assert results['ref_prefix'] is not None, 'ref_prefix must be specified.' filename = osp.join(results['img_prefix'], results['img_info']['filename']) img = mmcv.imread(filename) # if specified by another ref json file. 
def bitmasks2bboxes(bitmasks):
    """Compute the tight bounding box of every mask.

    Args:
        bitmasks: Object exposing a ``masks`` array that is indexed as
            (mask, row, column).

    Returns:
        np.ndarray: float32 array with one ``(x_min, y_min, x_max, y_max)``
        row per mask. The max coordinates are the index of the last set
        pixel (inclusive). Masks without any set pixel keep an all-zero row.
    """
    mask_stack = bitmasks.masks
    boxes = np.zeros((mask_stack.shape[0], 4), dtype=np.float32)
    # Collapse rows to find occupied columns, and columns to find occupied
    # rows, once for the whole stack.
    cols_hit = np.any(mask_stack, axis=1)
    rows_hit = np.any(mask_stack, axis=2)
    for idx in range(mask_stack.shape[0]):
        xs = np.where(cols_hit[idx, :])[0]
        ys = np.where(rows_hit[idx, :])[0]
        if len(xs) == 0 or len(ys) == 0:
            continue
        boxes[idx, :] = np.array((xs[0], ys[0], xs[-1], ys[-1]),
                                 dtype=np.float32)
    return boxes
""" img_bytes = self.file_client.get(results['ann_info']['inst_map']) inst_mask = mmcv.imfrombytes(img_bytes, flag='unchanged').squeeze() if self.with_inst: results['gt_instance_map'] = inst_mask.copy().astype(int) results['gt_instance_map'][inst_mask < 10000] *= 1000 if not self.with_mask: return results masks = [] labels = [] for inst_id in np.unique(inst_mask): if inst_id >= 10000: if self.cherry is not None and not (inst_id // 1000 in self.cherry): continue masks.append((inst_mask == inst_id).astype(int)) labels.append(inst_id // 1000) if len(masks) == 0: return None gt_masks = BitmapMasks(masks, height=inst_mask.shape[0], width=inst_mask.shape[1]) results['gt_masks'] = gt_masks results['mask_fields'].append('gt_masks') results['gt_labels'] = np.array(labels) boxes = bitmasks2bboxes(gt_masks) results['gt_bboxes'] = boxes results['bbox_fields'].append('gt_bboxes') return results def _load_semantic_seg(self, results): """Private function to load semantic segmentation annotations. Args: results (dict): Result dict from :obj:`dataset`. Returns: dict: The dict contains loaded semantic segmentation annotations. """ img_bytes = self.file_client.get(results['ann_info']['seg_map']) results['gt_semantic_seg'] = mmcv.imfrombytes( img_bytes, flag='unchanged').squeeze() results['seg_fields'].append('gt_semantic_seg') return results def __call__(self, results): """Call function to load multiple types annotations. Args: results (dict): Result dict from :obj:`mmdet.CustomDataset`. Returns: dict: The dict contains loaded bounding box, label, mask and semantic segmentation annotations. 
""" if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) if self.with_mask or self.with_inst: results = self._load_masks(results) if results is None: return None if self.with_seg: results = self._load_semantic_seg(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'with_mask={self.with_mask}, ' repr_str += f'with_seg={self.with_seg}, ' return repr_str ================================================ FILE: mmtrack/pipelines/test_time_aug.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings import mmcv from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import Compose @PIPELINES.register_module() class MultiScaleFlipAugVideo: """Test-time augmentation with multiple scales and flipping. An example configuration is as followed: .. code-block:: img_scale=[(1333, 400), (1333, 800)], flip=True, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ] After MultiScaleFLipAug with above configuration, the results are wrapped into lists of the same length as followed: .. code-block:: dict( img=[...], img_shape=[...], scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)] flip=[False, True, False, True] ... ) Args: transforms (list[dict]): Transforms to apply in each augmentation. img_scale (tuple | list[tuple] | None): Images scales for resizing. scale_factor (float | list[float] | None): Scale factors for resizing. flip (bool): Whether apply flip augmentation. Default: False. flip_direction (str | list[str]): Flip augmentation directions, options are "horizontal", "vertical" and "diagonal". If flip_direction is a list, multiple flip augmentations will be applied. It has no effect when flip == False. Default: "horizontal". 
""" def __init__(self, transforms, img_scale=None, scale_factor=None, flip=False, flip_direction='horizontal'): self.transforms = Compose(transforms) assert (img_scale is None) ^ (scale_factor is None), ( 'Must have but only one variable can be set') if img_scale is not None: self.img_scale = img_scale if isinstance(img_scale, list) else [img_scale] self.scale_key = 'scale' assert mmcv.is_list_of(self.img_scale, tuple) else: self.img_scale = scale_factor if isinstance( scale_factor, list) else [scale_factor] self.scale_key = 'scale_factor' self.flip = flip self.flip_direction = flip_direction if isinstance( flip_direction, list) else [flip_direction] assert mmcv.is_list_of(self.flip_direction, str) if not self.flip and self.flip_direction != ['horizontal']: warnings.warn( 'flip_direction has no effect when flip is set to False') if (self.flip and not any([t['type'] == 'RandomFlip' for t in transforms])): warnings.warn( 'flip has no effect when RandomFlip is not in transforms') def __call__(self, results): """Call function to apply test time augment transforms on results. Args: results (dict): Result dict contains the data to transform. Returns: dict[str: list]: The augmented data, where each value is wrapped into a list. 
""" aug_data = [] flip_args = [(False, None)] if self.flip: flip_args += [(True, direction) for direction in self.flip_direction] for scale in self.img_scale: for flip, direction in flip_args: _results = [] for results_single in results: _results_single = results_single.copy() _results_single[self.scale_key] = scale _results_single['flip'] = flip _results_single['flip_direction'] = direction _results.append(_results_single) data = self.transforms(_results) aug_data.append(data) # list of dict to dict of list aug_data_dict = {key: [] for key in aug_data[0]} for data in aug_data: for key, val in data.items(): aug_data_dict[key].append(val) return aug_data_dict def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(transforms={self.transforms}, ' repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' repr_str += f'flip_direction={self.flip_direction})' return repr_str ================================================ FILE: mmtrack/pipelines/transforms.py ================================================ import cv2 import mmcv import numpy as np import warnings from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import Normalize, Pad, RandomFlip, Resize @PIPELINES.register_module() class SeqColorAug(object): """Color augmention for images. Args: prob (list[float]): The probability to perform color augmention for each image. Defaults to [1.0, 1.0]. rgb_var (list[list]]): The values of color augmentaion. Defaults to [[-0.55919361, 0.98062831, -0.41940627], [1.72091413, 0.19879334, -1.82968581], [4.64467907, 4.73710203, 4.88324118]]. """ def __init__(self, prob=[1.0, 1.0], rgb_var=[[-0.55919361, 0.98062831, -0.41940627], [1.72091413, 0.19879334, -1.82968581], [4.64467907, 4.73710203, 4.88324118]]): self.prob = prob self.rgb_var = np.array(rgb_var, dtype=np.float32) def __call__(self, results): """Call function. For each dict in results, perform color augmention for image in the dict. 
@PIPELINES.register_module()
class SeqBlurAug(object):
    """Randomly blur the images of a sequence with a cross-shaped kernel.

    Args:
        prob (list[float]): Per-frame probability of blurring; entry ``i``
            is used for the ``i``-th dict of the sequence.
            Defaults to [0.0, 0.2].
    """

    def __init__(self, prob=[0.0, 0.2]):
        self.prob = prob

    @staticmethod
    def _sample_kernel():
        """Draw a random odd-sized (5..45) horizontal+vertical line kernel."""
        size = np.random.choice(np.arange(5, 46, 2))
        center = int(size / 2)
        # Share of the unit weight put on the vertical line; the remainder
        # goes to the horizontal line.
        weight_x = np.random.random()
        kernel = np.zeros((size, size))
        kernel[:, center] += 1. / size * weight_x
        kernel[center, :] += 1. / size * (1 - weight_x)
        return kernel

    def __call__(self, results):
        """Blur each frame with its own probability.

        Args:
            results (list[dict]): One result dict per frame, each holding
                the image under ``'img'``.

        Returns:
            list[dict]: The same dicts, with ``'img'`` replaced by the
            filtered image for the frames that were selected.
        """
        blurred = []
        for idx, frame in enumerate(results):
            image = frame['img']
            if self.prob[idx] > np.random.random():
                image = cv2.filter2D(image, -1, self._sample_kernel())
            frame['img'] = image
            blurred.append(frame)
        return blurred
@PIPELINES.register_module()
class SeqRandomFlip(RandomFlip):
    """Randomly flip the images of a sequence.

    Please refer to `mmdet.datasets.pipelines.transfroms.py:RandomFlip` for
    the forwarded ``*args`` / ``**kwargs``.

    Args:
        share_params (bool): If True, one flip decision (and direction) is
            drawn per call and written into every frame before the parent
            transform runs. This argument is required.
    """

    def __init__(self, share_params, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.share_params = share_params

    def _draw_shared_direction(self):
        """Sample one direction, or None for "do not flip"."""
        if isinstance(self.direction, list):
            candidates = self.direction + [None]
        else:
            candidates = [self.direction, None]
        num_flips = len(candidates) - 1

        if isinstance(self.flip_ratio, list):
            keep_prob = 1 - sum(self.flip_ratio)
            probs = self.flip_ratio + [keep_prob]
        else:
            keep_prob = 1 - self.flip_ratio
            # A scalar ratio is split evenly over the flip directions.
            per_direction = self.flip_ratio / num_flips
            probs = [per_direction] * num_flips + [keep_prob]

        return np.random.choice(candidates, p=probs)

    def __call__(self, results):
        """Flip every frame, optionally with a shared decision.

        Args:
            results (list[dict]): One result dict per frame.

        Returns:
            list[dict]: The per-frame outputs of `RandomFlip`, in input
            order. With ``share_params`` the keys ``'flip'`` and
            ``'flip_direction'`` are set on every frame beforehand.
        """
        if self.share_params:
            chosen = self._draw_shared_direction()
            do_flip = chosen is not None
            for frame in results:
                frame['flip'] = do_flip
                frame['flip_direction'] = chosen

        flip_one = super().__call__
        return [flip_one(frame) for frame in results]
share_params (bool, optional): Whether share the cropping parameters for the images. bbox_clip_border (bool, optional): Whether clip the objects outside the border of the image. Defaults to True. Note: - If the image is smaller than the absolute crop size, return the original image. - The keys for bboxes, labels and masks must be aligned. That is, `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and `gt_bboxes_ignore` corresponds to `gt_labels_ignore` and `gt_masks_ignore`. - If the crop does not contain any gt-bbox region and `allow_negative_crop` is set to False, skip this image. """ def __init__(self, crop_size, allow_negative_crop=False, share_params=False, bbox_clip_border=True, check_id_match=True ): assert crop_size[0] > 0 and crop_size[1] > 0 self.crop_size = crop_size self.allow_negative_crop = allow_negative_crop self.share_params = share_params self.bbox_clip_border = bbox_clip_border self.check_id_match = check_id_match # The key correspondence from bboxes to labels and masks. self.bbox2label = { 'gt_bboxes': ['gt_labels', 'gt_instance_ids'], 'gt_bboxes_ignore': ['gt_labels_ignore', 'gt_instance_ids_ignore'] } self.bbox2mask = { 'gt_bboxes': 'gt_masks', 'gt_bboxes_ignore': 'gt_masks_ignore' } def get_offsets(self, img): """Random generate the offsets for cropping.""" margin_h = max(img.shape[0] - self.crop_size[0], 0) margin_w = max(img.shape[1] - self.crop_size[1], 0) offset_h = np.random.randint(0, margin_h + 1) offset_w = np.random.randint(0, margin_w + 1) return offset_h, offset_w def random_crop(self, results, offsets=None): """Call function to randomly crop images, bounding boxes, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. offsets (tuple, optional): Pre-defined offsets for cropping. Default to None. Returns: dict: Randomly cropped results, 'img_shape' key in result dict is updated according to crop size. 
""" for key in results.get('img_fields', ['img']): img = results[key] if offsets is not None: offset_h, offset_w = offsets else: offset_h, offset_w = self.get_offsets(img) results['img_info']['crop_offsets'] = (offset_h, offset_w) crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0] crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1] # crop the image img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] img_shape = img.shape results[key] = img results['img_shape'] = img_shape # crop bboxes accordingly and clip to the image boundary for key in results.get('bbox_fields', []): # e.g. gt_bboxes and gt_bboxes_ignore bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h], dtype=np.float32) bboxes = results[key] - bbox_offset if self.bbox_clip_border: bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & ( bboxes[:, 3] > bboxes[:, 1]) # If the crop does not contain any gt-bbox area and # self.allow_negative_crop is False, skip this image. if (key == 'gt_bboxes' and not valid_inds.any() and not self.allow_negative_crop): return None results[key] = bboxes[valid_inds, :] # label fields. e.g. gt_labels and gt_labels_ignore label_keys = self.bbox2label.get(key) for label_key in label_keys: if label_key in results: results[label_key] = results[label_key][valid_inds] # mask fields, e.g. gt_masks and gt_masks_ignore mask_key = self.bbox2mask.get(key) if mask_key in results: results[mask_key] = results[mask_key][ valid_inds.nonzero()[0]].crop( np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) # crop semantic seg for key in results.get('seg_fields', []): results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2] return results def __call__(self, results): """Call function to sequentially randomly crop images, bounding boxes, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. 
Returns: dict: Randomly cropped results, 'img_shape' key in result dict is updated according to crop size. """ if self.share_params: offsets = self.get_offsets(results[0]['img']) else: offsets = None outs = [] for _results in results: _results = self.random_crop(_results, offsets) if _results is None: return None outs.append(_results) if len(outs) == 2 and self.check_id_match: ref_result, result = outs[1], outs[0] if self.check_match(ref_result, result): return None return outs def check_match(self, ref_results, results): ref_ids = ref_results['gt_instance_ids'].tolist() gt_ids = results['gt_instance_ids'].tolist() gt_pids = [ref_ids.index(i) if i in ref_ids else -1 for i in gt_ids] nomatch = (np.array(gt_pids) == -1).all() return nomatch @PIPELINES.register_module() class SeqPhotoMetricDistortion(object): """Apply photometric distortion to image sequentially, every transformation is applied with a probability of 0.5. The position of random contrast is in second or second to last. 1. random brightness 2. random contrast (mode 0) 3. convert color from BGR to HSV 4. random saturation 5. random hue 6. convert color from HSV to BGR 7. random contrast (mode 1) 8. randomly swap channels Args: brightness_delta (int): delta of brightness. contrast_range (tuple): range of contrast. saturation_range (tuple): range of saturation. hue_delta (int): delta of hue. 
""" def __init__(self, share_params=True, brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18): self.share_params = share_params self.brightness_delta = brightness_delta self.contrast_lower, self.contrast_upper = contrast_range self.saturation_lower, self.saturation_upper = saturation_range self.hue_delta = hue_delta def get_params(self): """Generate parameters.""" params = dict() # delta if np.random.randint(2): params['delta'] = np.random.uniform(-self.brightness_delta, self.brightness_delta) else: params['delta'] = None # mode mode = np.random.randint(2) params['contrast_first'] = True if mode == 1 else 0 # alpha if np.random.randint(2): params['alpha'] = np.random.uniform(self.contrast_lower, self.contrast_upper) else: params['alpha'] = None # saturation if np.random.randint(2): params['saturation'] = np.random.uniform(self.saturation_lower, self.saturation_upper) else: params['saturation'] = None # hue if np.random.randint(2): params['hue'] = np.random.uniform(-self.hue_delta, self.hue_delta) else: params['hue'] = None # swap if np.random.randint(2): params['permutation'] = np.random.permutation(3) else: params['permutation'] = None return params def photo_metric_distortion(self, results, params=None): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. params (dict, optional): Pre-defined parameters. Default to None. Returns: dict: Result dict with images distorted. 
""" if params is None: params = self.get_params() results['img_info']['color_jitter'] = params if 'img_fields' in results: assert results['img_fields'] == ['img'], \ 'Only single img_fields is allowed' img = results['img'] assert img.dtype == np.float32, \ 'PhotoMetricDistortion needs the input image of dtype np.float32,' \ ' please set "to_float32=True" in "LoadImageFromFile" pipeline' # random brightness if params['delta'] is not None: img += params['delta'] # mode == 0 --> do random contrast first # mode == 1 --> do random contrast last if params['contrast_first']: if params['alpha'] is not None: img *= params['alpha'] # convert color from BGR to HSV img = mmcv.bgr2hsv(img) # random saturation if params['saturation'] is not None: img[..., 1] *= params['saturation'] # random hue if params['hue'] is not None: img[..., 0] += params['hue'] img[..., 0][img[..., 0] > 360] -= 360 img[..., 0][img[..., 0] < 0] += 360 # convert color from HSV to BGR img = mmcv.hsv2bgr(img) # random contrast if not params['contrast_first']: if params['alpha'] is not None: img *= params['alpha'] # randomly swap channels if params['permutation'] is not None: img = img[..., params['permutation']] results['img'] = img return results def __call__(self, results): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with images distorted. 
""" if self.share_params: params = self.get_params() else: params = None outs = [] for _results in results: _results = self.photo_metric_distortion(_results, params) outs.append(_results) return outs def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' repr_str += 'contrast_range=' repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' repr_str += 'saturation_range=' repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' repr_str += f'hue_delta={self.hue_delta})' return repr_str @PIPELINES.register_module() class ResizeWithRef(object): """Resize images & bbox & mask. This transform resizes the input image to some scale. Bboxes and masks are then resized with the same scale factor. If the input dict contains the key "scale", then the scale in the input dict is used, otherwise the specified scale in the init method is used. `img_scale` can either be a tuple (single-scale) or a list of tuple (multi-scale). There are 3 multiscale modes: - `ratio_range` is not None: randomly sample a ratio from the ratio range and multiply it with the image scale. - `ratio_range` is None and `multiscale_mode` == "range": randomly sample a scale from the a range. - `ratio_range` is None and `multiscale_mode` == "value": randomly sample a scale from multiple scales. Args: img_scale (tuple or list[tuple]): Images scales for resizing. multiscale_mode (str): Either "range" or "value". ratio_range (tuple[float]): (min_ratio, max_ratio) keep_ratio (bool): Whether to keep the aspect ratio when resizing the image. 
""" def __init__(self, img_scale=None, multiscale_mode='range', ratio_range=None, keep_ratio=True): if img_scale is None: self.img_scale = None else: if isinstance(img_scale, list): self.img_scale = img_scale else: self.img_scale = [img_scale] assert mmcv.is_list_of(self.img_scale, tuple) if ratio_range is not None: # mode 1: given a scale and a range of image ratio assert len(self.img_scale) == 1 else: # mode 2: given multiple scales or a range of scales assert multiscale_mode in ['value', 'range'] self.multiscale_mode = multiscale_mode self.ratio_range = ratio_range self.keep_ratio = keep_ratio @staticmethod def random_select(img_scales): assert mmcv.is_list_of(img_scales, tuple) scale_idx = np.random.randint(len(img_scales)) img_scale = img_scales[scale_idx] return img_scale, scale_idx @staticmethod def random_sample(img_scales): assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 img_scale_long = [max(s) for s in img_scales] img_scale_short = [min(s) for s in img_scales] long_edge = np.random.randint( min(img_scale_long), max(img_scale_long) + 1) short_edge = np.random.randint( min(img_scale_short), max(img_scale_short) + 1) img_scale = (long_edge, short_edge) return img_scale, None @staticmethod def random_sample_ratio(img_scale, ratio_range): assert isinstance(img_scale, tuple) and len(img_scale) == 2 min_ratio, max_ratio = ratio_range assert min_ratio <= max_ratio ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) return scale, None def _random_scale(self, results): if self.ratio_range is not None: scale, scale_idx = self.random_sample_ratio( self.img_scale[0], self.ratio_range) elif len(self.img_scale) == 1: scale, scale_idx = self.img_scale[0], 0 elif self.multiscale_mode == 'range': scale, scale_idx = self.random_sample(self.img_scale) elif self.multiscale_mode == 'value': scale, scale_idx = self.random_select(self.img_scale) else: raise NotImplementedError 
results['scale'] = scale results['scale_idx'] = scale_idx def _resize_img(self, results): els = ['ref_img', 'img'] if 'ref_img' in results else ['img'] for el in els: if self.keep_ratio: img, scale_factor = mmcv.imrescale( results[el], results['scale'], return_scale=True) else: img, w_scale, h_scale = mmcv.imresize( results[el], results['scale'], return_scale=True) scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], dtype=np.float32) results[el] = img results['img_shape'] = img.shape results['pad_shape'] = img.shape # in case that there is no padding results['scale_factor'] = scale_factor results['keep_ratio'] = self.keep_ratio def _resize_bboxes(self, results): els = ['ref_bbox_fields', 'bbox_fields'] if 'ref_bbox_fields' in results else ['bbox_fields'] for el in els: img_shape = results['img_shape'] for key in results.get(el, []): bboxes = results[key] * results['scale_factor'] bboxes[:, 0::2] = np.clip( bboxes[:, 0::2], 0, img_shape[1] - 1) bboxes[:, 1::2] = np.clip( bboxes[:, 1::2], 0, img_shape[0] - 1) results[key] = bboxes def _resize_masks(self, results): els = ['ref_mask_fields', 'mask_fields'] if 'ref_mask_fields' in results else ['mask_fields'] for el in els: for key in results.get(el, []): if results[key] is None: continue if self.keep_ratio: masks = [ mmcv.imrescale( mask, results['scale_factor'], interpolation='nearest') for mask in results[key] ] else: mask_size = (results['img_shape'][1], results['img_shape'][0]) masks = [ mmcv.imresize(mask, mask_size, interpolation='nearest') for mask in results[key] ] results[key] = masks def __call__(self, results): if 'scale' not in results: self._random_scale(results) self._resize_img(results) self._resize_bboxes(results) self._resize_masks(results) # self._resize_semantic_seg(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += ('(img_scale={}, multiscale_mode={}, ratio_range={}, ' 'keep_ratio={})').format(self.img_scale, self.multiscale_mode, self.ratio_range, 
@PIPELINES.register_module()
class PadWithRef(object):
    """Pad the image & mask (and the reference image/masks when present).

    There are two padding modes: (1) pad to a fixed size and (2) pad to the
    minimum size that is divisible by some number.

    Args:
        size (tuple, optional): Fixed padding size.
        size_divisor (int, optional): The divisor of padded size.
        pad_val (float, optional): Padding value, 0 by default. It is used
            in both padding modes and for the masks.
    """

    def __init__(self, size=None, size_divisor=None, pad_val=0):
        self.size = size
        self.size_divisor = size_divisor
        self.pad_val = pad_val
        # only one of size and size_divisor should be valid
        assert size is not None or size_divisor is not None
        assert size is None or size_divisor is None

    def _pad_img(self, results):
        """Pad ``'img'`` (and ``'ref_img'`` if present) in place.

        Sets ``'pad_shape'`` (shape of the last padded image, i.e. ``'img'``),
        ``'pad_fixed_size'`` and ``'pad_size_divisor'``.
        """
        els = ['ref_img', 'img'] if 'ref_img' in results else ['img']
        for el in els:
            if self.size is not None:
                # Pad the element being processed (not always 'img', which
                # used to overwrite 'ref_img' with the current frame), and
                # pass the target shape by keyword as mmcv.impad expects.
                padded_img = mmcv.impad(
                    results[el], shape=self.size, pad_val=self.pad_val)
            elif self.size_divisor is not None:
                padded_img = mmcv.impad_to_multiple(
                    results[el], self.size_divisor, pad_val=self.pad_val)
            results[el] = padded_img
        results['pad_shape'] = padded_img.shape
        results['pad_fixed_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor

    def _pad_masks(self, results):
        """Pad every registered mask to ``results['pad_shape']`` and stack.

        Each mask field becomes one array with the masks stacked along
        axis 0. A field without masks becomes an empty ``(0, h, w)`` uint8
        array instead of making ``np.stack`` fail.
        """
        els = (['ref_mask_fields', 'mask_fields']
               if 'ref_mask_fields' in results else ['mask_fields'])
        for el in els:
            pad_shape = results['pad_shape'][:2]
            for key in results.get(el, []):
                padded_masks = [
                    mmcv.impad(mask, shape=pad_shape, pad_val=self.pad_val)
                    for mask in results[key]
                ]
                if padded_masks:
                    results[key] = np.stack(padded_masks, axis=0)
                else:
                    results[key] = np.empty(
                        (0, ) + tuple(pad_shape), dtype=np.uint8)

    def __call__(self, results):
        """Pad images first (this defines ``pad_shape``), then the masks.

        Args:
            results (dict): Result dict of a frame, optionally carrying
                ``ref_img`` / ``ref_mask_fields``.

        Returns:
            dict: The same dict, updated in place.
        """
        self._pad_img(results)
        self._pad_masks(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += '(size={}, size_divisor={}, pad_val={})'.format(
            self.size, self.size_divisor, self.pad_val)
        return repr_str
""" def __init__(self, mean, std, to_rgb=True): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) self.to_rgb = to_rgb def __call__(self, results): results['img'] = mmcv.imnormalize( results['img'], self.mean, self.std, self.to_rgb) if 'ref_img' in results: results['ref_img'] = mmcv.imnormalize( results['ref_img'], self.mean, self.std, self.to_rgb) results['img_norm_cfg'] = dict( mean=self.mean, std=self.std, to_rgb=self.to_rgb) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += '(mean={}, std={}, to_rgb={})'.format( self.mean, self.std, self.to_rgb) return repr_str @PIPELINES.register_module() class RandomCropWithRef(object): """Random crop the image & bboxes & masks. Args: crop_size (tuple): Expected size after cropping, (h, w). """ def __init__(self, crop_size): self.crop_size = crop_size def __call__(self, results): img = results['img'] margin_h = max(img.shape[0] - self.crop_size[0], 0) margin_w = max(img.shape[1] - self.crop_size[1], 0) offset_h = np.random.randint(0, margin_h + 1) offset_w = np.random.randint(0, margin_w + 1) crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0] crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1] # crop the image ori_shape = img.shape img = img[crop_y1:crop_y2, crop_x1:crop_x2, :] img_shape = img.shape results['img'] = img if 'ref_img' in results: ref_img = results['ref_img'] ref_img = ref_img[crop_y1:crop_y2, crop_x1:crop_x2, :] results['ref_img'] = ref_img results['img_shape'] = img_shape results['crop_coords'] = [crop_y1, crop_y2, crop_x1, crop_x2] # crop bboxes accordingly and clip to the image boundary els = ['ref_bbox_fields', 'bbox_fields'] if 'ref_bbox_fields' in results else ['bbox_fields'] for el in els: for key in results.get(el, []): bbox_offset = np.array( [offset_w, offset_h, offset_w, offset_h], dtype=np.float32) bboxes = results[key] - bbox_offset bboxes[:, 0::2] = np.clip( bboxes[:, 0::2], 0, img_shape[1] - 1) bboxes[:, 1::2] = 
np.clip( bboxes[:, 1::2], 0, img_shape[0] - 1) results[key] = bboxes # filter out the gt bboxes that are completely cropped els = ['ref_bboxes', 'gt_bboxes'] if 'ref_bboxes' in results else ['gt_bboxes'] for el in els: if el in results: gt_bboxes = results[el] valid_inds = (gt_bboxes[:, 2] > gt_bboxes[:, 0]) & ( gt_bboxes[:, 3] > gt_bboxes[:, 1]) # if no gt bbox remains after cropping, just skip this image if not np.any(valid_inds): return None results[el] = gt_bboxes[valid_inds, :] ell = el.replace('_bboxes', '_labels') if ell in results: results[ell] = results[ell][valid_inds] #### filter gt_obj_ids just like gt_labes. elo = el.replace('_bboxes', '_obj_ids') if elo in results: results[elo] = results[elo][valid_inds] # filter and crop the masks elm = el.replace('_bboxes', '_masks') if elm in results: valid_gt_masks = [] for i in np.where(valid_inds)[0]: gt_mask = results[elm][i][ crop_y1:crop_y2, crop_x1:crop_x2] valid_gt_masks.append(gt_mask) results[elm] = valid_gt_masks return results def __repr__(self): return self.__class__.__name__ + '(crop_size={})'.format( self.crop_size) @PIPELINES.register_module() class PadFutureMMDet: """Pad the image & masks & segmentation map. There are two padding modes: (1) pad to a fixed size and (2) pad to the minimum size that is divisible by some number. Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", Args: size (tuple, optional): Fixed padding size. size_divisor (int, optional): The divisor of padded size. pad_to_square (bool): Whether to pad the image into a square. Currently only used for YOLOX. Default: False. pad_val (dict, optional): A dict for padding value, the default value is `dict(img=0, masks=0, seg=255)`. 
""" def __init__(self, size=None, size_divisor=None, pad_to_square=False, pad_val=dict(img=0, masks=0, seg=255)): self.size = size self.size_divisor = size_divisor if isinstance(pad_val, float) or isinstance(pad_val, int): warnings.warn( 'pad_val of float type is deprecated now, ' f'please use pad_val=dict(img={pad_val}, ' f'masks={pad_val}, seg=255) instead.', DeprecationWarning) pad_val = dict(img=pad_val, masks=pad_val, seg=255) assert isinstance(pad_val, dict) self.pad_val = pad_val self.pad_to_square = pad_to_square if pad_to_square: assert size is None and size_divisor is None, \ 'The size and size_divisor must be None ' \ 'when pad2square is True' else: assert size is not None or size_divisor is not None, \ 'only one of size and size_divisor should be valid' assert size is None or size_divisor is None def _pad_img(self, results): """Pad images according to ``self.size``.""" pad_val = self.pad_val.get('img', 0) for key in results.get('img_fields', ['img']): if self.pad_to_square: max_size = max(results[key].shape[:2]) self.size = (max_size, max_size) if self.size is not None: padded_img = mmcv.impad( results[key], shape=self.size, pad_val=pad_val) elif self.size_divisor is not None: padded_img = mmcv.impad_to_multiple( results[key], self.size_divisor, pad_val=pad_val) results[key] = padded_img results['pad_shape'] = padded_img.shape results['pad_fixed_size'] = self.size results['pad_size_divisor'] = self.size_divisor def _pad_masks(self, results): """Pad masks according to ``results['pad_shape']``.""" pad_shape = results['pad_shape'][:2] pad_val = self.pad_val.get('masks', 0) for key in results.get('mask_fields', []): results[key] = results[key].pad(pad_shape, pad_val=pad_val) def _pad_seg(self, results): """Pad semantic segmentation map according to ``results['pad_shape']``.""" pad_val = self.pad_val.get('seg', 255) for key in results.get('seg_fields', []): results[key] = mmcv.impad( results[key], shape=results['pad_shape'][:2], pad_val=pad_val) def 
def outs2results(bboxes=None,
                 labels=None,
                 masks=None,
                 ids=None,
                 num_classes=None,
                 **kwargs):
    """Convert tracking/detection results to a list of numpy arrays.

    Args:
        bboxes (torch.Tensor | np.ndarray): shape (n, 5)
        labels (torch.Tensor | np.ndarray): shape (n, )
        masks (torch.Tensor | np.ndarray): shape (n, h, w)
        ids (torch.Tensor | np.ndarray): shape (n, )
        num_classes (int): class number, not including background class

    Returns:
        dict[str : list(ndarray) | list[list[np.ndarray]]]: tracking/detection
        results of each class. It may contain keys as belows:

        - bbox_results (list[np.ndarray]): Each list denotes bboxes of one
            category. Present only when ``bboxes`` is given.
        - mask_results (list[list[np.ndarray]]): Each outer list denotes masks
            of one category. Each inner list denotes one mask belonging to
            the category. Each mask has shape (h, w). Present only when
            ``masks`` is given.
    """
    assert labels is not None
    assert num_classes is not None

    def _to_numpy(data):
        # Tensors are detached and moved to CPU; anything else is passed on.
        if isinstance(data, torch.Tensor):
            return data.detach().cpu().numpy()
        return data

    results = dict()

    if ids is not None:
        # Entries with a negative id are dropped from every output.
        valid_inds = ids > -1
        ids = ids[valid_inds]
        labels = labels[valid_inds]

    if bboxes is not None:
        if ids is not None:
            bboxes = bboxes[valid_inds]
            if bboxes.shape[0] == 0:
                bbox_results = [
                    np.zeros((0, 6), dtype=np.float32)
                    for _ in range(num_classes)
                ]
            else:
                bboxes = _to_numpy(bboxes)
                labels = _to_numpy(labels)
                ids = _to_numpy(ids)
                # Each row becomes (id, *bbox), grouped by class label.
                bbox_results = [
                    np.concatenate(
                        (ids[labels == i, None], bboxes[labels == i, :]),
                        axis=1) for i in range(num_classes)
                ]
        else:
            bbox_results = bbox2result(bboxes, labels, num_classes)
        results['bbox_results'] = bbox_results

    if masks is not None:
        if ids is not None:
            masks = masks[valid_inds]
        masks = _to_numpy(masks)
        mask_labels = _to_numpy(labels)
        masks_results = [[] for _ in range(num_classes)]
        # Pair masks with labels directly so that mask-only calls
        # (bboxes=None) work instead of failing on ``bboxes.shape``.
        for label, mask in zip(mask_labels, masks):
            masks_results[int(label)].append(mask)
        results['mask_results'] = masks_results

    return results
def _move_sequence_files(seq_id, input_dir, output_dir, suffix):
    """Move all files of one sequence folder into ``output_dir``.

    Files found in ``<input_dir>/<seq_id:04d>/`` are moved to
    ``<output_dir>/<seq_id:06d>_<stem>_<suffix>.png``, where ``stem`` is the
    part of the file name before its first dot.

    Args:
        seq_id (int): Sequence index.
        input_dir (str): Directory holding one sub-folder per sequence.
        output_dir (str): Destination directory; created if missing.
        suffix (str): Name suffix placed before ``.png``.
    """
    seq_dir = os.path.join(input_dir, '{:04d}'.format(seq_id))
    print("Preparing seq id : {}".format(seq_id))
    file_names = sorted(os.listdir(seq_dir))
    print("Dst dir is {}".format(output_dir))
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)
    for file_name in file_names:
        dst = os.path.join(
            output_dir,
            '{:06d}_{}_{}.png'.format(seq_id, file_name.split('.')[0], suffix))
        print(dst)
        shutil.move(os.path.join(seq_dir, file_name), dst)


def build_panoptic(seq_id, input_dir, output_dir):
    """Move the panoptic maps of sequence ``seq_id`` into ``output_dir``.

    Each file is renamed to ``<seq_id:06d>_<frame>_panoptic.png``.
    """
    _move_sequence_files(seq_id, input_dir, output_dir, 'panoptic')


def build_img(seq_id, input_dir, output_dir):
    """Move the images of sequence ``seq_id`` into ``output_dir``.

    Each file is renamed to ``<seq_id:06d>_<frame>_leftImg8bit.png``.
    """
    _move_sequence_files(seq_id, input_dir, output_dir, 'leftImg8bit')
def id2rgb(id_map):
    """Split ids into three base-256 digits, least significant digit first.

    For an ``np.ndarray`` input a new ``uint8`` array with an extra trailing
    axis of size 3 is returned and the input is left untouched. For any other
    input a list of three values is returned.
    """
    if not isinstance(id_map, np.ndarray):
        digits = []
        remaining = id_map
        while len(digits) < 3:
            digits.append(remaining % 256)
            remaining //= 256
        return digits

    # Work on a copy because the floor division below is done in place.
    remaining = id_map.copy()
    rgb_map = np.zeros(tuple(id_map.shape) + (3,), dtype=np.uint8)
    for channel in range(3):
        rgb_map[..., channel] = remaining % 256
        remaining //= 256
    return rgb_map
class Bottleneck(_Bottleneck):
    r"""Bottleneck for the ResNet backbone in DetectoRS.

    On top of the base bottleneck, ``conv2`` can be rebuilt from a SAC
    (Switchable Atrous Convolution) config, and an extra 1x1 conv can be
    added that projects RFP (Recursive Feature Pyramid) features before they
    are summed onto the block output.

    Args:
        inplanes (int): The number of input channels.
        planes (int): The number of output channels before expansion.
        rfp_inplanes (int, optional): The number of channels from RFP.
            Default: None. If specified, an additional conv layer will be
            added for ``rfp_feat``. Otherwise, the structure is the same as
            base class.
        sac (dict, optional): Dictionary to construct SAC. Default: None.
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Default: None
    """
    expansion = 4

    def __init__(self,
                 inplanes,
                 planes,
                 rfp_inplanes=None,
                 sac=None,
                 init_cfg=None,
                 **kwargs):
        super().__init__(inplanes, planes, init_cfg=init_cfg, **kwargs)

        assert sac is None or isinstance(sac, dict)
        self.sac = sac
        self.with_sac = sac is not None
        if self.with_sac:
            # Replace the 3x3 conv of the base block by the SAC variant.
            self.conv2 = build_conv_layer(
                self.sac,
                planes,
                planes,
                kernel_size=3,
                stride=self.conv2_stride,
                padding=self.dilation,
                dilation=self.dilation,
                bias=False)

        self.rfp_inplanes = rfp_inplanes
        if self.rfp_inplanes:
            self.rfp_conv = build_conv_layer(
                None,
                self.rfp_inplanes,
                planes * self.expansion,
                1,
                stride=1,
                bias=True)
            # TODO : Is this a bug ?
            # NOTE(review): nesting assumed to match upstream mmdet, i.e. the
            # zero-init override is only set when rfp_conv exists - confirm.
            if init_cfg is None:
                self.init_cfg = dict(
                    type='Constant', val=0, override=dict(name='rfp_conv'))

    def rfp_forward(self, x, rfp_feat):
        """The forward function that also takes the RFP features as input."""

        def _residual_branch(inp):
            # conv1 -> conv2 -> conv3 with optional plugins after each stage,
            # then the (possibly downsampled) identity is added.
            feat = self.relu(self.norm1(self.conv1(inp)))
            if self.with_plugins:
                feat = self.forward_plugin(feat,
                                           self.after_conv1_plugin_names)

            feat = self.relu(self.norm2(self.conv2(feat)))
            if self.with_plugins:
                feat = self.forward_plugin(feat,
                                           self.after_conv2_plugin_names)

            feat = self.norm3(self.conv3(feat))
            if self.with_plugins:
                feat = self.forward_plugin(feat,
                                           self.after_conv3_plugin_names)

            shortcut = inp if self.downsample is None else self.downsample(inp)
            feat += shortcut
            return feat

        use_checkpoint = self.with_cp and x.requires_grad
        if use_checkpoint:
            out = cp.checkpoint(_residual_branch, x)
        else:
            out = _residual_branch(x)

        if self.rfp_inplanes:
            out = out + self.rfp_conv(rfp_feat)

        return self.relu(out)
@BACKBONES.register_module()
class DetectoRS_ResNet_Custom(ResNet):
    """ResNet backbone for DetectoRS.

    Args:
        sac (dict, optional): Dictionary to construct SAC (Switchable Atrous
            Convolution). Default: None.
        stage_with_sac (list): Which stage to use sac. Default: (False, False,
            False, False).
        rfp_inplanes (int, optional): The number of channels from RFP.
            Default: None. If specified, an additional conv layer will be
            added for ``rfp_feat``. Otherwise, the structure is the same as
            base class.
        output_img (bool): If ``True``, the input image will be inserted into
            the starting position of output. Default: False.
        pretrained (None): Kept for signature compatibility only; any value
            other than ``None`` is rejected.
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Default: None
    """

    arch_settings = {
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3))
    }

    def __init__(self,
                 sac=None,
                 stage_with_sac=(False, False, False, False),
                 rfp_inplanes=None,
                 output_img=False,
                 pretrained=None,
                 init_cfg=None,
                 **kwargs):
        assert not (init_cfg and pretrained), \
            'init_cfg and pretrained cannot be specified at the same time'
        assert pretrained is None, "pretrained is not supported anymore"
        # init_weights() reads ``self.pretrained``; store it so that the
        # fallback branch there does not fail with AttributeError.
        self.pretrained = pretrained
        self.sac = sac
        self.stage_with_sac = stage_with_sac
        self.rfp_inplanes = rfp_inplanes
        self.output_img = output_img
        super().__init__(init_cfg=init_cfg, **kwargs)

        # Rebuild every residual stage with the DetectoRS ResLayer so that
        # SAC / RFP options reach the blocks.
        self.inplanes = self.stem_channels
        self.res_layers = []
        for i, num_blocks in enumerate(self.stage_blocks):
            stride = self.strides[i]
            dilation = self.dilations[i]
            stage_dcn = self.dcn if self.stage_with_dcn[i] else None
            stage_sac = self.sac if self.stage_with_sac[i] else None
            if self.plugins is not None:
                stage_plugins = self.make_stage_plugins(self.plugins, i)
            else:
                stage_plugins = None
            planes = self.base_channels * 2 ** i
            res_layer = self.make_res_layer(
                block=self.block,
                inplanes=self.inplanes,
                planes=planes,
                num_blocks=num_blocks,
                stride=stride,
                dilation=dilation,
                style=self.style,
                avg_down=self.avg_down,
                with_cp=self.with_cp,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg,
                dcn=stage_dcn,
                sac=stage_sac,
                # The first stage never receives RFP features.
                rfp_inplanes=rfp_inplanes if i > 0 else None,
                plugins=stage_plugins)
            self.inplanes = planes * self.block.expansion
            layer_name = f'layer{i + 1}'
            self.add_module(layer_name, res_layer)
            self.res_layers.append(layer_name)

        self._freeze_stages()

    # In order to be properly initialized by RFP
    def init_weights(self):
        """Initialize weights from ``init_cfg`` or with the default scheme."""
        # Calling this method will cause parameter initialization exception
        # super(DetectoRS_ResNet, self).init_weights()
        if self.init_cfg is not None:
            # Skip ResNet.init_weights and use the grandparent implementation.
            super(ResNet, self).init_weights()
        elif self.pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)
                elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
                    constant_init(m, 1)

            if self.dcn is not None:
                for m in self.modules():
                    if isinstance(m, Bottleneck) and hasattr(
                            m.conv2, 'conv_offset'):
                        constant_init(m.conv2.conv_offset, 0)

            if self.zero_init_residual:
                for m in self.modules():
                    if isinstance(m, Bottleneck):
                        constant_init(m.norm3, 0)
                    elif isinstance(m, BasicBlock):
                        constant_init(m.norm2, 0)
        else:
            raise TypeError('pretrained must be a str or None')

    def make_res_layer(self, **kwargs):
        """Pack all blocks in a stage into a ``ResLayer`` for DetectoRS."""
        return ResLayer(**kwargs)

    def forward(self, x):
        """Forward function.

        Returns the stage outputs as a tuple; when ``output_img`` is set the
        input ``x`` is placed first.
        """
        outs = list(super().forward(x))
        if self.output_img:
            outs.insert(0, x)
        return tuple(outs)

    def rfp_forward(self, x, rfp_feats):
        """Forward function for RFP.

        ``rfp_feats[i]`` is passed to every block of stage ``i`` for
        ``i > 0``; stage 0 receives ``None``.
        """
        if self.deep_stem:
            x = self.stem(x)
        else:
            x = self.conv1(x)
            x = self.norm1(x)
            x = self.relu(x)
        x = self.maxpool(x)
        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            rfp_feat = rfp_feats[i] if i > 0 else None
            for layer in res_layer:
                x = layer.rfp_forward(x, rfp_feat)
            if i in self.out_indices:
                outs.append(x)
        return tuple(outs)
def pvt_convert(ckpt):
    """Rename the keys of an original PVT state dict to the layout used here.

    Classification-only entries (``head*``, ``norm.*``, ``cls_token*``) are
    dropped. ``attn.q`` and ``attn.kv`` weights are concatenated into a single
    ``attn.in_proj_*`` entry, and ``mlp.fc*.weight`` tensors get two trailing
    singleton dimensions.

    Args:
        ckpt (Mapping[str, torch.Tensor]): Original state dict.

    Returns:
        OrderedDict: Converted state dict.
    """
    new_ckpt = OrderedDict()
    # With absolute position embeddings the embedding occupies index 0 of a
    # stage, so block indices shift by one. With a depth-wise conv in the FFN,
    # fc2 sits one slot later.
    use_abs_pos_embed = any(k.startswith('pos_embed') for k in ckpt)
    use_conv_ffn = any('dwconv' in k for k in ckpt)

    for k, v in ckpt.items():
        if k.startswith(('head', 'norm.', 'cls_token')):
            continue
        if k.startswith('pos_embed'):
            stage_i = int(k.replace('pos_embed', ''))
            new_k = k.replace(f'pos_embed{stage_i}',
                              f'layers.{stage_i - 1}.1.0.pos_embed')
            if stage_i == 4 and v.size(1) == 50:  # 1 (cls token) + 7 * 7
                new_v = v[:, 1:, :]  # remove cls token
            else:
                new_v = v
        elif k.startswith('patch_embed'):
            stage_i = int(k.split('.')[0].replace('patch_embed', ''))
            new_k = k.replace(f'patch_embed{stage_i}',
                              f'layers.{stage_i - 1}.0')
            new_v = v
            if 'proj.' in new_k:
                new_k = new_k.replace('proj.', 'projection.')
        elif k.startswith('block'):
            stage_i = int(k.split('.')[0].replace('block', ''))
            layer_i = int(k.split('.')[1])
            new_layer_i = layer_i + use_abs_pos_embed
            new_k = k.replace(f'block{stage_i}.{layer_i}',
                              f'layers.{stage_i - 1}.1.{new_layer_i}')
            new_v = v
            if 'attn.q.' in new_k:
                # Fuse q with the matching kv tensor into one in_proj tensor.
                sub_item_k = k.replace('q.', 'kv.')
                new_k = new_k.replace('q.', 'attn.in_proj_')
                new_v = torch.cat([v, ckpt[sub_item_k]], dim=0)
            elif 'attn.kv.' in new_k:
                # Already merged into the q entry above.
                continue
            elif 'attn.proj.' in new_k:
                new_k = new_k.replace('proj.', 'attn.out_proj.')
            elif 'attn.sr.' in new_k:
                # Spatial-reduction layers keep their name.
                pass
            elif 'mlp.' in new_k:
                new_k = new_k.replace('mlp.', 'ffn.layers.')
                if 'fc1.weight' in new_k or 'fc2.weight' in new_k:
                    new_v = v.reshape((*v.shape, 1, 1))
                new_k = new_k.replace('fc1.', '0.')
                new_k = new_k.replace('dwconv.dwconv.', '1.')
                if use_conv_ffn:
                    new_k = new_k.replace('fc2.', '4.')
                else:
                    new_k = new_k.replace('fc2.', '3.')
        elif k.startswith('norm'):
            stage_i = int(k[4])
            new_k = k.replace(f'norm{stage_i}', f'layers.{stage_i - 1}.2')
            new_v = v
        else:
            new_k = k
            new_v = v
        new_ckpt[new_k] = new_v

    return new_ckpt
class Mlp(nn.Module):
    """Feed-forward block: linear, depth-wise conv, activation, linear.

    Dropout is applied after the activation and after the second linear
    layer. ``hidden_features`` and ``out_features`` fall back to
    ``in_features`` when not given.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None,
                 act_layer=nn.GELU, drop=0.):
        super().__init__()
        hidden = hidden_features or in_features
        out = out_features or in_features
        self.fc1 = nn.Linear(in_features, hidden)
        self.dwconv = DWConv(hidden)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden, out)
        self.drop = nn.Dropout(drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialize one sub-module according to its layer type."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if isinstance(m, nn.Conv2d):
            # Normal init with variance 2 / fan_out (fan_out per group).
            kernel_h, kernel_w = m.kernel_size[0], m.kernel_size[1]
            fan_out = kernel_h * kernel_w * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        hidden = self.dwconv(self.fc1(x), H, W)
        hidden = self.drop(self.act(hidden))
        return self.drop(self.fc2(hidden))
class Block(nn.Module):
    """Transformer block: pre-norm attention and pre-norm MLP, each added
    back to the input through an optional drop-path."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False,
                 qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            sr_ratio=sr_ratio)
        # NOTE: drop path for stochastic depth, we shall see if this is
        # better than dropout here
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialize one sub-module according to its layer type."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if isinstance(m, nn.Conv2d):
            # Normal init with variance 2 / fan_out (fan_out per group).
            kernel_h, kernel_w = m.kernel_size[0], m.kernel_size[1]
            fan_out = kernel_h * kernel_w * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        attn_out = self.attn(self.norm1(x), H, W)
        x = x + self.drop_path(attn_out)
        mlp_out = self.mlp(self.norm2(x), H, W)
        x = x + self.drop_path(mlp_out)
        return x
class MixVisionTransformer(BaseModule):
    """Four-stage hierarchical transformer backbone.

    Every stage is an ``OverlapPatchEmbed`` (strides 4, 2, 2, 2) followed by a
    stack of ``Block`` modules and a normalization layer. ``forward`` returns
    the four stage outputs as ``(B, C_i, H_i, W_i)`` feature maps.

    Args:
        img_size (int): Nominal input size handed to the patch embeddings.
        patch_size (int): Accepted for interface compatibility; the patch
            embeddings below use fixed kernel sizes (7, 3, 3, 3).
        in_chans (int): Number of input image channels.
        num_classes (int): Stored on the module; no classification head is
            built in ``__init__``.
        embed_dims (list[int]): Channel width of each of the four stages.
        num_heads (list[int]): Attention heads per stage.
        mlp_ratios (list[int]): MLP hidden-dim ratio per stage.
        qkv_bias (bool): Forwarded to every ``Block``.
        qk_scale (float | None): Forwarded to every ``Block``.
        drop_rate (float): Dropout rate forwarded to every ``Block``.
        attn_drop_rate (float): Attention dropout forwarded to every ``Block``.
        drop_path_rate (float): Maximum stochastic-depth rate; per-block rates
            increase linearly from 0 to this value over all blocks.
        norm_layer (callable): Factory for the normalization layers.
        depths (list[int]): Number of blocks in each stage.
        sr_ratios (list[int]): ``sr_ratio`` forwarded to the blocks per stage.
        init_cfg (dict | None): Forwarded to ``BaseModule``.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
                 attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
                 depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        self.num_classes = num_classes
        # Store copies so the (shared) default lists are never aliased by an
        # instance attribute. ``embed_dims`` is kept for ``reset_classifier``.
        self.depths = list(depths)
        self.embed_dims = list(embed_dims)

        # patch_embed
        self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_chans=in_chans,
                                              embed_dims=embed_dims[0])
        self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2,
                                              in_chans=embed_dims[0], embed_dims=embed_dims[1])
        self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2,
                                              in_chans=embed_dims[1], embed_dims=embed_dims[2])
        self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2,
                                              in_chans=embed_dims[2], embed_dims=embed_dims[3])

        # transformer encoder
        # stochastic depth decay rule: one rate per block, linearly increasing
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        def make_stage(idx, start):
            # Blocks of stage ``idx``; ``start`` is the index of its first
            # block in the flat ``dpr`` schedule.
            return nn.ModuleList([Block(
                dim=embed_dims[idx], num_heads=num_heads[idx], mlp_ratio=mlp_ratios[idx], qkv_bias=qkv_bias,
                qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[start + i],
                norm_layer=norm_layer, sr_ratio=sr_ratios[idx])
                for i in range(depths[idx])])

        # Attribute names and creation order are kept (block1, norm1, ...,
        # block4, norm4) so state-dict keys and their order are unchanged.
        cur = 0
        self.block1 = make_stage(0, cur)
        self.norm1 = norm_layer(embed_dims[0])

        cur += depths[0]
        self.block2 = make_stage(1, cur)
        self.norm2 = norm_layer(embed_dims[1])

        cur += depths[1]
        self.block3 = make_stage(2, cur)
        self.norm3 = norm_layer(embed_dims[2])

        cur += depths[2]
        self.block4 = make_stage(3, cur)
        self.norm4 = norm_layer(embed_dims[3])

        # No classification head is created here; ``reset_classifier`` adds one.

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialize one sub-module (used with ``self.apply``)."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            # Normal init scaled by the per-group fan-out of the convolution.
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def reset_drop_path(self, drop_path_rate):
        """Re-assign per-block ``drop_prob`` from a new maximum rate.

        NOTE(review): blocks whose ``drop_path`` was built as ``nn.Identity``
        (rate 0 at construction) only get an unused attribute set here --
        confirm whether they should be swapped for ``DropPath``.
        """
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
        cur = 0
        stages = (self.block1, self.block2, self.block3, self.block4)
        for blocks, depth in zip(stages, self.depths):
            for i in range(depth):
                blocks[i].drop_path.drop_prob = dpr[cur + i]
            cur += depth

    def freeze_patch_emb(self):
        """Stop gradient updates for the first patch embedding."""
        # Assigning ``module.requires_grad = False`` only creates a plain
        # attribute; ``requires_grad_`` actually flips the flag on every
        # parameter of the sub-module.
        self.patch_embed1.requires_grad_(False)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'}  # has pos_embed may be better

    def get_classifier(self):
        """Return the classification head.

        ``self.head`` only exists after ``reset_classifier`` has been called.
        """
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        """Create a fresh head on top of the last stage's channel width."""
        self.num_classes = num_classes
        # The head consumes last-stage features, hence ``embed_dims[-1]``.
        self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        """Run all four stages and collect their outputs.

        Returns:
            list[Tensor]: One ``(B, C, H, W)`` feature map per stage.
        """
        B = x.shape[0]
        outs = []

        stages = (
            (self.patch_embed1, self.block1, self.norm1),
            (self.patch_embed2, self.block2, self.norm2),
            (self.patch_embed3, self.block3, self.norm3),
            (self.patch_embed4, self.block4, self.norm4),
        )
        for patch_embed, blocks, norm in stages:
            x, H, W = patch_embed(x)
            for blk in blocks:
                x = blk(x, H, W)
            x = norm(x)
            # tokens (B, H*W, C) -> feature map (B, C, H, W) for the next stage
            x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
            outs.append(x)

        return outs

    def forward(self, x):
        """Return the list of multi-scale feature maps (no head is applied)."""
        x = self.forward_features(x)
        return x
@BACKBONES.register_module()
class ResNetV1c(ResNet):
    r"""ResNetV1c variant described in `Bag of Tricks
    <https://arxiv.org/abs/1812.01187>`_.

    Compared with default ResNet(ResNetV1b), ResNetV1c replaces the 7x7 conv in
    the input stem with three 3x3 convs (``deep_stem=True``). Unlike ResNetV1d,
    it is built with ``avg_down=False``, so the 2x2 avg_pool that ResNetV1d adds
    before the conv in the downsampling block is not enabled here.
    """

    def __init__(self, **kwargs):
        # The two stem/downsample flags are fixed; everything else is
        # forwarded to the ``ResNet`` base class unchanged.
        super(ResNetV1c, self).__init__(
            deep_stem=True, avg_down=False, **kwargs)
def load_url_dist(url, model_dir=None):
    """Load a checkpoint from ``url``, letting local rank 0 go first.

    Local rank 0 loads the URL before the barrier; in a multi-process run the
    remaining ranks load it after the barrier.

    Args:
        url (str): Checkpoint URL passed to ``model_zoo.load_url``.
        model_dir (str | None): Forwarded to ``model_zoo.load_url``.

    Returns:
        The object returned by ``model_zoo.load_url``.
    """
    global_rank, world_size = get_dist_info()
    # LOCAL_RANK, when set, takes precedence over the global rank.
    local_rank = int(os.environ.get('LOCAL_RANK', global_rank))
    multi_process = world_size > 1

    def _fetch():
        return model_zoo.load_url(url, model_dir=model_dir)

    if local_rank == 0:
        checkpoint = _fetch()
    if multi_process:
        # Everyone waits here until rank 0 has finished its load.
        torch.distributed.barrier()
        if local_rank > 0:
            checkpoint = _fetch()
    return checkpoint
def load_fileclient_dist(filename, backend, map_location):
    """Load a checkpoint through an mmcv ``FileClient``, local rank 0 first.

    Args:
        filename (str): Path handed to ``FileClient.get``.
        backend (str): File-client backend; only ``'ceph'`` is accepted.
        map_location: Forwarded to ``torch.load``.

    Returns:
        The object returned by ``torch.load``.

    Raises:
        ValueError: If ``backend`` is not an allowed backend.
    """
    global_rank, world_size = get_dist_info()
    local_rank = int(os.environ.get('LOCAL_RANK', global_rank))

    supported = ['ceph']
    if backend not in supported:
        raise ValueError(f'Load from Backend {backend} is not supported.')

    def _read():
        # Pull the raw bytes through the client and deserialize in memory.
        client = FileClient(backend=backend)
        stream = io.BytesIO(client.get(filename))
        return torch.load(stream, map_location=map_location)

    if local_rank == 0:
        checkpoint = _read()
    if world_size > 1:
        torch.distributed.barrier()
        if local_rank > 0:
            checkpoint = _read()
    return checkpoint
def _process_mmcls_checkpoint(checkpoint):
    """Keep only the ``backbone.*`` weights of a checkpoint, prefix removed.

    Args:
        checkpoint (dict): Must contain a ``'state_dict'`` mapping.

    Returns:
        dict: ``{'state_dict': OrderedDict}`` holding the entries whose key
        started with ``'backbone.'``, re-keyed without that prefix and in
        their original order.
    """
    prefix = 'backbone.'
    backbone_weights = OrderedDict(
        (key[len(prefix):], value)
        for key, value in checkpoint['state_dict'].items()
        if key.startswith(prefix))
    return {'state_dict': backbone_weights}
""" if filename.startswith('modelzoo://'): warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' 'use "torchvision://" instead') model_urls = get_torchvision_models() model_name = filename[11:] checkpoint = load_url_dist(model_urls[model_name]) elif filename.startswith('torchvision://'): model_urls = get_torchvision_models() model_name = filename[14:] checkpoint = load_url_dist(model_urls[model_name]) elif filename.startswith('open-mmlab://'): model_urls = get_external_models() model_name = filename[13:] deprecated_urls = get_deprecated_model_names() if model_name in deprecated_urls: warnings.warn(f'open-mmlab://{model_name} is deprecated in favor ' f'of open-mmlab://{deprecated_urls[model_name]}') model_name = deprecated_urls[model_name] model_url = model_urls[model_name] # check if is url if model_url.startswith(('http://', 'https://')): checkpoint = load_url_dist(model_url) else: filename = osp.join(_get_mmcv_home(), model_url) if not osp.isfile(filename): raise IOError(f'{filename} is not a checkpoint file') checkpoint = torch.load(filename, map_location=map_location) elif filename.startswith('mmcls://'): model_urls = get_mmcls_models() model_name = filename[8:] checkpoint = load_url_dist(model_urls[model_name]) checkpoint = _process_mmcls_checkpoint(checkpoint) elif filename.startswith(('http://', 'https://')): checkpoint = load_url_dist(filename) elif filename.startswith('pavi://'): model_path = filename[7:] checkpoint = load_pavimodel_dist(model_path, map_location=map_location) elif filename.startswith('s3://'): checkpoint = load_fileclient_dist( filename, backend='ceph', map_location=map_location) else: if not osp.isfile(filename): raise IOError(f'{filename} is not a checkpoint file') checkpoint = torch.load(filename, map_location=map_location) return checkpoint def load_checkpoint(model, filename, map_location='cpu', strict=False, logger=None): """Load checkpoint from a file or URI. Args: model (Module): Module to load checkpoint. 
filename (str): Accept local filepath, URL, ``torchvision://xxx``, ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for details. map_location (str): Same as :func:`torch.load`. strict (bool): Whether to allow different params for the model and checkpoint. logger (:mod:`logging.Logger` or None): The logger for error message. Returns: dict or OrderedDict: The loaded checkpoint. """ checkpoint = _load_checkpoint(filename, map_location) # OrderedDict is a subclass of dict if not isinstance(checkpoint, dict): raise RuntimeError( f'No state_dict found in checkpoint file {filename}') # get state_dict from checkpoint if 'state_dict' in checkpoint: state_dict = checkpoint['state_dict'] elif 'model' in checkpoint: state_dict = checkpoint['model'] else: state_dict = checkpoint # strip prefix of state_dict if list(state_dict.keys())[0].startswith('module.'): state_dict = {k[7:]: v for k, v in state_dict.items()} # reshape absolute position embedding if state_dict.get('absolute_pos_embed') is not None: absolute_pos_embed = state_dict['absolute_pos_embed'] N1, L, C1 = absolute_pos_embed.size() N2, C2, H, W = model.absolute_pos_embed.size() if N1 != N2 or C1 != C2 or L != H * W: logger.warning('Error in loading absolute_pos_embed, pass') else: state_dict['absolute_pos_embed'] = absolute_pos_embed.view( N2, H, W, C2).permute(0, 3, 1, 2) # interpolate position bias table if needed relative_position_bias_table_keys = [ k for k in state_dict.keys() if 'relative_position_bias_table' in k ] for table_key in relative_position_bias_table_keys: table_pretrained = state_dict[table_key] table_current = model.state_dict()[table_key] L1, nH1 = table_pretrained.size() L2, nH2 = table_current.size() if nH1 != nH2: logger.warning(f'Error in loading {table_key}, pass') else: if L1 != L2: S1 = int(L1**0.5) S2 = int(L2**0.5) table_pretrained_resized = F.interpolate( table_pretrained.permute(1, 0).view(1, nH1, S1, S1), size=(S2, S2), mode='bicubic') state_dict[table_key] = 
table_pretrained_resized.view( nH2, L2).permute(1, 0) # load state_dict load_state_dict(model, state_dict, strict, logger) return checkpoint def weights_to_cpu(state_dict): """Copy a model state_dict to cpu. Args: state_dict (OrderedDict): Model weights on GPU. Returns: OrderedDict: Model weights on GPU. """ state_dict_cpu = OrderedDict() for key, val in state_dict.items(): state_dict_cpu[key] = val.cpu() return state_dict_cpu def _save_to_state_dict(module, destination, prefix, keep_vars): """Saves module state to `destination` dictionary. This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. Args: module (nn.Module): The module to generate state_dict. destination (dict): A dict where state will be stored. prefix (str): The prefix for parameters and buffers used in this module. """ for name, param in module._parameters.items(): if param is not None: destination[prefix + name] = param if keep_vars else param.detach() for name, buf in module._buffers.items(): # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d if buf is not None: destination[prefix + name] = buf if keep_vars else buf.detach() def get_state_dict(module, destination=None, prefix='', keep_vars=False): """Returns a dictionary containing a whole state of the module. Both parameters and persistent buffers (e.g. running averages) are included. Keys are corresponding parameter and buffer names. This method is modified from :meth:`torch.nn.Module.state_dict` to recursively check parallel module in case that the model has a complicated structure, e.g., nn.Module(nn.Module(DDP)). Args: module (nn.Module): The module to generate state_dict. destination (OrderedDict): Returned dict for the state of the module. prefix (str): Prefix of the key. keep_vars (bool): Whether to keep the variable property of the parameters. Default: False. Returns: dict: A dictionary containing a whole state of the module. 
""" # recursively check parallel module in case that the model has a # complicated structure, e.g., nn.Module(nn.Module(DDP)) if is_module_wrapper(module): module = module.module # below is the same as torch.nn.Module.state_dict() if destination is None: destination = OrderedDict() destination._metadata = OrderedDict() destination._metadata[prefix[:-1]] = local_metadata = dict( version=module._version) _save_to_state_dict(module, destination, prefix, keep_vars) for name, child in module._modules.items(): if child is not None: get_state_dict( child, destination, prefix + name + '.', keep_vars=keep_vars) for hook in module._state_dict_hooks.values(): hook_result = hook(module, destination, prefix, local_metadata) if hook_result is not None: destination = hook_result return destination def save_checkpoint(model, filename, optimizer=None, meta=None): """Save checkpoint to file. The checkpoint will have 3 fields: ``meta``, ``state_dict`` and ``optimizer``. By default ``meta`` will contain version and time info. Args: model (Module): Module whose params are to be saved. filename (str): Checkpoint filename. optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. meta (dict, optional): Metadata to be saved in checkpoint. 
""" if meta is None: meta = {} elif not isinstance(meta, dict): raise TypeError(f'meta must be a dict or None, but got {type(meta)}') meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) if is_module_wrapper(model): model = model.module if hasattr(model, 'CLASSES') and model.CLASSES is not None: # save class name to the meta meta.update(CLASSES=model.CLASSES) checkpoint = { 'meta': meta, 'state_dict': weights_to_cpu(get_state_dict(model)) } # save optimizer state dict in the checkpoint if isinstance(optimizer, Optimizer): checkpoint['optimizer'] = optimizer.state_dict() elif isinstance(optimizer, dict): checkpoint['optimizer'] = {} for name, optim in optimizer.items(): checkpoint['optimizer'][name] = optim.state_dict() if filename.startswith('pavi://'): try: from pavi import modelcloud from pavi.exception import NodeNotFoundError except ImportError: raise ImportError( 'Please install pavi to load checkpoint from modelcloud.') model_path = filename[7:] root = modelcloud.Folder() model_dir, model_name = osp.split(model_path) try: model = modelcloud.get(model_dir) except NodeNotFoundError: model = root.create_training_model(model_dir) with TemporaryDirectory() as tmp_dir: checkpoint_file = osp.join(tmp_dir, model_name) with open(checkpoint_file, 'wb') as f: torch.save(checkpoint, f) f.flush() model.create_file(checkpoint_file, name=model_name) else: mmcv.mkdir_or_exist(osp.dirname(filename)) # immediately flush buffer with open(filename, 'wb') as f: torch.save(checkpoint, f) f.flush() ================================================ FILE: swin/swin_transformer.py ================================================ # -------------------------------------------------------- # Swin Transformer # Copyright (c) 2021 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ze Liu, Yutong Lin, Yixuan Wei # -------------------------------------------------------- import numpy as np import torch import torch.nn as nn import torch.nn.functional 
def window_reverse(windows, window_size, H, W):
    """Merge windows back into a feature map (inverse of ``window_partition``).

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    # Batch size = total windows / windows per image.
    windows_per_image = H * W / window_size / window_size
    batch = int(windows.shape[0] / windows_per_image)
    grid = windows.view(batch, H // window_size, W // window_size,
                        window_size, window_size, -1)
    # Put each window-row index next to its in-window row (same for columns)
    # so the final view flattens them into the H and W axes.
    grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
    return grid.view(batch, H, W, -1)
Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 proj_drop (float, optional): Dropout ratio of output. Default: 0.0 """ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = torch.arange(self.window_size[0]) coords_w = torch.arange(self.window_size[1]) coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute( 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww self.register_buffer('relative_position_index', relative_position_index) self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) trunc_normal_(self.relative_position_bias_table, std=.02) self.softmax = nn.Softmax(dim=-1) def forward(self, x, mask=None): """Forward function. 
class SwinTransformerBlock(nn.Module):
    """One Swin block: (shifted-)window attention followed by an MLP.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=7,
                 shift_size=0,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, 'shift_size must in 0-window_size'

        # Attention sub-layer.
        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop)

        # Stochastic depth is only instantiated for a positive rate.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # Feed-forward sub-layer.
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop)

        # Spatial size of the incoming tokens; must be set before forward().
        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C), where H and W are
                read from ``self.H`` / ``self.W``.
            mask_matrix: Attention mask for cyclic shift; only used when
                ``shift_size > 0``.
        """
        batch, seq_len, channels = x.shape
        height, width = self.H, self.W
        assert seq_len == height * width, 'input feature has wrong size'

        ws, shift = self.window_size, self.shift_size
        use_shift = shift > 0

        residual = x
        feat = self.norm1(x).view(batch, height, width, channels)

        # pad feature maps (right / bottom only) to multiples of window size
        pad_right = (ws - width % ws) % ws
        pad_bottom = (ws - height % ws) % ws
        feat = F.pad(feat, (0, 0, 0, pad_right, 0, pad_bottom))
        padded_h, padded_w = feat.shape[1], feat.shape[2]

        # cyclic shift
        if use_shift:
            feat = torch.roll(feat, shifts=(-shift, -shift), dims=(1, 2))
        attn_mask = mask_matrix if use_shift else None

        # partition windows -> W-MSA/SW-MSA -> merge windows
        windows = window_partition(feat, ws).view(-1, ws * ws, channels)
        windows = self.attn(windows, mask=attn_mask)
        windows = windows.view(-1, ws, ws, channels)
        feat = window_reverse(windows, ws, padded_h, padded_w)

        # reverse cyclic shift
        if use_shift:
            feat = torch.roll(feat, shifts=(shift, shift), dims=(1, 2))

        # drop the padded border again
        if pad_right > 0 or pad_bottom > 0:
            feat = feat[:, :height, :width, :].contiguous()
        feat = feat.view(batch, height * width, channels)

        # residual connections around attention and FFN
        x = residual + self.drop_path(feat)
        return x + self.drop_path(self.mlp(self.norm2(x)))
""" B, L, C = x.shape assert L == H * W, 'input feature has wrong size' x = x.view(B, H, W, C) # padding pad_input = (H % 2 == 1) or (W % 2 == 1) if pad_input: x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C x = self.norm(x) x = self.reduction(x) return x class BasicLayer(nn.Module): """A basic Swin Transformer layer for one stage. Args: dim (int): Number of feature channels depth (int): Depths of this stage. num_heads (int): Number of attention head. window_size (int): Local window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None with_cp (bool): Whether to use checkpointing to save memory. Default: False. 
""" def __init__(self, dim, depth, num_heads, window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None, with_cp=False): super().__init__() self.window_size = window_size self.shift_size = window_size // 2 self.depth = depth self.with_cp = with_cp # build blocks self.blocks = nn.ModuleList([ SwinTransformerBlock( dim=dim, num_heads=num_heads, window_size=window_size, shift_size=0 if (i % 2 == 0) else window_size // 2, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, norm_layer=norm_layer) for i in range(depth) ]) # patch merging layer if downsample is not None: self.downsample = downsample(dim=dim, norm_layer=norm_layer) else: self.downsample = None def forward(self, x, H, W): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. """ # calculate attention mask for SW-MSA Hp = int(np.ceil(H / self.window_size)) * self.window_size Wp = int(np.ceil(W / self.window_size)) * self.window_size img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 h_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) w_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 mask_windows = window_partition( img_mask, self.window_size) # nW, window_size, window_size, 1 mask_windows = mask_windows.view(-1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( attn_mask == 0, float(0.0)) attn_mask = attn_mask.to(dtype=x.dtype) for blk in self.blocks: blk.H, blk.W = H, W if self.with_cp: x = 
class PatchEmbed(nn.Module):
    """Image to patch embedding via a strided convolution.

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dims (int): Number of linear projection output channels.
            Default: 96.
        norm_layer (nn.Module, optional): Normalization layer.
            Default: None
    """

    def __init__(self,
                 patch_size=4,
                 in_chans=3,
                 embed_dims=96,
                 norm_layer=None):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)
        self.in_chans = in_chans
        self.embed_dims = embed_dims

        # Kernel size equals stride, so patches do not overlap.
        self.proj = nn.Conv2d(
            in_chans,
            embed_dims,
            kernel_size=self.patch_size,
            stride=self.patch_size)
        self.norm = None if norm_layer is None else norm_layer(embed_dims)

    def forward(self, x):
        """Embed an image batch; the result keeps a 4-D (B, C, Wh, Ww) layout."""
        patch_h, patch_w = self.patch_size
        _, _, H, W = x.size()

        # Zero-pad on the right / bottom until each side is a multiple of
        # the patch size.
        if W % patch_w != 0:
            x = F.pad(x, (0, patch_w - W % patch_w))
        if H % patch_h != 0:
            x = F.pad(x, (0, 0, 0, patch_h - H % patch_h))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is None:
            return x

        # The norm acts on the channel axis, so go through a token layout
        # (B, Wh*Ww, C) and restore the map afterwards.
        Wh, Ww = x.size(2), x.size(3)
        tokens = self.norm(x.flatten(2).transpose(1, 2))
        return tokens.transpose(1, 2).view(-1, self.embed_dims, Wh, Ww)
depths (tuple[int]): Depths of each Swin Transformer stage. num_heads (tuple[int]): Number of attention head of each stage. window_size (int): Window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. drop_rate (float): Dropout rate. attn_drop_rate (float): Attention dropout rate. Default: 0. drop_path_rate (float): Stochastic depth rate. Default: 0.2. norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. patch_norm (bool): If True, add normalization after patch embedding. Default: True. out_indices (Sequence[int]): Output from which stages. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. with_cp (bool): Whether to use checkpointing to save memory. Default: False. 
""" def __init__(self, pretrain_img_size=224, patch_size=4, in_chans=3, embed_dims=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, norm_layer=nn.LayerNorm, use_abs_pos_embed=False, patch_norm=True, out_indices=(0, 1, 2, 3), frozen_stages=-1, with_cp=False, output_img=False, pretrained=None): super().__init__() self.output_img = output_img self.pretrain_img_size = pretrain_img_size self.num_layers = len(depths) self.embed_dims = embed_dims self.ape = use_abs_pos_embed self.patch_norm = patch_norm self.out_indices = out_indices self.frozen_stages = frozen_stages self.pretrained = pretrained # split image into non-overlapping patches self.patch_embed = PatchEmbed( patch_size=patch_size, in_chans=in_chans, embed_dims=embed_dims, norm_layer=norm_layer if self.patch_norm else None) # absolute position embedding if self.ape: pretrain_img_size = to_2tuple(pretrain_img_size) patch_size = to_2tuple(patch_size) patches_resolution = [ pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1] ] self.absolute_pos_embed = nn.Parameter( torch.zeros(1, embed_dims, patches_resolution[0], patches_resolution[1])) trunc_normal_(self.absolute_pos_embed, std=.02) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) ] # stochastic depth decay rule # build layers self.layers = nn.ModuleList() for i_layer in range(self.num_layers): layer = BasicLayer( dim=int(embed_dims * 2**i_layer), depth=depths[i_layer], num_heads=num_heads[i_layer], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, with_cp=with_cp) self.layers.append(layer) num_features = 
def init_weights(self, pretrained=None):
    """Initialize the weights in backbone.

    All Linear / LayerNorm modules are re-initialized first; when a
    checkpoint path is available it is then loaded non-strictly on top.

    Args:
        pretrained (str, optional): Path to pre-trained weights. Falls back
            to the path given at construction time when None.
            Defaults to None.

    Raises:
        TypeError: If the resolved ``pretrained`` is neither str nor None.
    """
    if pretrained is None:
        pretrained = self.pretrained

    # Reject unsupported values before touching any weight.
    if pretrained is not None and not isinstance(pretrained, str):
        raise TypeError('pretrained must be a str or None')

    def _reset(module):
        if isinstance(module, nn.LayerNorm):
            nn.init.constant_(module.bias, 0)
            nn.init.constant_(module.weight, 1.0)
        elif isinstance(module, nn.Linear):
            trunc_normal_(module.weight, std=.02)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    self.apply(_reset)

    if pretrained is not None:
        logger = get_root_logger()
        load_checkpoint(self, pretrained, strict=False, logger=logger)
class WindowMSA(BaseModule):
    """Window based multi-head self-attention (W-MSA) module with relative
    position bias.

    Args:
        embed_dims (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (tuple[int]): The height and width of the window.
        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
            Default: True.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        attn_drop_rate (float, optional): Dropout ratio of attention weight.
            Default: 0.0
        proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.
        init_cfg (dict | None, optional): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 window_size,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop_rate=0.,
                 proj_drop_rate=0.,
                 init_cfg=None):
        # Hand init_cfg to BaseModule instead of assigning it afterwards.
        super().__init__(init_cfg=init_cfg)
        self.embed_dims = embed_dims
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_embed_dims = embed_dims // num_heads
        self.scale = qk_scale or head_embed_dims**-0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
                        num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # Index into the bias table for every (query, key) pair of positions
        # inside a window. About 2x faster than original impl.
        Wh, Ww = self.window_size
        rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww)
        rel_position_index = rel_index_coords + rel_index_coords.T
        rel_position_index = rel_position_index.flip(1).contiguous()
        self.register_buffer('relative_position_index', rel_position_index)

        self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop_rate)
        self.proj = nn.Linear(embed_dims, embed_dims)
        self.proj_drop = nn.Dropout(proj_drop_rate)

        self.softmax = nn.Softmax(dim=-1)

    def init_weights(self):
        """Draw the relative position bias table from a truncated normal.

        ``trunc_normal_init`` initializes ``module.weight`` / ``module.bias``
        and therefore does nothing when handed a bare Parameter; the
        tensor-level ``trunc_normal_`` is used so the table really is
        initialized.
        """
        from mmcv.cnn.utils.weight_init import trunc_normal_

        trunc_normal_(self.relative_position_bias_table, std=0.02)

    def forward(self, x, mask=None):
        """
        Args:
            x (tensor): input features with shape of (num_windows*B, N, C)
            mask (tensor | None, Optional): mask with shape of (num_windows,
                Wh*Ww, Wh*Ww), value should be between (-inf, 0].

        Returns:
            tensor: attended features with shape of (num_windows*B, N, C).
        """
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
        # make torchscript happy (cannot use tensor as tuple)
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)].view(
                self.window_size[0] * self.window_size[1],
                self.window_size[0] * self.window_size[1],
                -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(
            2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            # The leading dim of attn is (batch * num_windows); expose the
            # window axis so the per-window mask broadcasts over batch and
            # heads, then fold it back.
            nW = mask.shape[0]
            attn = attn.view(B // nW, nW, self.num_heads, N,
                             N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
        attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    @staticmethod
    def double_step_seq(step1, len1, step2, len2):
        """Return all sums ``i*step1 + j*step2`` (i < len1, j < len2) as a
        (1, len1*len2) tensor, with ``j`` varying fastest."""
        seq1 = torch.arange(0, step1 * len1, step1)
        seq2 = torch.arange(0, step2 * len2, step2)
        return (seq1[:, None] + seq2[None, :]).reshape(1, -1)
def forward(self, query, hw_shape):
    """Apply (shifted-)window attention to a flattened feature map.

    Args:
        query (tensor): Tokens with shape (B, H*W, C).
        hw_shape (tuple[int]): Spatial size (H, W) of the token grid.

    Returns:
        tensor: Output tokens with shape (B, H*W, C), after the module's
        dropout layer.
    """
    batch, seq_len, channels = query.shape
    height, width = hw_shape
    assert seq_len == height * width, 'input feature has wrong size'
    win, shift = self.window_size, self.shift_size

    feat = query.view(batch, height, width, channels)

    # pad feature maps to multiples of window size
    pad_r = (win - width % win) % win
    pad_b = (win - height % win) % win
    feat = F.pad(feat, (0, 0, 0, pad_r, 0, pad_b))
    padded_h, padded_w = feat.shape[1], feat.shape[2]

    attn_mask = None
    if shift > 0:
        # cyclic shift
        feat = torch.roll(feat, shifts=(-shift, -shift), dims=(1, 2))

        # Label the 3x3 regions produced by the shift; tokens carrying
        # different labels must not attend to each other.
        region_map = torch.zeros((1, padded_h, padded_w, 1),
                                 device=query.device)
        bands = (slice(0, -win), slice(-win, -shift), slice(-shift, None))
        band_pairs = [(rows, cols) for rows in bands for cols in bands]
        for region_id, (rows, cols) in enumerate(band_pairs):
            region_map[:, rows, cols, :] = region_id

        # nW, window_size * window_size
        region_windows = self.window_partition(region_map).view(
            -1, win * win)
        id_diff = region_windows.unsqueeze(1) - region_windows.unsqueeze(2)
        attn_mask = id_diff.masked_fill(id_diff != 0,
                                        float(-100.0)).masked_fill(
                                            id_diff == 0, float(0.0))

    # nW*B, window_size*window_size, C
    windows = self.window_partition(feat).view(-1, win**2, channels)

    # W-MSA/SW-MSA (nW*B, window_size*window_size, C)
    attended = self.w_msa(windows, mask=attn_mask)

    # merge windows back into a B H' W' C map
    attended = attended.view(-1, win, win, channels)
    merged = self.window_reverse(attended, padded_h, padded_w)

    # reverse cyclic shift
    if shift > 0:
        merged = torch.roll(merged, shifts=(shift, shift), dims=(1, 2))

    # drop the padding added above
    if pad_r > 0 or pad_b:
        merged = merged[:, :height, :width, :].contiguous()

    return self.drop(merged.view(batch, height * width, channels))
def __init__(self,
             embed_dims,
             num_heads,
             feedforward_channels,
             window_size=7,
             shift=False,
             qkv_bias=True,
             qk_scale=None,
             drop_rate=0.,
             attn_drop_rate=0.,
             drop_path_rate=0.,
             act_cfg=dict(type='GELU'),
             norm_cfg=dict(type='LN'),
             with_cp=False,
             init_cfg=None):
    """Assemble norm -> (shifted) window attention -> norm -> FFN.

    See the class docstring for the meaning of each argument.
    """
    super().__init__()

    self.init_cfg = init_cfg
    self.with_cp = with_cp

    # Half a window of shift for "shifted" blocks, none otherwise.
    shift_size = window_size // 2 if shift else 0

    def _drop_path_cfg():
        # A fresh dict per consumer, as each sub-module gets its own config.
        return dict(type='DropPath', drop_prob=drop_path_rate)

    # Sub-modules are created in the order norm1, attn, norm2, ffn so the
    # registration (state-dict) order is unchanged.
    self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
    self.attn = ShiftWindowMSA(
        embed_dims=embed_dims,
        num_heads=num_heads,
        window_size=window_size,
        shift_size=shift_size,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop_rate=attn_drop_rate,
        proj_drop_rate=drop_rate,
        dropout_layer=_drop_path_cfg(),
        init_cfg=None)

    self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
    self.ffn = FFN(
        embed_dims=embed_dims,
        feedforward_channels=feedforward_channels,
        num_fcs=2,
        ffn_drop=drop_rate,
        dropout_layer=_drop_path_cfg(),
        act_cfg=act_cfg,
        add_identity=True,
        init_cfg=None)
class SwinBlockSequence(BaseModule):
    """One stage of the Swin Transformer: ``depth`` blocks that alternate
    between regular and shifted windows, optionally followed by a
    downsample module.

    Args:
        embed_dims (int): The feature dimension.
        num_heads (int): Parallel attention heads.
        feedforward_channels (int): The hidden dimension for FFNs.
        depth (int): The number of blocks in this stage.
        window_size (int, optional): The local window scale. Default: 7.
        qkv_bias (bool, optional): enable bias for qkv if True. Default: True.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        drop_rate (float, optional): Dropout rate. Default: 0.
        attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
        drop_path_rate (float | list[float], optional): Stochastic depth
            rate. Default: 0.
        downsample (BaseModule | None, optional): The downsample operation
            module. Default: None.
        act_cfg (dict, optional): The config dict of activation function.
            Default: dict(type='GELU').
        norm_cfg (dict, optional): The config dict of normalization.
            Default: dict(type='LN').
        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
            will save some memory while slowing down the training speed.
            Default: False.
        init_cfg (dict | list | None, optional): The init config.
            Default: None.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 feedforward_channels,
                 depth,
                 window_size=7,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 downsample=None,
                 act_cfg=dict(type='GELU'),
                 norm_cfg=dict(type='LN'),
                 with_cp=False,
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)

        # Normalize the stochastic-depth setting to one rate per block.
        if not isinstance(drop_path_rate, list):
            rates = [deepcopy(drop_path_rate) for _ in range(depth)]
        else:
            rates = drop_path_rate
            assert len(rates) == depth

        self.blocks = ModuleList()
        for index, rate in zip(range(depth), rates):
            self.blocks.append(
                SwinBlock(
                    embed_dims=embed_dims,
                    num_heads=num_heads,
                    feedforward_channels=feedforward_channels,
                    window_size=window_size,
                    # odd-indexed blocks use shifted windows
                    shift=index % 2 != 0,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop_rate=drop_rate,
                    attn_drop_rate=attn_drop_rate,
                    drop_path_rate=rate,
                    act_cfg=act_cfg,
                    norm_cfg=norm_cfg,
                    with_cp=with_cp,
                    init_cfg=None))

        self.downsample = downsample

    def forward(self, x, hw_shape):
        """Run every block, then the downsample module when one is set.

        Returns:
            tuple: ``(x_next, hw_next, x_out, hw_out)``; the first pair is
            the input for the following stage, the second pair is this
            stage's output before downsampling. Both pairs are the same
            when there is no downsample module.
        """
        for blk in self.blocks:
            x = blk(x, hw_shape)

        if not self.downsample:
            return x, hw_shape, x, hw_shape

        x_down, down_hw_shape = self.downsample(x, hw_shape)
        return x_down, down_hw_shape, x, hw_shape
Default: (3, 6, 12, 24). strides (tuple[int]): The patch merging or patch embedding stride of each Swin Transformer stage. (In swin, we set kernel size equal to stride.) Default: (4, 2, 2, 2). out_indices (tuple[int]): Output from which stages. Default: (0, 1, 2, 3). qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. patch_norm (bool): If add a norm layer for patch embed and patch merging. Default: True. drop_rate (float): Dropout rate. Defaults: 0. attn_drop_rate (float): Attention dropout rate. Default: 0. drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. use_abs_pos_embed (bool): If True, add absolute position embedding to the patch embedding. Defaults: False. act_cfg (dict): Config dict for activation layer. Default: dict(type='GELU'). norm_cfg (dict): Config dict for normalization layer at output of backbone. Defaults: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. pretrained (str, optional): model pretrained path. Default: None. convert_weights (bool): The flag indicates whether the pre-trained model is from the original repo. We may need to convert some keys to make it compatible. Default: False. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. init_cfg (dict, optional): The Config for initialization. Defaults to None.
""" def __init__(self, pretrain_img_size=224, in_channels=3, embed_dims=96, patch_size=4, window_size=7, mlp_ratio=4, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), strides=(4, 2, 2, 2), out_indices=(0, 1, 2, 3), qkv_bias=True, qk_scale=None, patch_norm=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, use_abs_pos_embed=False, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), with_cp=False, pretrained=None, convert_weights=False, frozen_stages=-1, init_cfg=None): self.convert_weights = convert_weights self.frozen_stages = frozen_stages if isinstance(pretrain_img_size, int): pretrain_img_size = to_2tuple(pretrain_img_size) elif isinstance(pretrain_img_size, tuple): if len(pretrain_img_size) == 1: pretrain_img_size = to_2tuple(pretrain_img_size[0]) assert len(pretrain_img_size) == 2, \ f'The size of image should have length 1 or 2, ' \ f'but got {len(pretrain_img_size)}' assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be specified at the same time' if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) elif pretrained is None: self.init_cfg = init_cfg else: raise TypeError('pretrained must be a str or None') super(SwinTransformer, self).__init__(init_cfg=init_cfg) num_layers = len(depths) self.out_indices = out_indices self.use_abs_pos_embed = use_abs_pos_embed assert strides[0] == patch_size, 'Use non-overlapping patch embed.' 
self.patch_embed = PatchEmbed( in_channels=in_channels, embed_dims=embed_dims, conv_type='Conv2d', kernel_size=patch_size, stride=strides[0], norm_cfg=norm_cfg if patch_norm else None, init_cfg=None) if self.use_abs_pos_embed: patch_row = pretrain_img_size[0] // patch_size patch_col = pretrain_img_size[1] // patch_size num_patches = patch_row * patch_col self.absolute_pos_embed = nn.Parameter( torch.zeros((1, num_patches, embed_dims))) self.drop_after_pos = nn.Dropout(p=drop_rate) # set stochastic depth decay rule total_depth = sum(depths) dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, total_depth) ] self.stages = ModuleList() in_channels = embed_dims for i in range(num_layers): if i < num_layers - 1: downsample = PatchMerging( in_channels=in_channels, out_channels=2 * in_channels, stride=strides[i + 1], norm_cfg=norm_cfg if patch_norm else None, init_cfg=None) else: downsample = None stage = SwinBlockSequence( embed_dims=in_channels, num_heads=num_heads[i], feedforward_channels=mlp_ratio * in_channels, depth=depths[i], window_size=window_size, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], downsample=downsample, act_cfg=act_cfg, norm_cfg=norm_cfg, with_cp=with_cp, init_cfg=None) self.stages.append(stage) if downsample: in_channels = downsample.out_channels self.num_features = [int(embed_dims * 2 ** i) for i in range(num_layers)] # Add a norm layer for each output for i in out_indices: layer = build_norm_layer(norm_cfg, self.num_features[i])[1] layer_name = f'norm{i}' self.add_module(layer_name, layer) def train(self, mode=True): """Convert the model into training mode while keep layers freezed.""" super(SwinTransformer, self).train(mode) self._freeze_stages() def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.requires_grad = False if self.use_abs_pos_embed: 
def init_weights(self):
    """Initialize the backbone from scratch or from a checkpoint.

    Without an ``init_cfg`` all Linear / LayerNorm modules (and the
    absolute position embedding, if used) are re-initialized. Otherwise the
    checkpoint named by ``init_cfg['checkpoint']`` is loaded non-strictly,
    after key-prefix handling, optional key conversion and resizing of
    relative position bias tables.
    """
    logger = get_root_logger()
    if self.init_cfg is None:
        logger.warning(f'No pre-trained weights for '
                       f'{self.__class__.__name__}, '
                       f'training start from scratch')
        # mmcv's ``*_init`` helpers act on ``module.weight`` /
        # ``module.bias``; handing them a bare Parameter does nothing, so
        # they are called with the module and the raw embedding parameter
        # uses the tensor-level ``trunc_normal_``.
        from mmcv.cnn.utils.weight_init import trunc_normal_

        if self.use_abs_pos_embed:
            trunc_normal_(self.absolute_pos_embed, std=0.02)
        for m in self.modules():
            if isinstance(m, nn.Linear):
                trunc_normal_init(m, std=.02, bias=0.)
            elif isinstance(m, nn.LayerNorm):
                constant_init(m, 1.0, bias=0.)
        return

    assert 'checkpoint' in self.init_cfg, f'Only support ' \
                                          f'specify `Pretrained` in ' \
                                          f'`init_cfg` in ' \
                                          f'{self.__class__.__name__} '
    # Item access works for plain dicts as well as mmcv ConfigDicts.
    ckpt = _load_checkpoint(
        self.init_cfg['checkpoint'], logger=logger, map_location='cpu')
    if 'state_dict' in ckpt:
        _state_dict = ckpt['state_dict']
    elif 'model' in ckpt:
        _state_dict = ckpt['model']
    else:
        _state_dict = ckpt

    # Keep only the backbone weights of a full-model checkpoint.
    prefix = 'backbone.'
    state_dict = OrderedDict(
        (k[len(prefix):], v) for k, v in _state_dict.items()
        if k.startswith(prefix))
    if not state_dict:
        # Nothing carries the prefix: presumably a backbone-only
        # checkpoint. Use its keys as they are rather than failing on an
        # empty dict further down.
        logger.warning(f'No key with prefix "{prefix}" found in the '
                       f'checkpoint, loading its keys unchanged')
        state_dict = OrderedDict(_state_dict)

    if self.convert_weights:
        # supported loading weight from original repo,
        state_dict = swin_converter(state_dict)

    # strip prefix of state_dict
    if state_dict and next(iter(state_dict)).startswith('module.'):
        state_dict = {k[7:]: v for k, v in state_dict.items()}

    # The model stores the absolute position embedding as
    # (1, num_patches, embed_dims); it can only be loaded when the
    # checkpoint tensor has exactly that shape.
    ckpt_pos_embed = state_dict.get('absolute_pos_embed')
    if ckpt_pos_embed is not None:
        if not self.use_abs_pos_embed:
            state_dict.pop('absolute_pos_embed')
        elif ckpt_pos_embed.size() != self.absolute_pos_embed.size():
            logger.warning('Error in loading absolute_pos_embed, pass')
            state_dict.pop('absolute_pos_embed')

    # interpolate position bias table if needed
    model_state = self.state_dict()
    relative_position_bias_table_keys = [
        k for k in state_dict if 'relative_position_bias_table' in k
    ]
    for table_key in relative_position_bias_table_keys:
        table_current = model_state.get(table_key)
        if table_current is None:
            # Not a parameter of this model; left for the non-strict load
            # to report as unexpected.
            continue
        table_pretrained = state_dict[table_key]
        L1, nH1 = table_pretrained.size()
        L2, nH2 = table_current.size()
        if nH1 != nH2:
            logger.warning(f'Error in loading {table_key}, pass')
        elif L1 != L2:
            # Tables are flattened (2*Wh-1) x (2*Ww-1) grids; resize the
            # pre-trained grid to the current window size.
            S1 = int(L1**0.5)
            S2 = int(L2**0.5)
            table_pretrained_resized = F.interpolate(
                table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1),
                size=(S2, S2),
                mode='bicubic')
            state_dict[table_key] = table_pretrained_resized.view(
                nH2, L2).permute(1, 0).contiguous()

    # load state_dict
    self.load_state_dict(state_dict, False)
downsample (BaseModule | None, optional): The downsample operation module. Default: None. act_cfg (dict, optional): The config dict of activation function. Default: dict(type='GELU'). norm_cfg (dict, optional): The config dict of normalization. Default: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. init_cfg (dict | list | None, optional): The init config. Default: None. """ def __init__(self, embed_dims, num_heads, feedforward_channels, depth, window_size=7, qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., downsample=None, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), with_cp=False, # Added rfp_inplanes=None, # Added Done init_cfg=None): super().__init__(init_cfg=init_cfg) if isinstance(drop_path_rate, list): drop_path_rates = drop_path_rate assert len(drop_path_rates) == depth else: drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)] self.blocks = ModuleList() for i in range(depth): block = SwinBlock( embed_dims=embed_dims, num_heads=num_heads, feedforward_channels=feedforward_channels, window_size=window_size, shift=False if i % 2 == 0 else True, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=drop_path_rates[i], act_cfg=act_cfg, norm_cfg=norm_cfg, with_cp=with_cp, init_cfg=None) self.blocks.append(block) self.downsample = downsample self.rfp_inplanes = rfp_inplanes if self.rfp_inplanes: self.rfp_conv = build_conv_layer( None, self.rfp_inplanes, embed_dims, 1, stride=1, bias=True) def forward(self, x, hw_shape): for block in self.blocks: x = block(x, hw_shape) if self.downsample: x_down, down_hw_shape = self.downsample(x, hw_shape) return x_down, down_hw_shape, x, hw_shape else: return x, hw_shape, x, hw_shape def rfp_forward(self, x, hw_shape, rfp_feat): for block in self.blocks: x = block(x, hw_shape) haw = hw_shape[0] * hw_shape[1] if 
self.rfp_inplanes: rfp_feat = self.rfp_conv(rfp_feat) x = x + rfp_feat.permute((0, 2, 3, 1)) \ .view(x.shape[0], haw, x.shape[2]).contiguous() if self.downsample: x_down, down_hw_shape = self.downsample(x, hw_shape) return x_down, down_hw_shape, x, hw_shape else: return x, hw_shape, x, hw_shape @BACKBONES.register_module() class SwinTransformerRFP(SwinTransformer): def __init__( self, rfp_inplanes=None, output_img=False, # Old settings pretrain_img_size=224, in_channels=3, embed_dims=96, patch_size=4, window_size=7, mlp_ratio=4, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), strides=(4, 2, 2, 2), out_indices=(0, 1, 2, 3), qkv_bias=True, qk_scale=None, patch_norm=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, use_abs_pos_embed=False, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), with_cp=False, pretrained=None, convert_weights=False, frozen_stages=-1, init_cfg=None): self.rfp_inplanes = rfp_inplanes self.output_img = output_img super().__init__( pretrain_img_size=pretrain_img_size, in_channels=in_channels, embed_dims=embed_dims, patch_size=patch_size, window_size=window_size, mlp_ratio=mlp_ratio, depths=depths, num_heads=num_heads, strides=strides, out_indices=out_indices, qkv_bias=qkv_bias, qk_scale=qk_scale, patch_norm=patch_norm, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=drop_path_rate, use_abs_pos_embed=use_abs_pos_embed, act_cfg=act_cfg, norm_cfg=norm_cfg, with_cp=with_cp, pretrained=pretrained, convert_weights=convert_weights, frozen_stages=frozen_stages, init_cfg=init_cfg ) # Re-write Swin Block self.stages = ModuleList() in_channels = embed_dims num_layers = len(depths) total_depth = sum(depths) dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, total_depth) ] for i in range(num_layers): if i < num_layers - 1: downsample = PatchMerging( in_channels=in_channels, out_channels=2 * in_channels, stride=strides[i + 1], norm_cfg=norm_cfg if patch_norm else None, init_cfg=None) else: downsample = None stage = 
SwinRFPLayer( embed_dims=in_channels, num_heads=num_heads[i], feedforward_channels=mlp_ratio * in_channels, depth=depths[i], window_size=window_size, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], downsample=downsample, act_cfg=act_cfg, norm_cfg=norm_cfg, with_cp=with_cp, rfp_inplanes=rfp_inplanes if i > 0 else None, init_cfg=None) self.stages.append(stage) if downsample: in_channels = downsample.out_channels def forward(self, x): """Forward function.""" outs = list(super().forward(x)) if self.output_img: outs.insert(0, x) return tuple(outs) def rfp_forward(self, x, rfp_feats): x, hw_shape = self.patch_embed(x) if self.use_abs_pos_embed: x = x + self.absolute_pos_embed x = self.drop_after_pos(x) outs = [] for i, stage in enumerate(self.stages): rfp_feat = rfp_feats[i] if i > 0 else None x, hw_shape, out, out_hw_shape = stage.rfp_forward(x, hw_shape, rfp_feat) if i in self.out_indices: norm_layer = getattr(self, f'norm{i}') out = norm_layer(out) out = out.view(-1, *out_hw_shape, self.num_features[i]).permute(0, 3, 1, 2).contiguous() outs.append(out) return outs ================================================ FILE: swin/transformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import math import warnings from typing import Sequence import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import (build_activation_layer, build_conv_layer, build_norm_layer, xavier_init) from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import (BaseTransformerLayer, TransformerLayerSequence, build_transformer_layer_sequence) from mmcv.runner.base_module import BaseModule from mmcv.utils import to_2tuple from torch.nn.init import normal_ from mmdet.models.utils.builder import TRANSFORMER try: from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention except ImportError: warnings.warn( '`MultiScaleDeformableAttention` in MMCV has been moved to ' '`mmcv.ops.multi_scale_deform_attn`, please update your MMCV') from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention def nlc_to_nchw(x, hw_shape): """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor. Args: x (Tensor): The input tensor of shape [N, L, C] before convertion. hw_shape (Sequence[int]): The height and width of output feature map. Returns: Tensor: The output tensor of shape [N, C, H, W] after convertion. """ H, W = hw_shape assert len(x.shape) == 3 B, L, C = x.shape assert L == H * W, 'The seq_len does not match H, W' return x.transpose(1, 2).reshape(B, C, H, W).contiguous() def nchw_to_nlc(x): """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor. Args: x (Tensor): The input tensor of shape [N, C, H, W] before convertion. Returns: Tensor: The output tensor of shape [N, L, C] after convertion. """ assert len(x.shape) == 4 return x.flatten(2).transpose(1, 2).contiguous() class AdaptivePadding(nn.Module): """Applies padding to input (if needed) so that input can get fully covered by filter you specified. It support two modes "same" and "corner". The "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around input. 
The "corner" mode would pad zero to bottom right. Args: kernel_size (int | tuple): Size of the kernel: stride (int | tuple): Stride of the filter. Default: 1: dilation (int | tuple): Spacing between kernel elements. Default: 1 padding (str): Support "same" and "corner", "corner" mode would pad zero to bottom right, and "same" mode would pad zero around input. Default: "corner". Example: >>> kernel_size = 16 >>> stride = 16 >>> dilation = 1 >>> input = torch.rand(1, 1, 15, 17) >>> adap_pad = AdaptivePadding( >>> kernel_size=kernel_size, >>> stride=stride, >>> dilation=dilation, >>> padding="corner") >>> out = adap_pad(input) >>> assert (out.shape[2], out.shape[3]) == (16, 32) >>> input = torch.rand(1, 1, 16, 17) >>> out = adap_pad(input) >>> assert (out.shape[2], out.shape[3]) == (16, 32) """ def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): super(AdaptivePadding, self).__init__() assert padding in ('same', 'corner') kernel_size = to_2tuple(kernel_size) stride = to_2tuple(stride) padding = to_2tuple(padding) dilation = to_2tuple(dilation) self.padding = padding self.kernel_size = kernel_size self.stride = stride self.dilation = dilation def get_pad_shape(self, input_shape): input_h, input_w = input_shape kernel_h, kernel_w = self.kernel_size stride_h, stride_w = self.stride output_h = math.ceil(input_h / stride_h) output_w = math.ceil(input_w / stride_w) pad_h = max((output_h - 1) * stride_h + (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) pad_w = max((output_w - 1) * stride_w + (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) return pad_h, pad_w def forward(self, x): pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) if pad_h > 0 or pad_w > 0: if self.padding == 'corner': x = F.pad(x, [0, pad_w, 0, pad_h]) elif self.padding == 'same': x = F.pad(x, [ pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 ]) return x class PatchEmbed(BaseModule): """Image to Patch Embedding. We use a conv layer to implement PatchEmbed. 
Args: in_channels (int): The num of input channels. Default: 3 embed_dims (int): The dimensions of embedding. Default: 768 conv_type (str): The config dict for embedding conv layer type selection. Default: "Conv2d. kernel_size (int): The kernel_size of embedding conv. Default: 16. stride (int): The slide stride of embedding conv. Default: None (Would be set as `kernel_size`). padding (int | tuple | string ): The padding length of embedding conv. When it is a string, it means the mode of adaptive padding, support "same" and "corner" now. Default: "corner". dilation (int): The dilation rate of embedding conv. Default: 1. bias (bool): Bias of embed conv. Default: True. norm_cfg (dict, optional): Config dict for normalization layer. Default: None. input_size (int | tuple | None): The size of input, which will be used to calculate the out size. Only work when `dynamic_size` is False. Default: None. init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. Default: None. """ def __init__( self, in_channels=3, embed_dims=768, conv_type='Conv2d', kernel_size=16, stride=16, padding='corner', dilation=1, bias=True, norm_cfg=None, input_size=None, init_cfg=None, ): super(PatchEmbed, self).__init__(init_cfg=init_cfg) self.embed_dims = embed_dims if stride is None: stride = kernel_size kernel_size = to_2tuple(kernel_size) stride = to_2tuple(stride) dilation = to_2tuple(dilation) if isinstance(padding, str): self.adap_padding = AdaptivePadding( kernel_size=kernel_size, stride=stride, dilation=dilation, padding=padding) # disable the padding of conv padding = 0 else: self.adap_padding = None padding = to_2tuple(padding) self.projection = build_conv_layer( dict(type=conv_type), in_channels=in_channels, out_channels=embed_dims, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias) if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, embed_dims)[1] else: self.norm = None if input_size: input_size = to_2tuple(input_size) # 
`init_out_size` would be used outside to # calculate the num_patches # when `use_abs_pos_embed` outside self.init_input_size = input_size if self.adap_padding: pad_h, pad_w = self.adap_padding.get_pad_shape(input_size) input_h, input_w = input_size input_h = input_h + pad_h input_w = input_w + pad_w input_size = (input_h, input_w) # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html h_out = (input_size[0] + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) // stride[0] + 1 w_out = (input_size[1] + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) // stride[1] + 1 self.init_out_size = (h_out, w_out) else: self.init_input_size = None self.init_out_size = None def forward(self, x): """ Args: x (Tensor): Has shape (B, C, H, W). In most case, C is 3. Returns: tuple: Contains merged results and its spatial shape. - x (Tensor): Has shape (B, out_h * out_w, embed_dims) - out_size (tuple[int]): Spatial shape of x, arrange as (out_h, out_w). """ if self.adap_padding: x = self.adap_padding(x) x = self.projection(x) out_size = (x.shape[2], x.shape[3]) x = x.flatten(2).transpose(1, 2) if self.norm is not None: x = self.norm(x) return x, out_size class PatchMerging(BaseModule): """Merge patch feature map. This layer groups feature map by kernel_size, and applies norm and linear layers to the grouped feature map. Our implementation uses `nn.Unfold` to merge patch, which is about 25% faster than original implementation. Instead, we need to modify pretrained models for compatibility. Args: in_channels (int): The num of input channels. to gets fully covered by filter and stride you specified.. Default: True. out_channels (int): The num of output channels. kernel_size (int | tuple, optional): the kernel size in the unfold layer. Defaults to 2. stride (int | tuple, optional): the stride of the sliding blocks in the unfold layer. Default: None. (Would be set as `kernel_size`) padding (int | tuple | string ): The padding length of embedding conv. 
When it is a string, it means the mode of adaptive padding, support "same" and "corner" now. Default: "corner". dilation (int | tuple, optional): dilation parameter in the unfold layer. Default: 1. bias (bool, optional): Whether to add bias in linear layer or not. Defaults: False. norm_cfg (dict, optional): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (dict, optional): The extra config for initialization. Default: None. """ def __init__(self, in_channels, out_channels, kernel_size=2, stride=None, padding='corner', dilation=1, bias=False, norm_cfg=dict(type='LN'), init_cfg=None): super().__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_channels = out_channels if stride: stride = stride else: stride = kernel_size kernel_size = to_2tuple(kernel_size) stride = to_2tuple(stride) dilation = to_2tuple(dilation) if isinstance(padding, str): self.adap_padding = AdaptivePadding( kernel_size=kernel_size, stride=stride, dilation=dilation, padding=padding) # disable the padding of unfold padding = 0 else: self.adap_padding = None padding = to_2tuple(padding) self.sampler = nn.Unfold( kernel_size=kernel_size, dilation=dilation, padding=padding, stride=stride) sample_dim = kernel_size[0] * kernel_size[1] * in_channels if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, sample_dim)[1] else: self.norm = None self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) def forward(self, x, input_size): """ Args: x (Tensor): Has shape (B, H*W, C_in). input_size (tuple[int]): The spatial shape of x, arrange as (H, W). Default: None. Returns: tuple: Contains merged results and its spatial shape. - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) - out_size (tuple[int]): Spatial shape of x, arrange as (Merged_H, Merged_W). 
""" B, L, C = x.shape assert isinstance(input_size, Sequence), f'Expect ' \ f'input_size is ' \ f'`Sequence` ' \ f'but get {input_size}' H, W = input_size assert L == H * W, 'input feature has wrong size' x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W # Use nn.Unfold to merge patch. About 25% faster than original method, # but need to modify pretrained model for compatibility if self.adap_padding: x = self.adap_padding(x) H, W = x.shape[-2:] x = self.sampler(x) # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * (self.sampler.kernel_size[0] - 1) - 1) // self.sampler.stride[0] + 1 out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * (self.sampler.kernel_size[1] - 1) - 1) // self.sampler.stride[1] + 1 output_size = (out_h, out_w) x = x.transpose(1, 2) # B, H/2*W/2, 4*C x = self.norm(x) if self.norm else x x = self.reduction(x) return x, output_size def inverse_sigmoid(x, eps=1e-5): """Inverse function of sigmoid. Args: x (Tensor): The tensor to do the inverse. eps (float): EPS avoid numerical overflow. Defaults 1e-5. Returns: Tensor: The x has passed the inverse function of sigmoid, has same shape with input. """ x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) return torch.log(x1 / x2) ================================================ FILE: tools/dataset/cityscapes_instance_idmap.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse import os.path as osp import mmcv from cityscapesscripts.preparation.json2instanceImg import json2instanceImg def convert_json_to_label(json_file): label_file = json_file.replace('_polygons.json', '_instanceTrainIds.png') json2instanceImg(json_file, label_file, 'trainIds') def parse_args(): parser = argparse.ArgumentParser( description='Convert Cityscapes annotations to TrainIds') parser.add_argument('cityscapes_path', help='cityscapes data path') parser.add_argument('--gt-dir', default='gtFine', type=str) parser.add_argument('-o', '--out-dir', help='output path') parser.add_argument( '--nproc', default=1, type=int, help='number of process') args = parser.parse_args() return args def main(): args = parse_args() cityscapes_path = args.cityscapes_path out_dir = args.out_dir if args.out_dir else cityscapes_path mmcv.mkdir_or_exist(out_dir) gt_dir = osp.join(cityscapes_path, args.gt_dir) poly_files = [] for poly in mmcv.scandir(gt_dir, '_polygons.json', recursive=True): poly_file = osp.join(gt_dir, poly) poly_files.append(poly_file) if args.nproc > 1: mmcv.track_parallel_progress(convert_json_to_label, poly_files, args.nproc) else: mmcv.track_progress(convert_json_to_label, poly_files) # install mmcv and cityscapesscripts # python cityscapes_instance.py {PATH/TO/CITYSCAPES} --nproc 56 if __name__ == '__main__': main() ================================================ FILE: tools/dataset/youtubevis2coco.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse
import copy
import os
import os.path as osp
from collections import defaultdict

import mmcv


def parse_args():
    """Parse the command line; all three options are mandatory."""
    parser = argparse.ArgumentParser(
        description='YouTube-VIS to COCO Video format')
    parser.add_argument(
        '-i',
        '--input',
        required=True,
        help='root directory of YouTube-VIS annotations',
    )
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        help='directory to save coco formatted label file',
    )
    parser.add_argument(
        '--version',
        required=True,
        choices=['2019', '2021'],
        help='The version of YouTube-VIS Dataset',
    )
    return parser.parse_args()


def convert_vis(ann_dir, save_dir, dataset_version, mode='train'):
    """Convert YouTube-VIS dataset in COCO style.

    The result is written to
    ``{save_dir}/youtube_vis_{dataset_version}_{mode}.json`` and a short
    summary is printed. Annotations are only converted for ``mode='train'``.

    Args:
        ann_dir (str): The path of YouTube-VIS dataset.
        save_dir (str): The path to save `VIS`.
        dataset_version (str): The version of dataset. Options are '2019',
            '2021'.
        mode (str): Convert train dataset or validation dataset or test
            dataset. Options are 'train', 'valid', 'test'. Default: 'train'.
    """
    assert dataset_version in ['2019', '2021']
    assert mode in ['train', 'valid', 'test']
    VIS = defaultdict(list)
    # Running 1-based counters; each ends one past the number of items seen.
    records = dict(vid_id=1, img_id=1, ann_id=1, global_instance_id=1)
    # Per-category annotation count; missing categories read as 0 so the
    # summary below cannot fail for a class without objects.
    obj_num_classes = defaultdict(int)

    if dataset_version == '2019':
        official_anns = mmcv.load(osp.join(ann_dir, f'{mode}.json'))
    elif dataset_version == '2021':
        official_anns = mmcv.load(osp.join(ann_dir, mode, 'instances.json'))
    VIS['categories'] = copy.deepcopy(official_anns['categories'])

    has_annotations = mode == 'train'
    if has_annotations:
        # Group the per-track annotations by the video they belong to.
        vid_to_anns = defaultdict(list)
        for ann_info in official_anns['annotations']:
            vid_to_anns[ann_info['video_id']].append(ann_info)

    video_infos = official_anns['videos']
    for video_info in video_infos:
        # The video name is the leading directory of its first frame path.
        video_name = video_info['file_names'][0].split('/')[0]
        video = dict(id=video_info['id'], name=video_name)
        VIS['videos'].append(video)

        num_frames = len(video_info['file_names'])
        width = video_info['width']
        height = video_info['height']
        if has_annotations:
            ann_infos_in_video = vid_to_anns[video_info['id']]
            # Maps a track id to its global instance id within this video.
            instance_id_maps = dict()

        for frame_id in range(num_frames):
            image = dict(
                file_name=video_info['file_names'][frame_id],
                height=height,
                width=width,
                id=records['img_id'],
                frame_id=frame_id,
                video_id=video_info['id'])
            VIS['images'].append(image)

            if has_annotations:
                for ann_info in ann_infos_in_video:
                    bbox = ann_info['bboxes'][frame_id]
                    # A missing box means the track is absent in this frame.
                    if bbox is None:
                        continue

                    category_id = ann_info['category_id']
                    track_id = ann_info['id']
                    segmentation = ann_info['segmentations'][frame_id]
                    area = ann_info['areas'][frame_id]
                    assert isinstance(category_id, int)
                    assert isinstance(track_id, int)
                    assert segmentation is not None
                    assert area is not None

                    if track_id in instance_id_maps:
                        instance_id = instance_id_maps[track_id]
                    else:
                        instance_id = records['global_instance_id']
                        records['global_instance_id'] += 1
                        instance_id_maps[track_id] = instance_id

                    ann = dict(
                        id=records['ann_id'],
                        video_id=video_info['id'],
                        image_id=records['img_id'],
                        category_id=category_id,
                        instance_id=instance_id,
                        bbox=bbox,
                        segmentation=segmentation,
                        area=area,
                        iscrowd=ann_info['iscrowd'])

                    obj_num_classes[category_id] += 1

                    VIS['annotations'].append(ann)
                    records['ann_id'] += 1
            records['img_id'] += 1
        records['vid_id'] += 1

    # Create the target directory without racing against other processes.
    os.makedirs(save_dir, exist_ok=True)
    mmcv.dump(VIS,
              osp.join(save_dir, f'youtube_vis_{dataset_version}_{mode}.json'))
    print(f'-----YouTube VIS {dataset_version} {mode}------')
    print(f'{records["vid_id"]- 1} videos')
    print(f'{records["img_id"]- 1} images')
    if has_annotations:
        print(f'{records["ann_id"] - 1} objects')
        print(f'{records["global_instance_id"] - 1} instances')
    print('-----------------------')
    if has_annotations:
        # Category id `i` is looked up by its 1-based list position.
        for i, category in enumerate(VIS['categories'], 1):
            class_name = category['name']
            print(f'Class {i} {class_name} has {obj_num_classes[i]} objects.')


def main():
    """Convert the train, valid and test splits in turn."""
    args = parse_args()
    for sub_set in ['train', 'valid', 'test']:
        convert_vis(args.input, args.output, args.version, sub_set)


if __name__ == '__main__':
    main()
================================================
FILE: tools/dist_step_test.sh
================================================
#!/usr/bin/env bash
# Usage: dist_step_test.sh CONFIG CHECKPOINT GPUS [extra test_step.py args...]
# PORT may be set in the environment (default 29500).

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29500}

# Expansions are quoted so paths and extra arguments containing spaces
# reach Python as single arguments.
PYTHONPATH="$(dirname "$0")/..:$PYTHONPATH" \
python -m torch.distributed.launch --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$(dirname "$0")/test_step.py" "$CONFIG" "$CHECKPOINT" --launcher pytorch "${@:4}"


================================================
FILE: tools/dist_test.sh
================================================
#!/usr/bin/env bash
# Usage: dist_test.sh CONFIG CHECKPOINT GPUS [extra test.py args...]
# PORT may be set in the environment (default 29500).

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29500}

PYTHONPATH="$(dirname "$0")/..:$PYTHONPATH" \
python -m torch.distributed.launch --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$(dirname "$0")/test.py" "$CONFIG" "$CHECKPOINT" --launcher pytorch "${@:4}"


================================================
FILE: tools/dist_train.sh
================================================
#!/usr/bin/env bash
# Usage: dist_train.sh CONFIG GPUS [extra train.py args...]
# PORT defaults to a random value in 29500..29528 unless set in the
# environment.

CONFIG=$1
GPUS=$2
PORT=${PORT:-$((29500 + $RANDOM % 29))}

PYTHONPATH="$(dirname "$0")/..:$PYTHONPATH" \
python -m torch.distributed.launch --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$(dirname "$0")/train.py" "$CONFIG" --launcher pytorch "${@:3}"


================================================
FILE: tools/dist_train_new.sh
================================================
#!/usr/bin/env bash
# Usage: dist_train_new.sh CONFIG GPUS [extra train_new.py args...]
# Same as dist_train.sh but launches through `torch.distributed.run`.

CONFIG=$1
GPUS=$2
PORT=${PORT:-$((29500 + $RANDOM % 29))}

PYTHONPATH="$(dirname "$0")/..:$PYTHONPATH" \
python -m torch.distributed.run --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$(dirname "$0")/train_new.py" "$CONFIG" --launcher pytorch "${@:3}"


================================================
FILE: tools/dist_vps_test.sh
================================================
#!/usr/bin/env bash
# Usage: dist_vps_test.sh CONFIG CHECKPOINT GPUS [extra test_vps.py args...]
# PORT may be set in the environment (default 29500).

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29500}

PYTHONPATH="$(dirname "$0")/..:$PYTHONPATH" \
python -m torch.distributed.launch --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$(dirname "$0")/test_vps.py" "$CONFIG" "$CHECKPOINT" --launcher pytorch "${@:4}"
================================================
FILE: tools/docker.sh
================================================
#!/bin/bash
# Start an interactive GPU container with the current directory mounted at
# /data, $DATALOC at /data/data and $LOGLOC at /data/logger. DATALOC, LOGLOC
# and IMG can be overridden from the environment.

DATALOC=${DATALOC:-~/datasets}
LOGLOC=${LOGLOC:-~/logger}
IMG=${IMG:-"harbory/openmmlab:latest"}

docker run --gpus all -it --rm --ipc=host --net=host -v $(pwd):/data -v $DATALOC:/data/data -v $LOGLOC:/data/logger $IMG


================================================
FILE: tools/eval_dstq.py
================================================
import argparse
import os

import mmcv
import numpy as np
import torch
from mmcv import ProgressBar
import torch.nn.functional as F

from tools.utils.DSTQ import DSTQuality
from tools.utils.STQ import STQuality


def parse_args():
    """Parse the command line of the DSTQ evaluation script."""
    parser = argparse.ArgumentParser(description='Evaluation of DSTQ')
    parser.add_argument('result_path')
    parser.add_argument('--gt-path', default='data/kitti-dvps')
    parser.add_argument('--split', default='val')
    parser.add_argument(
        '--depth',
        action='store_true',
        help='eval depth')
    # NOTE(review): `--nproc` is parsed but not read anywhere in this script.
    parser.add_argument('--nproc', default=1, type=int, help='number of process')
    args = parser.parse_args()
    return args


def updater(pred_ins_name, pred_cls_name, pred_dep_name, gt_cls_seq_name, gt_ins_seq_name, gt_dep_seq_name,
            updater_obj, seq_id):
    """Feed one frame (prediction + ground truth) into the metric object.

    Args:
        pred_ins_name (str): Path of the predicted instance-id image.
        pred_cls_name (str): Path of the predicted class-id image.
        pred_dep_name (str | None): Path of the predicted depth image, or
            None to skip depth.
        gt_cls_seq_name (str): Path of the ground-truth class-id image.
        gt_ins_seq_name (str): Path of the ground-truth instance-id image.
        gt_dep_seq_name (str | None): Path of the ground-truth depth image.
        updater_obj: Metric object exposing ``update_state``.
        seq_id (int): Sequence id forwarded to ``update_state``.
    """
    pred_ins = mmcv.imread(pred_ins_name, flag='unchanged').astype(np.int32)
    pred_cls = mmcv.imread(pred_cls_name, flag='unchanged').astype(np.int32)
    pred_dep = mmcv.imread(pred_dep_name, flag='unchanged').astype(np.float32) if pred_dep_name is not None else None

    gt_ins = mmcv.imread(gt_ins_seq_name, flag='unchanged').astype(np.int32)
    gt_cls = mmcv.imread(gt_cls_seq_name, flag='unchanged').astype(np.int32)
    gt_dep = mmcv.imread(gt_dep_seq_name, flag='unchanged').astype(np.float32) if gt_dep_seq_name is not None else None

    if pred_dep is not None:
        # Resize the predicted depth to the ground-truth resolution using
        # F.interpolate's default mode.
        # NOTE(review): this dereferences gt_dep, so a predicted depth
        # without a ground-truth depth would fail here.
        pred_dep = F.interpolate(torch.from_numpy(pred_dep)[None][None], size=gt_dep.shape)[0][0].numpy()

    # Pixels labelled 255 in the ground truth are excluded.
    valid_mask_seg = gt_cls != 255

    # Panoptic id = class id * 2**16 + instance id.
    pred_masked_ps = pred_cls[valid_mask_seg] * (2 ** 16) + pred_ins[valid_mask_seg]
    gt_masked_ps = gt_cls[valid_mask_seg] * (2 ** 16) + gt_ins[valid_mask_seg]
    if pred_dep_name is not None:
        # Depth is only compared where the ground truth is positive.
        valid_mask_dep = gt_dep > 0.
        pred_masked_depth = pred_dep[valid_mask_dep]
        gt_masked_depth = gt_dep[valid_mask_dep]
        updater_obj.update_state(gt_masked_ps, pred_masked_ps, gt_masked_depth, pred_masked_depth, seq_id)
    else:
        updater_obj.update_state(gt_masked_ps, pred_masked_ps, seq_id)


def eval_dstq(result_dir, gt_dir, seq_ids, with_depth=True):
    """Accumulate the metric over the given sequences and print the result.

    Predictions are read from ``{result_dir}/panoptic/{seq_id}`` (files whose
    name contains 'ins' / 'cat') and, with depth, from
    ``{result_dir}/depth/{seq_id}``. Ground-truth files are taken from
    ``gt_dir`` and matched to a sequence by their zero-padded 6-digit prefix.
    Frames are paired by sorted order via ``zip``, which stops at the
    shortest list.

    Args:
        result_dir (str): Root directory of the predictions.
        gt_dir (str): Directory holding the ground-truth images.
        seq_ids (list[int]): Sequences to evaluate.
        with_depth (bool): Use ``DSTQuality`` and depth maps when True,
            ``STQuality`` otherwise. Default: True.
    """
    if with_depth:
        dstq_obj = DSTQuality(
            num_classes=19,
            things_list=list(range(8)),
            ignore_label=255,
            label_bit_shift=16,
            offset=2 ** 16 * 256,
            depth_threshold=(1.25,),
        )
    else:
        dstq_obj = STQuality(
            num_classes=19,
            things_list=list(range(8)),
            ignore_label=255,
            label_bit_shift=16,
            offset=2 ** 16 * 256,
        )
    gt_names = list(mmcv.scandir(gt_dir))
    gt_cls_names = sorted(list(filter(lambda x: 'gtFine_class' in x, gt_names)))
    gt_ins_names = sorted(list(filter(lambda x: 'gtFine_instance' in x, gt_names)))
    if with_depth:
        gt_dep_names = sorted(list(filter(lambda x: 'depth' in x, gt_names)))
    else:
        gt_dep_names = None

    for seq_id in seq_ids:
        pred_name_panoptic = list(mmcv.scandir(os.path.join(result_dir, 'panoptic', str(seq_id))))
        pred_ins_names = sorted(list(filter(lambda x: 'ins' in x, pred_name_panoptic)))
        pred_cls_names = sorted(list(filter(lambda x: 'cat' in x, pred_name_panoptic)))
        if with_depth:
            pred_name_depth = list(mmcv.scandir(os.path.join(result_dir, 'depth', str(seq_id))))
            pred_dep_names = sorted(pred_name_depth)
        else:
            # Placeholders keep the zip below aligned when depth is off.
            pred_dep_names = [None] * len(pred_ins_names)

        gt_cls_seq_names = list(filter(lambda x: x.startswith('{:06d}'.format(seq_id)), gt_cls_names))
        gt_ins_seq_names = list(filter(lambda x: x.startswith('{:06d}'.format(seq_id)), gt_ins_names))
        if with_depth:
            gt_dep_seq_names = list(filter(lambda x: x.startswith('{:06d}'.format(seq_id)), gt_dep_names))
        else:
            gt_dep_seq_names = [None] * len(gt_cls_seq_names)

        prog_bar = ProgressBar(len(pred_ins_names))
        for pred_ins_name, pred_cls_name, pred_dep_name, gt_cls_seq_name, gt_ins_seq_name, gt_dep_seq_name in zip(
                pred_ins_names, pred_cls_names, pred_dep_names, gt_cls_seq_names, gt_ins_seq_names, gt_dep_seq_names
        ):
            prog_bar.update()
            updater(
                os.path.join(result_dir, 'panoptic', str(seq_id), pred_ins_name),
                os.path.join(result_dir, 'panoptic', str(seq_id), pred_cls_name),
                os.path.join(result_dir, 'depth', str(seq_id), pred_dep_name) if pred_dep_name is not None else None,
                os.path.join(gt_dir, gt_cls_seq_name),
                os.path.join(gt_dir, gt_ins_seq_name),
                os.path.join(gt_dir, gt_dep_seq_name) if gt_dep_seq_name is not None else None,
                dstq_obj,
                seq_id
            )
    result = dstq_obj.result()
    print(result)


if __name__ == '__main__':
    args = parse_args()
    result_path = args.result_path
    gt_path = args.gt_path
    split = args.split
    # Only sequence 8 is evaluated by this script.
    eval_dstq(result_path, os.path.join(gt_path, 'video_sequence', split), [8], args.depth)


================================================
FILE: tools/eval_dstq_step.py
================================================
import argparse
import os

import mmcv
import numpy as np
import torch
from mmcv import ProgressBar
import torch.nn.functional as F

from tools.utils.DSTQ import DSTQuality
from tools.utils.STQ import STQuality


def parse_args():
    """Parse the command line of the KITTI-STEP DSTQ evaluation script."""
    parser = argparse.ArgumentParser(description='Evaluation of DSTQ')
    parser.add_argument('result_path')
    parser.add_argument('--gt-path', default='data/kitti-step')
    parser.add_argument('--split', default='val')
    parser.add_argument(
        '--depth',
        action='store_true',
        help='eval depth')
    # NOTE(review): `--nproc` is parsed but not read anywhere in this script.
    parser.add_argument('--nproc', default=1, type=int, help='number of process')
    args = parser.parse_args()
    return args


def updater(pred_ins_name, pred_cls_name, pred_dep_name, gt_pan_seq_name, gt_dep_seq_name,
            updater_obj, seq_id):
    """Feed one frame (prediction + ground truth) into the metric object.

    The ground truth is a single RGB panoptic image: channel 0 holds the
    class id and channels 1 and 2 hold the instance id as
    ``channel1 * 256 + channel2``.

    Args:
        pred_ins_name (str): Path of the predicted instance-id image.
        pred_cls_name (str): Path of the predicted class-id image.
        pred_dep_name (str | None): Path of the predicted depth image, or
            None to skip depth.
        gt_pan_seq_name (str): Path of the ground-truth panoptic image.
        gt_dep_seq_name (str | None): Path of the ground-truth depth image.
        updater_obj: Metric object exposing ``update_state``.
        seq_id (int): Sequence id forwarded to ``update_state``.
    """
    pred_ins = mmcv.imread(pred_ins_name, flag='unchanged').astype(np.int32)
    pred_cls = mmcv.imread(pred_cls_name, flag='unchanged').astype(np.int32)
    pred_dep = mmcv.imread(pred_dep_name, flag='unchanged').astype(np.float32) if pred_dep_name is not None else None

    gt_pan = mmcv.imread(gt_pan_seq_name, flag='color', channel_order='rgb')
    gt_cls = gt_pan[..., 0].astype(np.int32)
    gt_ins = gt_pan[..., 1].astype(np.int32) * 256 + gt_pan[..., 2].astype(np.int32)
    gt_dep = mmcv.imread(gt_dep_seq_name, flag='unchanged').astype(np.float32) if gt_dep_seq_name is not None else None

    if pred_dep is not None:
        # Resize the predicted depth to the ground-truth resolution using
        # F.interpolate's default mode.
        pred_dep = F.interpolate(torch.from_numpy(pred_dep)[None][None], size=gt_dep.shape)[0][0].numpy()

    # Pixels labelled 255 in the ground truth are excluded.
    valid_mask_seg = gt_cls != 255

    # Panoptic id = class id * 2**16 + instance id.
    pred_masked_ps = pred_cls[valid_mask_seg] * (2 ** 16) + pred_ins[valid_mask_seg]
    gt_masked_ps = gt_cls[valid_mask_seg] * (2 ** 16) + gt_ins[valid_mask_seg]
    if pred_dep_name is not None:
        # Depth is only compared where the ground truth is positive.
        valid_mask_dep = gt_dep > 0.
        pred_masked_depth = pred_dep[valid_mask_dep]
        gt_masked_depth = gt_dep[valid_mask_dep]
        updater_obj.update_state(gt_masked_ps, pred_masked_ps, gt_masked_depth, pred_masked_depth, seq_id)
    else:
        updater_obj.update_state(gt_masked_ps, pred_masked_ps, seq_id)


def eval_dstq(result_dir, gt_dir, seq_ids, with_depth=True):
    """Accumulate the metric over the given sequences and print the result.

    Same layout as the KITTI-DVPS variant, except that the ground truth is
    one 'panoptic' image per frame and classes 11..18 are the thing classes.

    Args:
        result_dir (str): Root directory of the predictions.
        gt_dir (str): Directory holding the ground-truth images.
        seq_ids (list[int]): Sequences to evaluate.
        with_depth (bool): Use ``DSTQuality`` and depth maps when True,
            ``STQuality`` otherwise. Default: True.
    """
    if with_depth:
        dstq_obj = DSTQuality(
            num_classes=19,
            things_list=list(range(11, 19)),
            ignore_label=255,
            label_bit_shift=16,
            offset=2 ** 16 * 256,
            depth_threshold=(1.25,),
        )
    else:
        dstq_obj = STQuality(
            num_classes=19,
            things_list=list(range(11, 19)),
            ignore_label=255,
            label_bit_shift=16,
            offset=2 ** 16 * 256,
        )
    gt_names = list(mmcv.scandir(gt_dir))
    gt_pan_names = sorted(list(filter(lambda x: 'panoptic' in x, gt_names)))
    if with_depth:
        gt_dep_names = sorted(list(filter(lambda x: 'depth' in x, gt_names)))
    else:
        gt_dep_names = None

    for seq_id in seq_ids:
        pred_name_panoptic = list(mmcv.scandir(os.path.join(result_dir, 'panoptic', str(seq_id))))
        pred_ins_names = sorted(list(filter(lambda x: 'ins' in x, pred_name_panoptic)))
        pred_cls_names = sorted(list(filter(lambda x: 'cat' in x, pred_name_panoptic)))
        if with_depth:
            pred_name_depth = list(mmcv.scandir(os.path.join(result_dir, 'depth', str(seq_id))))
            pred_dep_names = sorted(pred_name_depth)
        else:
            # Placeholders keep the zip below aligned when depth is off.
            pred_dep_names = [None] * len(pred_ins_names)

        gt_pan_seq_names = list(filter(lambda x: x.startswith('{:06d}'.format(seq_id)), gt_pan_names))
        if with_depth:
            gt_dep_seq_names = list(filter(lambda x: x.startswith('{:06d}'.format(seq_id)), gt_dep_names))
        else:
            gt_dep_seq_names = [None] * len(gt_pan_seq_names)

        prog_bar = ProgressBar(len(pred_ins_names))
        for pred_ins_name, pred_cls_name, pred_dep_name, gt_pan_seq_name, gt_dep_seq_name in zip(
                pred_ins_names, pred_cls_names, pred_dep_names, gt_pan_seq_names, gt_dep_seq_names
        ):
            prog_bar.update()
            updater(
                os.path.join(result_dir, 'panoptic', str(seq_id), pred_ins_name),
                os.path.join(result_dir, 'panoptic', str(seq_id), pred_cls_name),
                os.path.join(result_dir, 'depth', str(seq_id), pred_dep_name) if pred_dep_name is not None else None,
                os.path.join(gt_dir, gt_pan_seq_name),
                os.path.join(gt_dir, gt_dep_seq_name) if gt_dep_seq_name is not None else None,
                dstq_obj,
                seq_id
            )
    result = dstq_obj.result()
    print(result)


if __name__ == '__main__':
    args = parse_args()
    result_path = args.result_path
    gt_path = args.gt_path
    split = args.split
    # The sequence ids evaluated by this script are fixed here.
    eval_dstq(result_path, os.path.join(gt_path, 'video_sequence', split), [2, 6, 7, 8, 10, 13, 14, 16, 18], args.depth)


================================================
FILE: tools/eval_dstq_vipseg.py
================================================
import argparse
import os

import mmcv
import numpy as np
import torch
from mmcv import ProgressBar
import torch.nn.functional as F

from tools.utils.DSTQ import DSTQuality
from tools.utils.STQ import STQuality

# Category table: one dict per class with its id, name, thing flag and colour.
CLASSES = [
    {"id": 0, "name": "wall", "isthing": 0, "color": [120, 120, 120]},
    {"id": 1, "name": "ceiling", "isthing": 0, "color": [180, 120, 120]},
    {"id": 2, "name": "door", "isthing": 1, "color": [6, 230, 230]},
    {"id": 3, "name": "stair", "isthing": 0, "color": [80, 50, 50]},
    {"id": 4, "name": "ladder", "isthing": 1, "color": [4, 200, 3]},
    {"id": 5, "name": "escalator", "isthing": 0, "color": [120, 120, 80]},
    {"id": 6, "name": "Playground_slide", "isthing": 0, "color": [140, 140, 140]},
    {"id": 7, "name": "handrail_or_fence", "isthing":
0, "color": [204, 5, 255]}, {"id": 8, "name": "window", "isthing": 1, "color": [230, 230, 230]}, {"id": 9, "name": "rail", "isthing": 0, "color": [4, 250, 7]}, {"id": 10, "name": "goal", "isthing": 1, "color": [224, 5, 255]}, {"id": 11, "name": "pillar", "isthing": 0, "color": [235, 255, 7]}, {"id": 12, "name": "pole", "isthing": 0, "color": [150, 5, 61]}, {"id": 13, "name": "floor", "isthing": 0, "color": [120, 120, 70]}, {"id": 14, "name": "ground", "isthing": 0, "color": [8, 255, 51]}, {"id": 15, "name": "grass", "isthing": 0, "color": [255, 6, 82]}, {"id": 16, "name": "sand", "isthing": 0, "color": [143, 255, 140]}, {"id": 17, "name": "athletic_field", "isthing": 0, "color": [204, 255, 4]}, {"id": 18, "name": "road", "isthing": 0, "color": [255, 51, 7]}, {"id": 19, "name": "path", "isthing": 0, "color": [204, 70, 3]}, {"id": 20, "name": "crosswalk", "isthing": 0, "color": [0, 102, 200]}, {"id": 21, "name": "building", "isthing": 0, "color": [61, 230, 250]}, {"id": 22, "name": "house", "isthing": 0, "color": [255, 6, 51]}, {"id": 23, "name": "bridge", "isthing": 0, "color": [11, 102, 255]}, {"id": 24, "name": "tower", "isthing": 0, "color": [255, 7, 71]}, {"id": 25, "name": "windmill", "isthing": 0, "color": [255, 9, 224]}, {"id": 26, "name": "well_or_well_lid", "isthing": 0, "color": [9, 7, 230]}, {"id": 27, "name": "other_construction", "isthing": 0, "color": [220, 220, 220]}, {"id": 28, "name": "sky", "isthing": 0, "color": [255, 9, 92]}, {"id": 29, "name": "mountain", "isthing": 0, "color": [112, 9, 255]}, {"id": 30, "name": "stone", "isthing": 0, "color": [8, 255, 214]}, {"id": 31, "name": "wood", "isthing": 0, "color": [7, 255, 224]}, {"id": 32, "name": "ice", "isthing": 0, "color": [255, 184, 6]}, {"id": 33, "name": "snowfield", "isthing": 0, "color": [10, 255, 71]}, {"id": 34, "name": "grandstand", "isthing": 0, "color": [255, 41, 10]}, {"id": 35, "name": "sea", "isthing": 0, "color": [7, 255, 255]}, {"id": 36, "name": "river", "isthing": 0, "color": 
[224, 255, 8]}, {"id": 37, "name": "lake", "isthing": 0, "color": [102, 8, 255]}, {"id": 38, "name": "waterfall", "isthing": 0, "color": [255, 61, 6]}, {"id": 39, "name": "water", "isthing": 0, "color": [255, 194, 7]}, {"id": 40, "name": "billboard_or_Bulletin_Board", "isthing": 0, "color": [255, 122, 8]}, {"id": 41, "name": "sculpture", "isthing": 1, "color": [0, 255, 20]}, {"id": 42, "name": "pipeline", "isthing": 0, "color": [255, 8, 41]}, {"id": 43, "name": "flag", "isthing": 1, "color": [255, 5, 153]}, {"id": 44, "name": "parasol_or_umbrella", "isthing": 1, "color": [6, 51, 255]}, {"id": 45, "name": "cushion_or_carpet", "isthing": 0, "color": [235, 12, 255]}, {"id": 46, "name": "tent", "isthing": 1, "color": [160, 150, 20]}, {"id": 47, "name": "roadblock", "isthing": 1, "color": [0, 163, 255]}, {"id": 48, "name": "car", "isthing": 1, "color": [140, 140, 140]}, {"id": 49, "name": "bus", "isthing": 1, "color": [250, 10, 15]}, {"id": 50, "name": "truck", "isthing": 1, "color": [20, 255, 0]}, {"id": 51, "name": "bicycle", "isthing": 1, "color": [31, 255, 0]}, {"id": 52, "name": "motorcycle", "isthing": 1, "color": [255, 31, 0]}, {"id": 53, "name": "wheeled_machine", "isthing": 0, "color": [255, 224, 0]}, {"id": 54, "name": "ship_or_boat", "isthing": 1, "color": [153, 255, 0]}, {"id": 55, "name": "raft", "isthing": 1, "color": [0, 0, 255]}, {"id": 56, "name": "airplane", "isthing": 1, "color": [255, 71, 0]}, {"id": 57, "name": "tyre", "isthing": 0, "color": [0, 235, 255]}, {"id": 58, "name": "traffic_light", "isthing": 0, "color": [0, 173, 255]}, {"id": 59, "name": "lamp", "isthing": 0, "color": [31, 0, 255]}, {"id": 60, "name": "person", "isthing": 1, "color": [11, 200, 200]}, {"id": 61, "name": "cat", "isthing": 1, "color": [255, 82, 0]}, {"id": 62, "name": "dog", "isthing": 1, "color": [0, 255, 245]}, {"id": 63, "name": "horse", "isthing": 1, "color": [0, 61, 255]}, {"id": 64, "name": "cattle", "isthing": 1, "color": [0, 255, 112]}, {"id": 65, "name": 
"other_animal", "isthing": 1, "color": [0, 255, 133]}, {"id": 66, "name": "tree", "isthing": 0, "color": [255, 0, 0]}, {"id": 67, "name": "flower", "isthing": 0, "color": [255, 163, 0]}, {"id": 68, "name": "other_plant", "isthing": 0, "color": [255, 102, 0]}, {"id": 69, "name": "toy", "isthing": 0, "color": [194, 255, 0]}, {"id": 70, "name": "ball_net", "isthing": 0, "color": [0, 143, 255]}, {"id": 71, "name": "backboard", "isthing": 0, "color": [51, 255, 0]}, {"id": 72, "name": "skateboard", "isthing": 1, "color": [0, 82, 255]}, {"id": 73, "name": "bat", "isthing": 0, "color": [0, 255, 41]}, {"id": 74, "name": "ball", "isthing": 1, "color": [0, 255, 173]}, {"id": 75, "name": "cupboard_or_showcase_or_storage_rack", "isthing": 0, "color": [10, 0, 255]}, {"id": 76, "name": "box", "isthing": 1, "color": [173, 255, 0]}, {"id": 77, "name": "traveling_case_or_trolley_case", "isthing": 1, "color": [0, 255, 153]}, {"id": 78, "name": "basket", "isthing": 1, "color": [255, 92, 0]}, {"id": 79, "name": "bag_or_package", "isthing": 1, "color": [255, 0, 255]}, {"id": 80, "name": "trash_can", "isthing": 0, "color": [255, 0, 245]}, {"id": 81, "name": "cage", "isthing": 0, "color": [255, 0, 102]}, {"id": 82, "name": "plate", "isthing": 1, "color": [255, 173, 0]}, {"id": 83, "name": "tub_or_bowl_or_pot", "isthing": 1, "color": [255, 0, 20]}, {"id": 84, "name": "bottle_or_cup", "isthing": 1, "color": [255, 184, 184]}, {"id": 85, "name": "barrel", "isthing": 1, "color": [0, 31, 255]}, {"id": 86, "name": "fishbowl", "isthing": 1, "color": [0, 255, 61]}, {"id": 87, "name": "bed", "isthing": 1, "color": [0, 71, 255]}, {"id": 88, "name": "pillow", "isthing": 1, "color": [255, 0, 204]}, {"id": 89, "name": "table_or_desk", "isthing": 1, "color": [0, 255, 194]}, {"id": 90, "name": "chair_or_seat", "isthing": 1, "color": [0, 255, 82]}, {"id": 91, "name": "bench", "isthing": 1, "color": [0, 10, 255]}, {"id": 92, "name": "sofa", "isthing": 1, "color": [0, 112, 255]}, {"id": 93, "name": "shelf", 
"isthing": 0, "color": [51, 0, 255]}, {"id": 94, "name": "bathtub", "isthing": 0, "color": [0, 194, 255]}, {"id": 95, "name": "gun", "isthing": 1, "color": [0, 122, 255]}, {"id": 96, "name": "commode", "isthing": 1, "color": [0, 255, 163]}, {"id": 97, "name": "roaster", "isthing": 1, "color": [255, 153, 0]}, {"id": 98, "name": "other_machine", "isthing": 0, "color": [0, 255, 10]}, {"id": 99, "name": "refrigerator", "isthing": 1, "color": [255, 112, 0]}, {"id": 100, "name": "washing_machine", "isthing": 1, "color": [143, 255, 0]}, {"id": 101, "name": "Microwave_oven", "isthing": 1, "color": [82, 0, 255]}, {"id": 102, "name": "fan", "isthing": 1, "color": [163, 255, 0]}, {"id": 103, "name": "curtain", "isthing": 0, "color": [255, 235, 0]}, {"id": 104, "name": "textiles", "isthing": 0, "color": [8, 184, 170]}, {"id": 105, "name": "clothes", "isthing": 0, "color": [133, 0, 255]}, {"id": 106, "name": "painting_or_poster", "isthing": 1, "color": [0, 255, 92]}, {"id": 107, "name": "mirror", "isthing": 1, "color": [184, 0, 255]}, {"id": 108, "name": "flower_pot_or_vase", "isthing": 1, "color": [255, 0, 31]}, {"id": 109, "name": "clock", "isthing": 1, "color": [0, 184, 255]}, {"id": 110, "name": "book", "isthing": 0, "color": [0, 214, 255]}, {"id": 111, "name": "tool", "isthing": 0, "color": [255, 0, 112]}, {"id": 112, "name": "blackboard", "isthing": 0, "color": [92, 255, 0]}, {"id": 113, "name": "tissue", "isthing": 0, "color": [0, 224, 255]}, {"id": 114, "name": "screen_or_television", "isthing": 1, "color": [112, 224, 255]}, {"id": 115, "name": "computer", "isthing": 1, "color": [70, 184, 160]}, {"id": 116, "name": "printer", "isthing": 1, "color": [163, 0, 255]}, {"id": 117, "name": "Mobile_phone", "isthing": 1, "color": [153, 0, 255]}, {"id": 118, "name": "keyboard", "isthing": 1, "color": [71, 255, 0]}, {"id": 119, "name": "other_electronic_product", "isthing": 0, "color": [255, 0, 163]}, {"id": 120, "name": "fruit", "isthing": 0, "color": [255, 204, 0]}, {"id": 121, 
"name": "food", "isthing": 0, "color": [255, 0, 143]}, {"id": 122, "name": "instrument", "isthing": 1, "color": [0, 255, 235]}, {"id": 123, "name": "train", "isthing": 1, "color": [133, 255, 0]} ] CLASSES_THING = [ {'id': 2, 'name': 'door', 'isthing': 1, 'color': [6, 230, 230]}, {'id': 4, 'name': 'ladder', 'isthing': 1, 'color': [4, 200, 3]}, {'id': 8, 'name': 'window', 'isthing': 1, 'color': [230, 230, 230]}, {'id': 10, 'name': 'goal', 'isthing': 1, 'color': [224, 5, 255]}, {'id': 41, 'name': 'sculpture', 'isthing': 1, 'color': [0, 255, 20]}, {'id': 43, 'name': 'flag', 'isthing': 1, 'color': [255, 5, 153]}, {'id': 44, 'name': 'parasol_or_umbrella', 'isthing': 1, 'color': [6, 51, 255]}, {'id': 46, 'name': 'tent', 'isthing': 1, 'color': [160, 150, 20]}, {'id': 47, 'name': 'roadblock', 'isthing': 1, 'color': [0, 163, 255]}, {'id': 48, 'name': 'car', 'isthing': 1, 'color': [140, 140, 140]}, {'id': 49, 'name': 'bus', 'isthing': 1, 'color': [250, 10, 15]}, {'id': 50, 'name': 'truck', 'isthing': 1, 'color': [20, 255, 0]}, {'id': 51, 'name': 'bicycle', 'isthing': 1, 'color': [31, 255, 0]}, {'id': 52, 'name': 'motorcycle', 'isthing': 1, 'color': [255, 31, 0]}, {'id': 54, 'name': 'ship_or_boat', 'isthing': 1, 'color': [153, 255, 0]}, {'id': 55, 'name': 'raft', 'isthing': 1, 'color': [0, 0, 255]}, {'id': 56, 'name': 'airplane', 'isthing': 1, 'color': [255, 71, 0]}, {'id': 60, 'name': 'person', 'isthing': 1, 'color': [11, 200, 200]}, {'id': 61, 'name': 'cat', 'isthing': 1, 'color': [255, 82, 0]}, {'id': 62, 'name': 'dog', 'isthing': 1, 'color': [0, 255, 245]}, {'id': 63, 'name': 'horse', 'isthing': 1, 'color': [0, 61, 255]}, {'id': 64, 'name': 'cattle', 'isthing': 1, 'color': [0, 255, 112]}, {'id': 65, 'name': 'other_animal', 'isthing': 1, 'color': [0, 255, 133]}, {'id': 72, 'name': 'skateboard', 'isthing': 1, 'color': [0, 82, 255]}, {'id': 74, 'name': 'ball', 'isthing': 1, 'color': [0, 255, 173]}, {'id': 76, 'name': 'box', 'isthing': 1, 'color': [173, 255, 0]}, {'id': 77, 
'name': 'traveling_case_or_trolley_case', 'isthing': 1, 'color': [0, 255, 153]}, {'id': 78, 'name': 'basket', 'isthing': 1, 'color': [255, 92, 0]}, {'id': 79, 'name': 'bag_or_package', 'isthing': 1, 'color': [255, 0, 255]}, {'id': 82, 'name': 'plate', 'isthing': 1, 'color': [255, 173, 0]}, {'id': 83, 'name': 'tub_or_bowl_or_pot', 'isthing': 1, 'color': [255, 0, 20]}, {'id': 84, 'name': 'bottle_or_cup', 'isthing': 1, 'color': [255, 184, 184]}, {'id': 85, 'name': 'barrel', 'isthing': 1, 'color': [0, 31, 255]}, {'id': 86, 'name': 'fishbowl', 'isthing': 1, 'color': [0, 255, 61]}, {'id': 87, 'name': 'bed', 'isthing': 1, 'color': [0, 71, 255]}, {'id': 88, 'name': 'pillow', 'isthing': 1, 'color': [255, 0, 204]}, {'id': 89, 'name': 'table_or_desk', 'isthing': 1, 'color': [0, 255, 194]}, {'id': 90, 'name': 'chair_or_seat', 'isthing': 1, 'color': [0, 255, 82]}, {'id': 91, 'name': 'bench', 'isthing': 1, 'color': [0, 10, 255]}, {'id': 92, 'name': 'sofa', 'isthing': 1, 'color': [0, 112, 255]}, {'id': 95, 'name': 'gun', 'isthing': 1, 'color': [0, 122, 255]}, {'id': 96, 'name': 'commode', 'isthing': 1, 'color': [0, 255, 163]}, {'id': 97, 'name': 'roaster', 'isthing': 1, 'color': [255, 153, 0]}, {'id': 99, 'name': 'refrigerator', 'isthing': 1, 'color': [255, 112, 0]}, {'id': 100, 'name': 'washing_machine', 'isthing': 1, 'color': [143, 255, 0]}, {'id': 101, 'name': 'Microwave_oven', 'isthing': 1, 'color': [82, 0, 255]}, {'id': 102, 'name': 'fan', 'isthing': 1, 'color': [163, 255, 0]}, {'id': 106, 'name': 'painting_or_poster', 'isthing': 1, 'color': [0, 255, 92]}, {'id': 107, 'name': 'mirror', 'isthing': 1, 'color': [184, 0, 255]}, {'id': 108, 'name': 'flower_pot_or_vase', 'isthing': 1, 'color': [255, 0, 31]}, {'id': 109, 'name': 'clock', 'isthing': 1, 'color': [0, 184, 255]}, {'id': 114, 'name': 'screen_or_television', 'isthing': 1, 'color': [112, 224, 255]}, {'id': 115, 'name': 'computer', 'isthing': 1, 'color': [70, 184, 160]}, {'id': 116, 'name': 'printer', 'isthing': 1, 
'color': [163, 0, 255]}, {'id': 117, 'name': 'Mobile_phone', 'isthing': 1, 'color': [153, 0, 255]}, {'id': 118, 'name': 'keyboard', 'isthing': 1, 'color': [71, 255, 0]}, {'id': 122, 'name': 'instrument', 'isthing': 1, 'color': [0, 255, 235]}, {'id': 123, 'name': 'train', 'isthing': 1, 'color': [133, 255, 0]} ] CLASSES_STUFF = [ {'id': 0, 'name': 'wall', 'isthing': 0, 'color': [120, 120, 120]}, {'id': 1, 'name': 'ceiling', 'isthing': 0, 'color': [180, 120, 120]}, {'id': 3, 'name': 'stair', 'isthing': 0, 'color': [80, 50, 50]}, {'id': 5, 'name': 'escalator', 'isthing': 0, 'color': [120, 120, 80]}, {'id': 6, 'name': 'Playground_slide', 'isthing': 0, 'color': [140, 140, 140]}, {'id': 7, 'name': 'handrail_or_fence', 'isthing': 0, 'color': [204, 5, 255]}, {'id': 9, 'name': 'rail', 'isthing': 0, 'color': [4, 250, 7]}, {'id': 11, 'name': 'pillar', 'isthing': 0, 'color': [235, 255, 7]}, {'id': 12, 'name': 'pole', 'isthing': 0, 'color': [150, 5, 61]}, {'id': 13, 'name': 'floor', 'isthing': 0, 'color': [120, 120, 70]}, {'id': 14, 'name': 'ground', 'isthing': 0, 'color': [8, 255, 51]}, {'id': 15, 'name': 'grass', 'isthing': 0, 'color': [255, 6, 82]}, {'id': 16, 'name': 'sand', 'isthing': 0, 'color': [143, 255, 140]}, {'id': 17, 'name': 'athletic_field', 'isthing': 0, 'color': [204, 255, 4]}, {'id': 18, 'name': 'road', 'isthing': 0, 'color': [255, 51, 7]}, {'id': 19, 'name': 'path', 'isthing': 0, 'color': [204, 70, 3]}, {'id': 20, 'name': 'crosswalk', 'isthing': 0, 'color': [0, 102, 200]}, {'id': 21, 'name': 'building', 'isthing': 0, 'color': [61, 230, 250]}, {'id': 22, 'name': 'house', 'isthing': 0, 'color': [255, 6, 51]}, {'id': 23, 'name': 'bridge', 'isthing': 0, 'color': [11, 102, 255]}, {'id': 24, 'name': 'tower', 'isthing': 0, 'color': [255, 7, 71]}, {'id': 25, 'name': 'windmill', 'isthing': 0, 'color': [255, 9, 224]}, {'id': 26, 'name': 'well_or_well_lid', 'isthing': 0, 'color': [9, 7, 230]}, {'id': 27, 'name': 'other_construction', 'isthing': 0, 'color': [220, 220, 
220]}, {'id': 28, 'name': 'sky', 'isthing': 0, 'color': [255, 9, 92]}, {'id': 29, 'name': 'mountain', 'isthing': 0, 'color': [112, 9, 255]}, {'id': 30, 'name': 'stone', 'isthing': 0, 'color': [8, 255, 214]}, {'id': 31, 'name': 'wood', 'isthing': 0, 'color': [7, 255, 224]}, {'id': 32, 'name': 'ice', 'isthing': 0, 'color': [255, 184, 6]}, {'id': 33, 'name': 'snowfield', 'isthing': 0, 'color': [10, 255, 71]}, {'id': 34, 'name': 'grandstand', 'isthing': 0, 'color': [255, 41, 10]}, {'id': 35, 'name': 'sea', 'isthing': 0, 'color': [7, 255, 255]}, {'id': 36, 'name': 'river', 'isthing': 0, 'color': [224, 255, 8]}, {'id': 37, 'name': 'lake', 'isthing': 0, 'color': [102, 8, 255]}, {'id': 38, 'name': 'waterfall', 'isthing': 0, 'color': [255, 61, 6]}, {'id': 39, 'name': 'water', 'isthing': 0, 'color': [255, 194, 7]}, {'id': 40, 'name': 'billboard_or_Bulletin_Board', 'isthing': 0, 'color': [255, 122, 8]}, {'id': 42, 'name': 'pipeline', 'isthing': 0, 'color': [255, 8, 41]}, {'id': 45, 'name': 'cushion_or_carpet', 'isthing': 0, 'color': [235, 12, 255]}, {'id': 53, 'name': 'wheeled_machine', 'isthing': 0, 'color': [255, 224, 0]}, {'id': 57, 'name': 'tyre', 'isthing': 0, 'color': [0, 235, 255]}, {'id': 58, 'name': 'traffic_light', 'isthing': 0, 'color': [0, 173, 255]}, {'id': 59, 'name': 'lamp', 'isthing': 0, 'color': [31, 0, 255]}, {'id': 66, 'name': 'tree', 'isthing': 0, 'color': [255, 0, 0]}, {'id': 67, 'name': 'flower', 'isthing': 0, 'color': [255, 163, 0]}, {'id': 68, 'name': 'other_plant', 'isthing': 0, 'color': [255, 102, 0]}, {'id': 69, 'name': 'toy', 'isthing': 0, 'color': [194, 255, 0]}, {'id': 70, 'name': 'ball_net', 'isthing': 0, 'color': [0, 143, 255]}, {'id': 71, 'name': 'backboard', 'isthing': 0, 'color': [51, 255, 0]}, {'id': 73, 'name': 'bat', 'isthing': 0, 'color': [0, 255, 41]}, {'id': 75, 'name': 'cupboard_or_showcase_or_storage_rack', 'isthing': 0, 'color': [10, 0, 255]}, {'id': 80, 'name': 'trash_can', 'isthing': 0, 'color': [255, 0, 245]}, {'id': 81, 'name': 
'cage', 'isthing': 0, 'color': [255, 0, 102]}, {'id': 93, 'name': 'shelf', 'isthing': 0, 'color': [51, 0, 255]}, {'id': 94, 'name': 'bathtub', 'isthing': 0, 'color': [0, 194, 255]}, {'id': 98, 'name': 'other_machine', 'isthing': 0, 'color': [0, 255, 10]}, {'id': 103, 'name': 'curtain', 'isthing': 0, 'color': [255, 235, 0]}, {'id': 104, 'name': 'textiles', 'isthing': 0, 'color': [8, 184, 170]}, {'id': 105, 'name': 'clothes', 'isthing': 0, 'color': [133, 0, 255]}, {'id': 110, 'name': 'book', 'isthing': 0, 'color': [0, 214, 255]}, {'id': 111, 'name': 'tool', 'isthing': 0, 'color': [255, 0, 112]}, {'id': 112, 'name': 'blackboard', 'isthing': 0, 'color': [92, 255, 0]}, {'id': 113, 'name': 'tissue', 'isthing': 0, 'color': [0, 224, 255]}, {'id': 119, 'name': 'other_electronic_product', 'isthing': 0, 'color': [255, 0, 163]}, {'id': 120, 'name': 'fruit', 'isthing': 0, 'color': [255, 204, 0]}, {'id': 121, 'name': 'food', 'isthing': 0, 'color': [255, 0, 143]} ] NO_OBJ = 0 NO_OBJ_HB = 255 DIVISOR_PAN = 100 DIVISOR_NEW = 1000 NUM_THING = 58 NUM_STUFF = 66 THING_B_STUFF = False def vip2hb(pan_map): assert not THING_B_STUFF, "VIPSeg only supports stuff -> thing" pan_new = - np.ones_like(pan_map) vip2hb_thing = {itm['id'] + 1: idx for idx, itm in enumerate(CLASSES_THING)} vip2hb_stuff = {itm['id'] + 1: idx for idx, itm in enumerate(CLASSES_STUFF)} for idx in np.unique(pan_map): if idx == NO_OBJ or idx == 200: pan_new[pan_map == idx] = NO_OBJ_HB * DIVISOR_NEW elif idx > 128: cls_id = idx // DIVISOR_PAN cls_new_id = vip2hb_thing[cls_id] inst_id = idx % DIVISOR_PAN # since stuff first -> thing the second cls_new_id += NUM_STUFF pan_new[pan_map == idx] = cls_new_id * DIVISOR_NEW + inst_id + 1 else: pan_new[pan_map == idx] = vip2hb_stuff[idx] * DIVISOR_NEW assert -1. 
not in np.unique(pan_new) return pan_new def parse_args(): parser = argparse.ArgumentParser(description='Evaluation of DSTQ') parser.add_argument('result_path') parser.add_argument('--gt-path', default='data/kitti-step') parser.add_argument('--split', default='val') parser.add_argument( '--depth', action='store_true', help='eval depth') parser.add_argument('--nproc', default=1, type=int, help='number of process') args = parser.parse_args() return args def updater(pred_ins_name, pred_cls_name, pred_dep_name, gt_pan_seq_name, gt_dep_seq_name, updater_obj, seq_id): pred_ins = mmcv.imread(pred_ins_name, flag='unchanged').astype(np.int32) pred_cls = mmcv.imread(pred_cls_name, flag='unchanged').astype(np.int32) pred_dep = mmcv.imread(pred_dep_name, flag='unchanged').astype(np.float32) if pred_dep_name is not None else None gt_pan = mmcv.imread(gt_pan_seq_name, flag='unchanged').astype(np.int64) gt_pan = vip2hb(gt_pan) gt_cls = gt_pan // DIVISOR_NEW gt_ins = gt_pan % DIVISOR_NEW gt_dep = mmcv.imread(gt_dep_seq_name, flag='unchanged').astype(np.float32) if gt_dep_seq_name is not None else None if pred_dep is not None: pred_dep = F.interpolate(torch.from_numpy(pred_dep)[None][None], size=gt_dep.shape)[0][0].numpy() valid_mask_seg = gt_cls != NO_OBJ_HB pred_masked_ps = pred_cls[valid_mask_seg] * (2 ** 16) + pred_ins[valid_mask_seg] gt_masked_ps = gt_cls[valid_mask_seg] * (2 ** 16) + gt_ins[valid_mask_seg] if pred_dep_name is not None: valid_mask_dep = gt_dep > 0. 
pred_masked_depth = pred_dep[valid_mask_dep] gt_masked_depth = gt_dep[valid_mask_dep] updater_obj.update_state(gt_masked_ps, pred_masked_ps, gt_masked_depth, pred_masked_depth, seq_id) else: updater_obj.update_state(gt_masked_ps, pred_masked_ps, seq_id) def eval_dstq(result_dir, gt_dir, with_depth=True): if with_depth: dstq_obj = DSTQuality( num_classes=len(CLASSES), things_list=list(range(66, 124)), ignore_label=NO_OBJ_HB, label_bit_shift=16, offset=2 ** 16 * 256, depth_threshold=(1.25,), ) else: dstq_obj = STQuality( num_classes=len(CLASSES), things_list=list(range(66, 124)), ignore_label=NO_OBJ_HB, label_bit_shift=16, offset=2 ** 16 * 256, ) ann_folders = mmcv.list_from_file(os.path.join(gt_dir, "{}.txt".format(split)), prefix=os.path.join(gt_dir, 'panomasks') + '/') seq_ids = np.arange(0, len(ann_folders)).tolist() for seq_id in seq_ids: gt_names = list(mmcv.scandir(ann_folders[seq_id])) gt_pan_names = sorted(list(filter(lambda x: '.png' in x, gt_names))) if with_depth: gt_dep_names = sorted(list(filter(lambda x: 'depth' in x, gt_names))) else: gt_dep_names = [None] * len(gt_pan_names) pred_name_panoptic = list(mmcv.scandir(os.path.join(result_dir, 'panoptic', str(seq_id)))) pred_ins_names = sorted(list(filter(lambda x: 'ins' in x, pred_name_panoptic))) pred_cls_names = sorted(list(filter(lambda x: 'cat' in x, pred_name_panoptic))) if len(gt_pan_names) != len(pred_ins_names): print("Error when seq_id is {}. 
But cal existing seqs.".format(seq_id)) break if with_depth: pred_name_depth = list(mmcv.scandir(os.path.join(result_dir, 'depth', str(seq_id)))) pred_dep_names = sorted(pred_name_depth) else: pred_dep_names = [None] * len(pred_ins_names) prog_bar = ProgressBar(len(pred_ins_names)) for pred_ins_name, pred_cls_name, pred_dep_name, gt_pan_seq_name, gt_dep_seq_name in zip( pred_ins_names, pred_cls_names, pred_dep_names, gt_pan_names, gt_dep_names ): prog_bar.update() updater( os.path.join(result_dir, 'panoptic', str(seq_id), pred_ins_name), os.path.join(result_dir, 'panoptic', str(seq_id), pred_cls_name), os.path.join(result_dir, 'depth', str(seq_id), pred_dep_name) if pred_dep_name is not None else None, os.path.join(ann_folders[seq_id], gt_pan_seq_name), os.path.join(ann_folders[seq_id], gt_dep_seq_name) if gt_dep_seq_name is not None else None, dstq_obj, seq_id ) result = dstq_obj.result() print(result) # usage python eval_dstq_vipseg.py /opt/data/results/test --gt-path /opt/data/VIPSeg if __name__ == '__main__': args = parse_args() result_path = args.result_path gt_path = args.gt_path split = args.split eval_dstq(result_path, gt_path, args.depth) ================================================ FILE: tools/eval_dvpq_step.py ================================================ import numpy as np from PIL import Image import six import os import multiprocessing as mp import argparse parser = argparse.ArgumentParser(description='') parser.add_argument('result_path') parser.add_argument('--eval_frames', type=int, default=1) parser.add_argument('--depth_thres', type=float, default=0) args = parser.parse_args() eval_frames = args.eval_frames pred_dir_all = os.path.join(args.result_path, 'panoptic') depth_dir_all = os.path.join(args.result_path, 'depth') gt_dir = 'data/kitti-step/video_sequence/val/' depth_thres = args.depth_thres def vpq_eval(element): pred_ids, gt_ids = element max_ins = 2 ** 16 ign_id = 255 offset = 2 ** 30 num_cat = 20 iou_per_class = np.zeros(num_cat, 
dtype=np.float64) tp_per_class = np.zeros(num_cat, dtype=np.float64) fn_per_class = np.zeros(num_cat, dtype=np.float64) fp_per_class = np.zeros(num_cat, dtype=np.float64) def _ids_to_counts(id_array): ids, counts = np.unique(id_array, return_counts=True) return dict(six.moves.zip(ids, counts)) pred_areas = _ids_to_counts(pred_ids) gt_areas = _ids_to_counts(gt_ids) void_id = ign_id * max_ins ign_ids = { gt_id for gt_id in six.iterkeys(gt_areas) if (gt_id // max_ins) == ign_id } int_ids = gt_ids.astype(np.int64) * offset + pred_ids.astype(np.int64) int_areas = _ids_to_counts(int_ids) def prediction_void_overlap(pred_id): void_int_id = void_id * offset + pred_id return int_areas.get(void_int_id, 0) def prediction_ignored_overlap(pred_id): total_ignored_overlap = 0 for _ign_id in ign_ids: int_id = _ign_id * offset + pred_id total_ignored_overlap += int_areas.get(int_id, 0) return total_ignored_overlap gt_matched = set() pred_matched = set() for int_id, int_area in six.iteritems(int_areas): gt_id = int(int_id // offset) gt_cat = int(gt_id // max_ins) pred_id = int(int_id % offset) pred_cat = int(pred_id // max_ins) if gt_cat != pred_cat: continue union = ( gt_areas[gt_id] + pred_areas[pred_id] - int_area - prediction_void_overlap(pred_id) ) iou = int_area / union if iou > 0.5: tp_per_class[gt_cat] += 1 iou_per_class[gt_cat] += iou gt_matched.add(gt_id) pred_matched.add(pred_id) for gt_id in six.iterkeys(gt_areas): if gt_id in gt_matched: continue cat_id = gt_id // max_ins if cat_id == ign_id: continue fn_per_class[cat_id] += 1 for pred_id in six.iterkeys(pred_areas): if pred_id in pred_matched: continue if (prediction_ignored_overlap(pred_id) / pred_areas[pred_id]) > 0.5: continue cat = pred_id // max_ins fp_per_class[cat] += 1 return (iou_per_class, tp_per_class, fn_per_class, fp_per_class) def eval(element): max_ins = 2 ** 16 pred_cat, pred_ins, gts, depth_preds, depth_gts = element pred_cat = [np.array(Image.open(image)) for image in pred_cat] pred_ins = 
[np.array(Image.open(image)) for image in pred_ins] pred_cat = np.concatenate(pred_cat, axis=1) pred_ins = np.concatenate(pred_ins, axis=1) pred = pred_cat.astype(np.int32) * max_ins + pred_ins.astype(np.int32) gts_pan = [np.array(Image.open(image)) for image in gts] gts = [gt_pan[..., 0].astype(np.int32) * max_ins + gt_pan[..., 1].astype(np.int32) * 256 + gt_pan[..., 2].astype(np.int32) for gt_pan in gts_pan] abs_rel = 0. if depth_thres > 0: depth_preds = [np.array(Image.open(name)) for name in depth_preds] depth_gts = [np.array(Image.open(name)) for name in depth_gts] depth_preds = np.concatenate(depth_preds, axis=1) depth_gts = np.concatenate(depth_gts, axis=1) depth_mask = depth_gts > 0 abs_rel = np.mean( np.abs( depth_preds[depth_mask] - depth_gts[depth_mask]) / depth_gts[depth_mask]) pred_in_mask = pred[:, :depth_preds.shape[1]] pred_in_depth_mask = pred_in_mask[depth_mask] ignored_pred_mask = ( np.abs( depth_preds[depth_mask] - depth_gts[depth_mask]) / depth_gts[depth_mask]) > depth_thres pred_in_depth_mask[ignored_pred_mask] = 19 * max_ins pred_in_mask[depth_mask] = pred_in_depth_mask pred[:, :depth_preds.shape[1]] = pred_in_mask gt = np.concatenate(gts, axis=1) result = vpq_eval([pred, gt]) return result + (abs_rel, ) def main(): gt_names_all = os.scandir(gt_dir) gt_names_all = [name.name for name in gt_names_all if 'panoptic' in name.name] gt_names_all = [os.path.join(gt_dir, name) for name in gt_names_all] gt_names_all = sorted(gt_names_all) if args.depth_thres > 0: depth_gt_names_all = os.scandir(gt_dir) depth_gt_names_all = [ name.name for name in depth_gt_names_all if 'depth' in name.name] depth_gt_names_all = [os.path.join(gt_dir, name) for name in depth_gt_names_all] depth_gt_names_all = sorted(depth_gt_names_all) iou_per_class_all = [] tp_per_class_all = [] fn_per_class_all = [] fp_per_class_all = [] things_index = np.zeros((19,)).astype(bool) things_index[11] = True things_index[13] = True for i in [2, 6, 7, 8, 10, 13, 14, 16, 18]: if 
args.depth_thres > 0: depth_dir = os.path.join(depth_dir_all, str(i)) depth_pred_names = os.scandir(depth_dir) depth_pred_names = [name.name for name in depth_pred_names] depth_pred_names = [os.path.join(depth_dir, name) for name in depth_pred_names] depth_pred_names = sorted(depth_pred_names) pred_dir = os.path.join(pred_dir_all, str(i)) pred_names = os.scandir(pred_dir) pred_names = [os.path.join(pred_dir, name.name) for name in pred_names] cat_pred_names = [name for name in pred_names if name.endswith('cat.png')] ins_pred_names = [name for name in pred_names if name.endswith('ins.png')] cat_pred_names = sorted(cat_pred_names) ins_pred_names = sorted(ins_pred_names) all_lst = [] gt_names = sorted(list(filter(lambda x: os.path.basename(x).startswith('{:06d}'.format(i)), gt_names_all))) if args.depth_thres > 0: depth_gt_names = sorted(list(filter(lambda x: os.path.basename(x).startswith('{:06d}'.format(i)), depth_gt_names_all))) for i in range(len(cat_pred_names) - eval_frames + 1): all_lst.append([cat_pred_names[i: i + eval_frames], ins_pred_names[i: i + eval_frames], gt_names[i: i + eval_frames], depth_pred_names[i: i + eval_frames] if args.depth_thres > 0 else None, depth_gt_names[i: i + eval_frames] if args.depth_thres > 0 else None ]) N = mp.cpu_count() // 2 with mp.Pool(processes=N) as p: results = p.map(eval, all_lst) iou_per_class = np.stack([result[0] for result in results]) iou_per_class_all.append(iou_per_class) tp_per_class = np.stack([result[1] for result in results]) tp_per_class_all.append(tp_per_class) fn_per_class = np.stack([result[2] for result in results]) fn_per_class_all.append(fn_per_class) fp_per_class = np.stack([result[3] for result in results]) fp_per_class_all.append(fp_per_class) # abs_rel = np.stack([result[4] for result in results]).mean(axis=0) epsilon = 1e-10 iou_per_class = iou_per_class.sum(axis=0)[:19] tp_per_class = tp_per_class.sum(axis=0)[:19] fn_per_class = fn_per_class.sum(axis=0)[:19] fp_per_class = 
fp_per_class.sum(axis=0)[:19] sq = iou_per_class / (tp_per_class + epsilon) rq = tp_per_class / (tp_per_class + 0.5 * fn_per_class + 0.5 * fp_per_class + epsilon) pq = sq * rq spq = pq[np.logical_not(things_index)] tpq = pq[things_index] print( r'{:.1f} {:.1f} {:.1f}'.format( pq.mean() * 100, tpq.mean() * 100, spq.mean() * 100)) print("----------------final-----------------") iou_per_class_all = np.concatenate(iou_per_class_all, axis=0).sum(axis=0)[:19] tp_per_class_all = np.concatenate(tp_per_class_all, axis=0).sum(axis=0)[:19] fn_per_class_all = np.concatenate(fn_per_class_all, axis=0).sum(axis=0)[:19] fp_per_class_all = np.concatenate(fp_per_class_all, axis=0).sum(axis=0)[:19] sq = iou_per_class_all / (tp_per_class_all + epsilon) rq = tp_per_class_all / (tp_per_class_all + 0.5 * fn_per_class_all + 0.5 * fp_per_class_all + epsilon) pq = sq * rq spq = pq[np.logical_not(things_index)] tpq = pq[things_index] print( r'{:.1f} {:.1f} {:.1f}'.format( pq.mean() * 100, tpq.mean() * 100, spq.mean() * 100)) if __name__ == '__main__': main() ================================================ FILE: tools/eval_dvpq_vipseg.py ================================================ import argparse import os import mmcv import numpy as np import six import multiprocessing as mp CLASSES = [ {"id": 0, "name": "wall", "isthing": 0, "color": [120, 120, 120]}, {"id": 1, "name": "ceiling", "isthing": 0, "color": [180, 120, 120]}, {"id": 2, "name": "door", "isthing": 1, "color": [6, 230, 230]}, {"id": 3, "name": "stair", "isthing": 0, "color": [80, 50, 50]}, {"id": 4, "name": "ladder", "isthing": 1, "color": [4, 200, 3]}, {"id": 5, "name": "escalator", "isthing": 0, "color": [120, 120, 80]}, {"id": 6, "name": "Playground_slide", "isthing": 0, "color": [140, 140, 140]}, {"id": 7, "name": "handrail_or_fence", "isthing": 0, "color": [204, 5, 255]}, {"id": 8, "name": "window", "isthing": 1, "color": [230, 230, 230]}, {"id": 9, "name": "rail", "isthing": 0, "color": [4, 250, 7]}, {"id": 10, "name": 
"goal", "isthing": 1, "color": [224, 5, 255]}, {"id": 11, "name": "pillar", "isthing": 0, "color": [235, 255, 7]}, {"id": 12, "name": "pole", "isthing": 0, "color": [150, 5, 61]}, {"id": 13, "name": "floor", "isthing": 0, "color": [120, 120, 70]}, {"id": 14, "name": "ground", "isthing": 0, "color": [8, 255, 51]}, {"id": 15, "name": "grass", "isthing": 0, "color": [255, 6, 82]}, {"id": 16, "name": "sand", "isthing": 0, "color": [143, 255, 140]}, {"id": 17, "name": "athletic_field", "isthing": 0, "color": [204, 255, 4]}, {"id": 18, "name": "road", "isthing": 0, "color": [255, 51, 7]}, {"id": 19, "name": "path", "isthing": 0, "color": [204, 70, 3]}, {"id": 20, "name": "crosswalk", "isthing": 0, "color": [0, 102, 200]}, {"id": 21, "name": "building", "isthing": 0, "color": [61, 230, 250]}, {"id": 22, "name": "house", "isthing": 0, "color": [255, 6, 51]}, {"id": 23, "name": "bridge", "isthing": 0, "color": [11, 102, 255]}, {"id": 24, "name": "tower", "isthing": 0, "color": [255, 7, 71]}, {"id": 25, "name": "windmill", "isthing": 0, "color": [255, 9, 224]}, {"id": 26, "name": "well_or_well_lid", "isthing": 0, "color": [9, 7, 230]}, {"id": 27, "name": "other_construction", "isthing": 0, "color": [220, 220, 220]}, {"id": 28, "name": "sky", "isthing": 0, "color": [255, 9, 92]}, {"id": 29, "name": "mountain", "isthing": 0, "color": [112, 9, 255]}, {"id": 30, "name": "stone", "isthing": 0, "color": [8, 255, 214]}, {"id": 31, "name": "wood", "isthing": 0, "color": [7, 255, 224]}, {"id": 32, "name": "ice", "isthing": 0, "color": [255, 184, 6]}, {"id": 33, "name": "snowfield", "isthing": 0, "color": [10, 255, 71]}, {"id": 34, "name": "grandstand", "isthing": 0, "color": [255, 41, 10]}, {"id": 35, "name": "sea", "isthing": 0, "color": [7, 255, 255]}, {"id": 36, "name": "river", "isthing": 0, "color": [224, 255, 8]}, {"id": 37, "name": "lake", "isthing": 0, "color": [102, 8, 255]}, {"id": 38, "name": "waterfall", "isthing": 0, "color": [255, 61, 6]}, {"id": 39, "name": "water", 
"isthing": 0, "color": [255, 194, 7]}, {"id": 40, "name": "billboard_or_Bulletin_Board", "isthing": 0, "color": [255, 122, 8]}, {"id": 41, "name": "sculpture", "isthing": 1, "color": [0, 255, 20]}, {"id": 42, "name": "pipeline", "isthing": 0, "color": [255, 8, 41]}, {"id": 43, "name": "flag", "isthing": 1, "color": [255, 5, 153]}, {"id": 44, "name": "parasol_or_umbrella", "isthing": 1, "color": [6, 51, 255]}, {"id": 45, "name": "cushion_or_carpet", "isthing": 0, "color": [235, 12, 255]}, {"id": 46, "name": "tent", "isthing": 1, "color": [160, 150, 20]}, {"id": 47, "name": "roadblock", "isthing": 1, "color": [0, 163, 255]}, {"id": 48, "name": "car", "isthing": 1, "color": [140, 140, 140]}, {"id": 49, "name": "bus", "isthing": 1, "color": [250, 10, 15]}, {"id": 50, "name": "truck", "isthing": 1, "color": [20, 255, 0]}, {"id": 51, "name": "bicycle", "isthing": 1, "color": [31, 255, 0]}, {"id": 52, "name": "motorcycle", "isthing": 1, "color": [255, 31, 0]}, {"id": 53, "name": "wheeled_machine", "isthing": 0, "color": [255, 224, 0]}, {"id": 54, "name": "ship_or_boat", "isthing": 1, "color": [153, 255, 0]}, {"id": 55, "name": "raft", "isthing": 1, "color": [0, 0, 255]}, {"id": 56, "name": "airplane", "isthing": 1, "color": [255, 71, 0]}, {"id": 57, "name": "tyre", "isthing": 0, "color": [0, 235, 255]}, {"id": 58, "name": "traffic_light", "isthing": 0, "color": [0, 173, 255]}, {"id": 59, "name": "lamp", "isthing": 0, "color": [31, 0, 255]}, {"id": 60, "name": "person", "isthing": 1, "color": [11, 200, 200]}, {"id": 61, "name": "cat", "isthing": 1, "color": [255, 82, 0]}, {"id": 62, "name": "dog", "isthing": 1, "color": [0, 255, 245]}, {"id": 63, "name": "horse", "isthing": 1, "color": [0, 61, 255]}, {"id": 64, "name": "cattle", "isthing": 1, "color": [0, 255, 112]}, {"id": 65, "name": "other_animal", "isthing": 1, "color": [0, 255, 133]}, {"id": 66, "name": "tree", "isthing": 0, "color": [255, 0, 0]}, {"id": 67, "name": "flower", "isthing": 0, "color": [255, 163, 0]}, 
{"id": 68, "name": "other_plant", "isthing": 0, "color": [255, 102, 0]}, {"id": 69, "name": "toy", "isthing": 0, "color": [194, 255, 0]}, {"id": 70, "name": "ball_net", "isthing": 0, "color": [0, 143, 255]}, {"id": 71, "name": "backboard", "isthing": 0, "color": [51, 255, 0]}, {"id": 72, "name": "skateboard", "isthing": 1, "color": [0, 82, 255]}, {"id": 73, "name": "bat", "isthing": 0, "color": [0, 255, 41]}, {"id": 74, "name": "ball", "isthing": 1, "color": [0, 255, 173]}, {"id": 75, "name": "cupboard_or_showcase_or_storage_rack", "isthing": 0, "color": [10, 0, 255]}, {"id": 76, "name": "box", "isthing": 1, "color": [173, 255, 0]}, {"id": 77, "name": "traveling_case_or_trolley_case", "isthing": 1, "color": [0, 255, 153]}, {"id": 78, "name": "basket", "isthing": 1, "color": [255, 92, 0]}, {"id": 79, "name": "bag_or_package", "isthing": 1, "color": [255, 0, 255]}, {"id": 80, "name": "trash_can", "isthing": 0, "color": [255, 0, 245]}, {"id": 81, "name": "cage", "isthing": 0, "color": [255, 0, 102]}, {"id": 82, "name": "plate", "isthing": 1, "color": [255, 173, 0]}, {"id": 83, "name": "tub_or_bowl_or_pot", "isthing": 1, "color": [255, 0, 20]}, {"id": 84, "name": "bottle_or_cup", "isthing": 1, "color": [255, 184, 184]}, {"id": 85, "name": "barrel", "isthing": 1, "color": [0, 31, 255]}, {"id": 86, "name": "fishbowl", "isthing": 1, "color": [0, 255, 61]}, {"id": 87, "name": "bed", "isthing": 1, "color": [0, 71, 255]}, {"id": 88, "name": "pillow", "isthing": 1, "color": [255, 0, 204]}, {"id": 89, "name": "table_or_desk", "isthing": 1, "color": [0, 255, 194]}, {"id": 90, "name": "chair_or_seat", "isthing": 1, "color": [0, 255, 82]}, {"id": 91, "name": "bench", "isthing": 1, "color": [0, 10, 255]}, {"id": 92, "name": "sofa", "isthing": 1, "color": [0, 112, 255]}, {"id": 93, "name": "shelf", "isthing": 0, "color": [51, 0, 255]}, {"id": 94, "name": "bathtub", "isthing": 0, "color": [0, 194, 255]}, {"id": 95, "name": "gun", "isthing": 1, "color": [0, 122, 255]}, {"id": 96, 
"name": "commode", "isthing": 1, "color": [0, 255, 163]}, {"id": 97, "name": "roaster", "isthing": 1, "color": [255, 153, 0]}, {"id": 98, "name": "other_machine", "isthing": 0, "color": [0, 255, 10]}, {"id": 99, "name": "refrigerator", "isthing": 1, "color": [255, 112, 0]}, {"id": 100, "name": "washing_machine", "isthing": 1, "color": [143, 255, 0]}, {"id": 101, "name": "Microwave_oven", "isthing": 1, "color": [82, 0, 255]}, {"id": 102, "name": "fan", "isthing": 1, "color": [163, 255, 0]}, {"id": 103, "name": "curtain", "isthing": 0, "color": [255, 235, 0]}, {"id": 104, "name": "textiles", "isthing": 0, "color": [8, 184, 170]}, {"id": 105, "name": "clothes", "isthing": 0, "color": [133, 0, 255]}, {"id": 106, "name": "painting_or_poster", "isthing": 1, "color": [0, 255, 92]}, {"id": 107, "name": "mirror", "isthing": 1, "color": [184, 0, 255]}, {"id": 108, "name": "flower_pot_or_vase", "isthing": 1, "color": [255, 0, 31]}, {"id": 109, "name": "clock", "isthing": 1, "color": [0, 184, 255]}, {"id": 110, "name": "book", "isthing": 0, "color": [0, 214, 255]}, {"id": 111, "name": "tool", "isthing": 0, "color": [255, 0, 112]}, {"id": 112, "name": "blackboard", "isthing": 0, "color": [92, 255, 0]}, {"id": 113, "name": "tissue", "isthing": 0, "color": [0, 224, 255]}, {"id": 114, "name": "screen_or_television", "isthing": 1, "color": [112, 224, 255]}, {"id": 115, "name": "computer", "isthing": 1, "color": [70, 184, 160]}, {"id": 116, "name": "printer", "isthing": 1, "color": [163, 0, 255]}, {"id": 117, "name": "Mobile_phone", "isthing": 1, "color": [153, 0, 255]}, {"id": 118, "name": "keyboard", "isthing": 1, "color": [71, 255, 0]}, {"id": 119, "name": "other_electronic_product", "isthing": 0, "color": [255, 0, 163]}, {"id": 120, "name": "fruit", "isthing": 0, "color": [255, 204, 0]}, {"id": 121, "name": "food", "isthing": 0, "color": [255, 0, 143]}, {"id": 122, "name": "instrument", "isthing": 1, "color": [0, 255, 235]}, {"id": 123, "name": "train", "isthing": 1, "color": 
[133, 255, 0]} ] CLASSES_THING = [ {'id': 2, 'name': 'door', 'isthing': 1, 'color': [6, 230, 230]}, {'id': 4, 'name': 'ladder', 'isthing': 1, 'color': [4, 200, 3]}, {'id': 8, 'name': 'window', 'isthing': 1, 'color': [230, 230, 230]}, {'id': 10, 'name': 'goal', 'isthing': 1, 'color': [224, 5, 255]}, {'id': 41, 'name': 'sculpture', 'isthing': 1, 'color': [0, 255, 20]}, {'id': 43, 'name': 'flag', 'isthing': 1, 'color': [255, 5, 153]}, {'id': 44, 'name': 'parasol_or_umbrella', 'isthing': 1, 'color': [6, 51, 255]}, {'id': 46, 'name': 'tent', 'isthing': 1, 'color': [160, 150, 20]}, {'id': 47, 'name': 'roadblock', 'isthing': 1, 'color': [0, 163, 255]}, {'id': 48, 'name': 'car', 'isthing': 1, 'color': [140, 140, 140]}, {'id': 49, 'name': 'bus', 'isthing': 1, 'color': [250, 10, 15]}, {'id': 50, 'name': 'truck', 'isthing': 1, 'color': [20, 255, 0]}, {'id': 51, 'name': 'bicycle', 'isthing': 1, 'color': [31, 255, 0]}, {'id': 52, 'name': 'motorcycle', 'isthing': 1, 'color': [255, 31, 0]}, {'id': 54, 'name': 'ship_or_boat', 'isthing': 1, 'color': [153, 255, 0]}, {'id': 55, 'name': 'raft', 'isthing': 1, 'color': [0, 0, 255]}, {'id': 56, 'name': 'airplane', 'isthing': 1, 'color': [255, 71, 0]}, {'id': 60, 'name': 'person', 'isthing': 1, 'color': [11, 200, 200]}, {'id': 61, 'name': 'cat', 'isthing': 1, 'color': [255, 82, 0]}, {'id': 62, 'name': 'dog', 'isthing': 1, 'color': [0, 255, 245]}, {'id': 63, 'name': 'horse', 'isthing': 1, 'color': [0, 61, 255]}, {'id': 64, 'name': 'cattle', 'isthing': 1, 'color': [0, 255, 112]}, {'id': 65, 'name': 'other_animal', 'isthing': 1, 'color': [0, 255, 133]}, {'id': 72, 'name': 'skateboard', 'isthing': 1, 'color': [0, 82, 255]}, {'id': 74, 'name': 'ball', 'isthing': 1, 'color': [0, 255, 173]}, {'id': 76, 'name': 'box', 'isthing': 1, 'color': [173, 255, 0]}, {'id': 77, 'name': 'traveling_case_or_trolley_case', 'isthing': 1, 'color': [0, 255, 153]}, {'id': 78, 'name': 'basket', 'isthing': 1, 'color': [255, 92, 0]}, {'id': 79, 'name': 
'bag_or_package', 'isthing': 1, 'color': [255, 0, 255]}, {'id': 82, 'name': 'plate', 'isthing': 1, 'color': [255, 173, 0]}, {'id': 83, 'name': 'tub_or_bowl_or_pot', 'isthing': 1, 'color': [255, 0, 20]}, {'id': 84, 'name': 'bottle_or_cup', 'isthing': 1, 'color': [255, 184, 184]}, {'id': 85, 'name': 'barrel', 'isthing': 1, 'color': [0, 31, 255]}, {'id': 86, 'name': 'fishbowl', 'isthing': 1, 'color': [0, 255, 61]}, {'id': 87, 'name': 'bed', 'isthing': 1, 'color': [0, 71, 255]}, {'id': 88, 'name': 'pillow', 'isthing': 1, 'color': [255, 0, 204]}, {'id': 89, 'name': 'table_or_desk', 'isthing': 1, 'color': [0, 255, 194]}, {'id': 90, 'name': 'chair_or_seat', 'isthing': 1, 'color': [0, 255, 82]}, {'id': 91, 'name': 'bench', 'isthing': 1, 'color': [0, 10, 255]}, {'id': 92, 'name': 'sofa', 'isthing': 1, 'color': [0, 112, 255]}, {'id': 95, 'name': 'gun', 'isthing': 1, 'color': [0, 122, 255]}, {'id': 96, 'name': 'commode', 'isthing': 1, 'color': [0, 255, 163]}, {'id': 97, 'name': 'roaster', 'isthing': 1, 'color': [255, 153, 0]}, {'id': 99, 'name': 'refrigerator', 'isthing': 1, 'color': [255, 112, 0]}, {'id': 100, 'name': 'washing_machine', 'isthing': 1, 'color': [143, 255, 0]}, {'id': 101, 'name': 'Microwave_oven', 'isthing': 1, 'color': [82, 0, 255]}, {'id': 102, 'name': 'fan', 'isthing': 1, 'color': [163, 255, 0]}, {'id': 106, 'name': 'painting_or_poster', 'isthing': 1, 'color': [0, 255, 92]}, {'id': 107, 'name': 'mirror', 'isthing': 1, 'color': [184, 0, 255]}, {'id': 108, 'name': 'flower_pot_or_vase', 'isthing': 1, 'color': [255, 0, 31]}, {'id': 109, 'name': 'clock', 'isthing': 1, 'color': [0, 184, 255]}, {'id': 114, 'name': 'screen_or_television', 'isthing': 1, 'color': [112, 224, 255]}, {'id': 115, 'name': 'computer', 'isthing': 1, 'color': [70, 184, 160]}, {'id': 116, 'name': 'printer', 'isthing': 1, 'color': [163, 0, 255]}, {'id': 117, 'name': 'Mobile_phone', 'isthing': 1, 'color': [153, 0, 255]}, {'id': 118, 'name': 'keyboard', 'isthing': 1, 'color': [71, 255, 0]}, 
{'id': 122, 'name': 'instrument', 'isthing': 1, 'color': [0, 255, 235]}, {'id': 123, 'name': 'train', 'isthing': 1, 'color': [133, 255, 0]} ] CLASSES_STUFF = [ {'id': 0, 'name': 'wall', 'isthing': 0, 'color': [120, 120, 120]}, {'id': 1, 'name': 'ceiling', 'isthing': 0, 'color': [180, 120, 120]}, {'id': 3, 'name': 'stair', 'isthing': 0, 'color': [80, 50, 50]}, {'id': 5, 'name': 'escalator', 'isthing': 0, 'color': [120, 120, 80]}, {'id': 6, 'name': 'Playground_slide', 'isthing': 0, 'color': [140, 140, 140]}, {'id': 7, 'name': 'handrail_or_fence', 'isthing': 0, 'color': [204, 5, 255]}, {'id': 9, 'name': 'rail', 'isthing': 0, 'color': [4, 250, 7]}, {'id': 11, 'name': 'pillar', 'isthing': 0, 'color': [235, 255, 7]}, {'id': 12, 'name': 'pole', 'isthing': 0, 'color': [150, 5, 61]}, {'id': 13, 'name': 'floor', 'isthing': 0, 'color': [120, 120, 70]}, {'id': 14, 'name': 'ground', 'isthing': 0, 'color': [8, 255, 51]}, {'id': 15, 'name': 'grass', 'isthing': 0, 'color': [255, 6, 82]}, {'id': 16, 'name': 'sand', 'isthing': 0, 'color': [143, 255, 140]}, {'id': 17, 'name': 'athletic_field', 'isthing': 0, 'color': [204, 255, 4]}, {'id': 18, 'name': 'road', 'isthing': 0, 'color': [255, 51, 7]}, {'id': 19, 'name': 'path', 'isthing': 0, 'color': [204, 70, 3]}, {'id': 20, 'name': 'crosswalk', 'isthing': 0, 'color': [0, 102, 200]}, {'id': 21, 'name': 'building', 'isthing': 0, 'color': [61, 230, 250]}, {'id': 22, 'name': 'house', 'isthing': 0, 'color': [255, 6, 51]}, {'id': 23, 'name': 'bridge', 'isthing': 0, 'color': [11, 102, 255]}, {'id': 24, 'name': 'tower', 'isthing': 0, 'color': [255, 7, 71]}, {'id': 25, 'name': 'windmill', 'isthing': 0, 'color': [255, 9, 224]}, {'id': 26, 'name': 'well_or_well_lid', 'isthing': 0, 'color': [9, 7, 230]}, {'id': 27, 'name': 'other_construction', 'isthing': 0, 'color': [220, 220, 220]}, {'id': 28, 'name': 'sky', 'isthing': 0, 'color': [255, 9, 92]}, {'id': 29, 'name': 'mountain', 'isthing': 0, 'color': [112, 9, 255]}, {'id': 30, 'name': 'stone', 
'isthing': 0, 'color': [8, 255, 214]}, {'id': 31, 'name': 'wood', 'isthing': 0, 'color': [7, 255, 224]}, {'id': 32, 'name': 'ice', 'isthing': 0, 'color': [255, 184, 6]}, {'id': 33, 'name': 'snowfield', 'isthing': 0, 'color': [10, 255, 71]}, {'id': 34, 'name': 'grandstand', 'isthing': 0, 'color': [255, 41, 10]}, {'id': 35, 'name': 'sea', 'isthing': 0, 'color': [7, 255, 255]}, {'id': 36, 'name': 'river', 'isthing': 0, 'color': [224, 255, 8]}, {'id': 37, 'name': 'lake', 'isthing': 0, 'color': [102, 8, 255]}, {'id': 38, 'name': 'waterfall', 'isthing': 0, 'color': [255, 61, 6]}, {'id': 39, 'name': 'water', 'isthing': 0, 'color': [255, 194, 7]}, {'id': 40, 'name': 'billboard_or_Bulletin_Board', 'isthing': 0, 'color': [255, 122, 8]}, {'id': 42, 'name': 'pipeline', 'isthing': 0, 'color': [255, 8, 41]}, {'id': 45, 'name': 'cushion_or_carpet', 'isthing': 0, 'color': [235, 12, 255]}, {'id': 53, 'name': 'wheeled_machine', 'isthing': 0, 'color': [255, 224, 0]}, {'id': 57, 'name': 'tyre', 'isthing': 0, 'color': [0, 235, 255]}, {'id': 58, 'name': 'traffic_light', 'isthing': 0, 'color': [0, 173, 255]}, {'id': 59, 'name': 'lamp', 'isthing': 0, 'color': [31, 0, 255]}, {'id': 66, 'name': 'tree', 'isthing': 0, 'color': [255, 0, 0]}, {'id': 67, 'name': 'flower', 'isthing': 0, 'color': [255, 163, 0]}, {'id': 68, 'name': 'other_plant', 'isthing': 0, 'color': [255, 102, 0]}, {'id': 69, 'name': 'toy', 'isthing': 0, 'color': [194, 255, 0]}, {'id': 70, 'name': 'ball_net', 'isthing': 0, 'color': [0, 143, 255]}, {'id': 71, 'name': 'backboard', 'isthing': 0, 'color': [51, 255, 0]}, {'id': 73, 'name': 'bat', 'isthing': 0, 'color': [0, 255, 41]}, {'id': 75, 'name': 'cupboard_or_showcase_or_storage_rack', 'isthing': 0, 'color': [10, 0, 255]}, {'id': 80, 'name': 'trash_can', 'isthing': 0, 'color': [255, 0, 245]}, {'id': 81, 'name': 'cage', 'isthing': 0, 'color': [255, 0, 102]}, {'id': 93, 'name': 'shelf', 'isthing': 0, 'color': [51, 0, 255]}, {'id': 94, 'name': 'bathtub', 'isthing': 0, 'color': [0, 
# Raw VIPSeg id that marks unlabeled pixels, and the class id used for such
# pixels after conversion.
NO_OBJ = 0
NO_OBJ_HB = 255
# Raw thing ids are ``class * DIVISOR_PAN + instance``; converted ids are
# ``class * DIVISOR_NEW + instance``.
DIVISOR_PAN = 100
DIVISOR_NEW = 1000
NUM_THING = 58
NUM_STUFF = 66
# False: stuff classes take the low converted class ids and things follow.
THING_B_STUFF = False


def vip2hb(pan_map):
    """Re-encode a raw VIPSeg panoptic map into the evaluation id space.

    Stuff classes are renumbered by their position in ``CLASSES_STUFF`` and
    thing classes by their position in ``CLASSES_THING`` shifted by
    ``NUM_STUFF``. Every output pixel holds
    ``new_class * DIVISOR_NEW + instance``, where the instance part is 0 for
    stuff and ``raw_instance + 1`` for things. Pixels equal to ``NO_OBJ`` or
    to the literal 200 become ``NO_OBJ_HB * DIVISOR_NEW``.

    Args:
        pan_map: integer array of raw panoptic ids.

    Returns:
        An ``int64`` array with the same shape as ``pan_map``.

    Raises:
        KeyError: if an id does not belong to any known class.
    """
    assert not THING_B_STUFF, "VIPSeg only supports stuff -> thing"
    # Work in int64 regardless of the input dtype: on the uint8/uint16 arrays
    # image readers return, the -1 sentinel would wrap around and
    # ``NO_OBJ_HB * DIVISOR_NEW`` would not fit.
    pan_map = np.asarray(pan_map).astype(np.int64)
    pan_new = np.full_like(pan_map, -1)

    # Raw ids are the class ``id`` plus one (0 is reserved for void).
    thing_index = {itm['id'] + 1: pos for pos, itm in enumerate(CLASSES_THING)}
    stuff_index = {itm['id'] + 1: pos for pos, itm in enumerate(CLASSES_STUFF)}

    for raw_id in np.unique(pan_map):
        raw_id = int(raw_id)
        mask = pan_map == raw_id
        if raw_id == NO_OBJ or raw_id == 200:
            # NOTE(review): 200 is treated as void exactly like NO_OBJ;
            # presumably a reserved value in the annotations - confirm.
            pan_new[mask] = NO_OBJ_HB * DIVISOR_NEW
        elif raw_id > 128:
            # Thing segment: split into class and instance parts.
            inst_id = raw_id % DIVISOR_PAN
            # Things are numbered after all stuff classes.
            cls_new_id = thing_index[raw_id // DIVISOR_PAN] + NUM_STUFF
            pan_new[mask] = cls_new_id * DIVISOR_NEW + inst_id + 1
        else:
            pan_new[mask] = stuff_index[raw_id] * DIVISOR_NEW

    # Every pixel must have been assigned a converted id.
    assert -1 not in np.unique(pan_new)
    return pan_new


def parse_args():
    """Parse the command-line arguments of the evaluation script.

    Returns:
        The ``argparse.Namespace`` with ``result_path``, ``gt_path``,
        ``split``, ``depth`` and ``nproc``.
    """
    parser = argparse.ArgumentParser(description='Evaluation of DSTQ')
    parser.add_argument('result_path')
    parser.add_argument('--gt-path', default='data/kitti-step')
    parser.add_argument('--split', default='val')
    parser.add_argument('--depth', action='store_true', help='eval depth')
    parser.add_argument('--nproc', default=32, type=int,
                        help='number of process')
    return parser.parse_args()


def vpq_eval(element):
    """Accumulate panoptic-quality statistics for one prediction/GT pair.

    Both arrays encode every pixel as ``category * 2**16 + instance``.
    Category 255 in the ground truth is the ignore label.

    Args:
        element: a two-item sequence ``(pred_ids, gt_ids)`` of equally shaped
            integer arrays.

    Returns:
        A tuple ``(iou_per_class, tp_per_class, fn_per_class, fp_per_class)``
        of float64 arrays with ``NUM_THING + NUM_STUFF + 1`` entries each.
    """
    pred_ids, gt_ids = element
    max_ins = 2 ** 16
    ign_id = 255
    offset = 2 ** 30
    num_cat = NUM_THING + NUM_STUFF + 1

    iou_per_class, tp_per_class, fn_per_class, fp_per_class = (
        np.zeros(num_cat, dtype=np.float64) for _ in range(4))

    def _areas(id_array):
        # Map every distinct id to its pixel count.
        ids, counts = np.unique(id_array, return_counts=True)
        return dict(zip(ids, counts))

    pred_areas = _areas(pred_ids)
    gt_areas = _areas(gt_ids)

    void_id = ign_id * max_ins
    ignored_gt_ids = {
        gt_id for gt_id in gt_areas if (gt_id // max_ins) == ign_id
    }

    # One combined id per pixel identifies the (gt segment, pred segment)
    # pair, so counting combined ids yields all intersection areas at once.
    int_areas = _areas(
        gt_ids.astype(np.int64) * offset + pred_ids.astype(np.int64))

    def _void_overlap(pred_id):
        # Pixels of the prediction that fall on the void segment.
        return int_areas.get(void_id * offset + pred_id, 0)

    def _ignored_overlap(pred_id):
        # Pixels of the prediction that fall on any ignore-category segment.
        return sum(int_areas.get(ign * offset + pred_id, 0)
                   for ign in ignored_gt_ids)

    gt_matched = set()
    pred_matched = set()

    for int_id, int_area in int_areas.items():
        gt_id = int(int_id // offset)
        pred_id = int(int_id % offset)
        gt_cat = int(gt_id // max_ins)
        pred_cat = int(pred_id // max_ins)
        if gt_cat != pred_cat:
            continue
        # Void pixels under the prediction do not count against it.
        union = (gt_areas[gt_id] + pred_areas[pred_id] - int_area
                 - _void_overlap(pred_id))
        iou = int_area / union
        if iou > 0.5:
            tp_per_class[gt_cat] += 1
            iou_per_class[gt_cat] += iou
            gt_matched.add(gt_id)
            pred_matched.add(pred_id)

    # Unmatched ground-truth segments (other than ignore) are false negatives.
    for gt_id in gt_areas:
        if gt_id in gt_matched:
            continue
        cat_id = gt_id // max_ins
        if cat_id == ign_id:
            continue
        fn_per_class[cat_id] += 1

    # Unmatched predictions are false positives unless they lie mostly on
    # ignored ground truth.
    for pred_id in pred_areas:
        if pred_id in pred_matched:
            continue
        if (_ignored_overlap(pred_id) / pred_areas[pred_id]) > 0.5:
            continue
        fp_per_class[pred_id // max_ins] += 1

    return iou_per_class, tp_per_class, fn_per_class, fp_per_class
def read_to_eval(element):
    """Load one window of frames from disk and score it with ``vpq_eval``.

    The frames of a window are concatenated side by side (along axis 1) so
    that a segment keeping the same id across frames is scored as one tube.

    Args:
        element: ``(pred_list, gt_list)`` where ``pred_list`` holds one
            ``(category_png, instance_png)`` path pair per frame and
            ``gt_list`` holds one raw panoptic png path per frame.

    Returns:
        The four per-class arrays returned by ``vpq_eval``.
    """
    max_ins = 2 ** 16
    pred_list, gt_list = element

    def _load_row(paths, dtype):
        # Read every image unchanged and stitch the frames horizontally.
        frames = [mmcv.imread(path, flag='unchanged').astype(dtype)
                  for path in paths]
        return np.concatenate(frames, axis=1)

    pred_cat = _load_row([pair[0] for pair in pred_list], np.int32)
    pred_ins = _load_row([pair[1] for pair in pred_list], np.int32)
    pred = pred_cat * max_ins + pred_ins

    gt_pan = vip2hb(_load_row(gt_list, np.int64))
    gt = (gt_pan // DIVISOR_NEW) * max_ins + gt_pan % DIVISOR_NEW

    return vpq_eval([pred, gt])


def eval_dvpq(result_dir, gt_dir, split='val', k=1, with_depth=True):
    """Compute and print video panoptic quality over windows of ``k`` frames.

    Sequences are listed in ``<gt_dir>/<split>.txt`` and their annotations
    are read from ``<gt_dir>/panomasks/<sequence>``. Predictions for the
    i-th listed sequence are read from ``<result_dir>/panoptic/<i>``, using
    files whose names contain ``cat`` (category map) and ``ins`` (instance
    map). Evaluation stops at the first sequence whose predictions are
    missing or incomplete; sequences before it are still scored.

    Args:
        result_dir: root directory of the predictions.
        gt_dir: root directory of the dataset.
        split: name of the split file (without ``.txt``).
        k: window length in frames; clipped per sequence to its length.
        with_depth: depth evaluation is not implemented, so this must be
            passed as ``False``.

    Raises:
        NotImplementedError: if ``with_depth`` is true.
        ValueError: if no window could be collected for evaluation.
    """
    if with_depth:
        raise NotImplementedError

    ann_folders = mmcv.list_from_file(
        os.path.join(gt_dir, "{}.txt".format(split)),
        prefix=os.path.join(gt_dir, 'panomasks') + '/')

    elements = []
    for seq_id, ann_folder in enumerate(ann_folders):
        gt_pan_names = sorted(
            name for name in mmcv.scandir(ann_folder) if '.png' in name)

        pred_dir = os.path.join(result_dir, 'panoptic', str(seq_id))
        if not os.path.exists(pred_dir):
            print("Error when seq_id is {}. But cal existing seqs.".format(seq_id))
            break

        pred_names = list(mmcv.scandir(pred_dir))
        pred_ins_names = sorted(name for name in pred_names if 'ins' in name)
        pred_cls_names = sorted(name for name in pred_names if 'cat' in name)
        if len(gt_pan_names) != len(pred_ins_names):
            print("Error when seq_id is {}. But cal existing seqs.".format(seq_id))
            break
        assert len(pred_ins_names) == len(pred_cls_names)

        len_seq = len(pred_ins_names)
        if len_seq == 0:
            # Nothing to score in an empty sequence.
            continue
        # Clip the window for this sequence only; rebinding ``k`` itself
        # would shrink the window of every later sequence as well.
        window = min(k, len_seq)
        for start in range(len_seq - window + 1):
            frames = range(start, start + window)
            # Each frame contributes its category map AND its instance map.
            pred = [(os.path.join(pred_dir, pred_cls_names[j]),
                     os.path.join(pred_dir, pred_ins_names[j]))
                    for j in frames]
            gt = [os.path.join(ann_folder, gt_pan_names[j]) for j in frames]
            elements.append((pred, gt))

    if not elements:
        raise ValueError(
            'no sequence could be evaluated under {}'.format(result_dir))

    # A single pool handles the windows of all sequences.
    with mp.Pool(processes=mp.cpu_count()) as pool:
        results = pool.map(read_to_eval, elements)

    num_classes = NUM_THING + NUM_STUFF
    # Sum each statistic over all windows and drop the trailing extra slot.
    iou_all, tp_all, fn_all, fp_all = (
        np.stack([res[i] for res in results]).sum(axis=0)[:num_classes]
        for i in range(4))

    epsilon = 1e-10
    sq = iou_all / (tp_all + epsilon)
    rq = tp_all / (tp_all + 0.5 * fn_all + 0.5 * fp_all + epsilon)
    pq = sq * rq
    # Converted class ids list stuff first, then things.
    spq = pq[:NUM_STUFF]
    tpq = pq[NUM_STUFF:]
    print(
        r'PQ : {:.3f} PQ_thing : {:.3f} PQ_stuff : {:.3f}'.format(
            pq.mean() * 100,
            tpq.mean() * 100,
            spq.mean() * 100)
    )


# usage: python eval_dvpq_vipseg.py /opt/data/results/test --gt-path /opt/data/VIPSeg
if __name__ == '__main__':
    args = parse_args()
    for k in [1, 2, 4, 6]:
        print("k={}".format(k))
        eval_dvpq(args.result_path, args.gt_path, split=args.split,
                  with_depth=args.depth, k=k)
def get_model_complexity_info(model, input_res,
                              print_per_layer_stat=True,
                              as_strings=True,
                              input_constructor=None, ost=sys.stdout,
                              verbose=False, ignore_modules=[],
                              custom_modules_hooks={}):
    """Run one forward pass through ``model`` and report MACs and parameters.

    Args:
        model: the ``nn.Module`` to measure.
        input_res: tuple with the input shape without the batch dimension.
        print_per_layer_stat: print the model with per-module statistics.
        as_strings: return formatted strings instead of raw numbers.
        input_constructor: optional callable receiving ``input_res`` and
            returning the keyword arguments for the forward pass.
        ost: stream that receives the printed output.
        verbose: warn about module types that are counted as zero cost.
        ignore_modules: module types excluded from counting (only read).
        custom_modules_hooks: mapping from module type to a forward hook
            that overrides the built-in counter (only read).

    Returns:
        ``(flops, params)`` as strings if ``as_strings`` else as numbers.
    """
    assert type(input_res) is tuple
    assert len(input_res) >= 1
    assert isinstance(model, nn.Module)
    global CUSTOM_MODULES_MAPPING
    CUSTOM_MODULES_MAPPING = custom_modules_hooks
    flops_model = add_flops_counting_methods(model)
    flops_model.eval()
    flops_model.start_flops_count(ost=ost, verbose=verbose,
                                  ignore_list=ignore_modules)
    try:
        if input_constructor:
            input = input_constructor(input_res)
            _ = flops_model(**input)
        else:
            try:
                # Match dtype and device of the model's first parameter.
                batch = torch.ones(()).new_empty(
                    (1, *input_res),
                    dtype=next(flops_model.parameters()).dtype,
                    device=next(flops_model.parameters()).device)
            except StopIteration:
                # Model without parameters: fall back to the defaults.
                batch = torch.ones(()).new_empty((1, *input_res))
            _ = flops_model(batch)

        flops_count, params_count = flops_model.compute_average_flops_cost()
        if print_per_layer_stat:
            print_model_with_flops(flops_model, flops_count, params_count,
                                   ost=ost)
    finally:
        # Detach the hooks and clear the global override even when the
        # forward pass fails, so the model is not left instrumented.
        flops_model.stop_flops_count()
        CUSTOM_MODULES_MAPPING = {}

    if as_strings:
        return flops_to_string(flops_count), params_to_string(params_count)

    return flops_count, params_count


def flops_to_string(flops, units='GMac', precision=2):
    """Format a MAC count as text.

    Args:
        flops: the number of multiply-accumulate operations.
        units: ``'GMac'``, ``'MMac'`` or ``'KMac'`` to force a unit, ``None``
            to pick the largest unit that yields a value of at least one;
            any other value prints the raw number followed by ``' Mac'``.
        precision: number of decimals kept when scaling.

    Returns:
        The formatted string, e.g. ``'1.5 GMac'``.
    """
    scales = (('GMac', 9), ('MMac', 6), ('KMac', 3))
    if units is None:
        for name, exponent in scales:
            if flops // 10 ** exponent > 0:
                return str(round(flops / 10. ** exponent, precision)) + ' ' + name
        return str(flops) + ' Mac'

    for name, exponent in scales:
        if units == name:
            return str(round(flops / 10. ** exponent, precision)) + ' ' + units
    return str(flops) + ' Mac'


def params_to_string(params_num, units=None, precision=2):
    """Format a parameter count as text.

    Args:
        params_num: the number of parameters.
        units: ``'M'`` or ``'K'`` to force a unit, ``None`` to choose
            between ``' M'``, ``' k'`` and the plain number automatically;
            any other value prints the plain number.
        precision: number of decimals kept when scaling (honoured for the
            automatic unit choice as well).

    Returns:
        The formatted string, e.g. ``'1.23 M'``.
    """
    if units is None:
        if params_num // 10 ** 6 > 0:
            return str(round(params_num / 10 ** 6, precision)) + ' M'
        if params_num // 10 ** 3:
            return str(round(params_num / 10 ** 3, precision)) + ' k'
        return str(params_num)

    if units == 'M':
        return str(round(params_num / 10. ** 6, precision)) + ' ' + units
    if units == 'K':
        return str(round(params_num / 10. ** 3, precision)) + ' ' + units
    return str(params_num)


def accumulate_flops(self):
    """Return the counted operations of this module or of its children."""
    if is_supported_instance(self):
        return self.__flops__
    return sum(child.accumulate_flops() for child in self.children())


def print_model_with_flops(model, total_flops, total_params, units='GMac',
                           precision=3, ost=sys.stdout):
    """Print ``repr(model)`` with parameter and MAC statistics per module.

    Each module's ``extra_repr`` is temporarily replaced by one that adds
    its parameter count, its share of ``total_params``, its MACs and its
    share of ``total_flops``; the original ``extra_repr`` is restored after
    printing.

    Args:
        model: instrumented model that already ran a forward pass.
        total_flops: MACs of the whole model (denominator of the shares).
        total_params: parameter count of the whole model.
        units: unit passed to ``flops_to_string``.
        precision: number of decimals for the printed values.
        ost: stream that receives the output.
    """
    # Guard both denominators so the share columns never divide by zero.
    if total_flops < 1:
        total_flops = 1
    if total_params < 1:
        total_params = 1

    def accumulate_params(self):
        if is_supported_instance(self):
            return self.__params__
        return sum(child.accumulate_params() for child in self.children())

    def flops_repr(self):
        accumulated_params_num = self.accumulate_params()
        accumulated_flops_cost = self.accumulate_flops() / model.__batch_counter__
        return ', '.join([params_to_string(accumulated_params_num,
                                           units='M', precision=precision),
                          '{:.3%} Params'.format(accumulated_params_num / total_params),
                          flops_to_string(accumulated_flops_cost,
                                          units=units, precision=precision),
                          '{:.3%} MACs'.format(accumulated_flops_cost / total_flops),
                          self.original_extra_repr()])

    def add_extra_repr(m):
        m.accumulate_flops = accumulate_flops.__get__(m)
        m.accumulate_params = accumulate_params.__get__(m)
        flops_extra_repr = flops_repr.__get__(m)
        if m.extra_repr != flops_extra_repr:
            m.original_extra_repr = m.extra_repr
            m.extra_repr = flops_extra_repr
            assert m.extra_repr != m.original_extra_repr

    def del_extra_repr(m):
        if hasattr(m, 'original_extra_repr'):
            m.extra_repr = m.original_extra_repr
            del m.original_extra_repr
        # Remove both temporary helpers so no bound closure stays attached
        # to the module after printing.
        if hasattr(m, 'accumulate_flops'):
            del m.accumulate_flops
        if hasattr(m, 'accumulate_params'):
            del m.accumulate_params

    model.apply(add_extra_repr)
    print(repr(model), file=ost)
    model.apply(del_extra_repr)
def get_model_parameters_number(model):
    """Return the total element count of the parameters that require grad."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


def add_flops_counting_methods(net_main_module):
    """Attach the counting API to ``net_main_module`` and reset its counters.

    The functions are bound to this particular object, so each of them
    receives the module as ``self`` when called.

    Returns:
        The same module, now exposing ``start_flops_count``,
        ``stop_flops_count``, ``reset_flops_count`` and
        ``compute_average_flops_cost``.
    """
    for method in (start_flops_count, stop_flops_count, reset_flops_count,
                   compute_average_flops_cost):
        setattr(net_main_module, method.__name__,
                method.__get__(net_main_module))

    net_main_module.reset_flops_count()
    return net_main_module


def compute_average_flops_cost(self):
    """Return ``(operations per batch item, trainable parameter count)``.

    Available on a net object after ``add_flops_counting_methods()`` was
    called on it.
    """
    # Give every module a temporary accumulator, read the total from the
    # root, then remove the accumulators again.
    for module in self.modules():
        module.accumulate_flops = accumulate_flops.__get__(module)
    total_flops = self.accumulate_flops()
    for module in self.modules():
        if hasattr(module, 'accumulate_flops'):
            del module.accumulate_flops

    return total_flops / self.__batch_counter__, get_model_parameters_number(self)


def start_flops_count(self, **kwargs):
    """Register the counting hooks; call this before running the network.

    Available on a net object after ``add_flops_counting_methods()`` was
    called on it. The keyword arguments ``ost``, ``verbose`` and
    ``ignore_list`` are forwarded to the per-module setup.
    """
    add_batch_counter_hook_function(self)

    seen_types = set()

    def add_flops_counter_hook_function(module, ost, verbose, ignore_list):
        module_type = type(module)

        if module_type in ignore_list:
            seen_types.add(module_type)
            if is_supported_instance(module):
                module.__params__ = 0
            return

        if is_supported_instance(module):
            # A handle already present means the hook is installed.
            if hasattr(module, '__flops_handle__'):
                return
            # Custom hooks take precedence over the built-in ones.
            if module_type in CUSTOM_MODULES_MAPPING:
                hook = CUSTOM_MODULES_MAPPING[module_type]
            else:
                hook = MODULES_MAPPING[module_type]
            module.__flops_handle__ = module.register_forward_hook(hook)
            seen_types.add(module_type)
            return

        # Unsupported module: warn once per type, except for containers.
        is_container = module_type in (nn.Sequential, nn.ModuleList)
        if verbose and not is_container and module_type not in seen_types:
            print('Warning: module ' + module_type.__name__ +
                  ' is treated as a zero-op.', file=ost)
        seen_types.add(module_type)

    self.apply(partial(add_flops_counter_hook_function, **kwargs))


def stop_flops_count(self):
    """Remove the counting hooks; call this to pause the computation.

    Available on a net object after ``add_flops_counting_methods()`` was
    called on it.
    """
    remove_batch_counter_hook_function(self)
    self.apply(remove_flops_counter_hook_function)


def reset_flops_count(self):
    """Reset the statistics gathered so far.

    Available on a net object after ``add_flops_counting_methods()`` was
    called on it.
    """
    add_batch_counter_variables_or_reset(self)
    self.apply(add_flops_counter_variable_or_reset)


# ---- Internal functions
def empty_flops_counter_hook(module, input, output):
    """Forward hook for modules that are counted as free."""
    module.__flops__ += 0


def upsample_flops_counter_hook(module, input, output):
    """Forward hook counting one operation per element of ``output[0]``."""
    first = output[0]
    element_count = first.shape[0]
    for extent in first.shape[1:]:
        element_count *= extent
    module.__flops__ += int(element_count)


def relu_flops_counter_hook(module, input, output):
    """Forward hook counting one operation per output element."""
    module.__flops__ += int(output.numel())


def linear_flops_counter_hook(module, input, output):
    """Forward hook for linear layers: input elements times output width."""
    features = input[0]
    # pytorch checks dimensions, so here we don't care much
    out_width = output.shape[-1]
    bias_cost = 0 if module.bias is None else out_width
    module.__flops__ += int(np.prod(features.shape) * out_width + bias_cost)


def pool_flops_counter_hook(module, input, output):
    """Forward hook counting one operation per input element."""
    module.__flops__ += int(np.prod(input[0].shape))


def bn_flops_counter_hook(module, input, output):
    """Forward hook for normalization: doubled when the layer is affine."""
    cost = np.prod(input[0].shape)
    if module.affine:
        cost *= 2
    module.__flops__ += int(cost)


def conv_flops_counter_hook(conv_module, input, output):
    """Forward hook for convolutions, counted per output position."""
    # Can have multiple inputs, getting the first one
    batch_size = input[0].shape[0]
    spatial_dims = list(output.shape[2:])
    kernel_dims = list(conv_module.kernel_size)

    out_channels = conv_module.out_channels
    filters_per_channel = out_channels // conv_module.groups
    per_position = (int(np.prod(kernel_dims)) * conv_module.in_channels
                    * filters_per_channel)

    positions = batch_size * int(np.prod(spatial_dims))
    total = per_position * positions
    if conv_module.bias is not None:
        total += out_channels * positions

    conv_module.__flops__ += int(total)
def rnn_flops(flops, rnn_module, w_ih, w_hh, input_size):
    """Add the per-step cost of one recurrent layer to ``flops`` and return it.

    Args:
        flops: running total to extend.
        rnn_module: the recurrent module; its class selects the gate cost.
        w_ih: input-to-hidden weight (2-D, only ``shape`` is read).
        w_hh: hidden-to-hidden weight (2-D, only ``shape`` is read).
        input_size: accepted for interface compatibility; not used here.

    Returns:
        The updated total.
    """
    # Matrix products with the input state and with the hidden state.
    ih_rows, ih_cols = w_ih.shape[0], w_ih.shape[1]
    hh_rows, hh_cols = w_hh.shape[0], w_hh.shape[1]
    flops += ih_rows * ih_cols
    flops += hh_rows * hh_cols

    # Element-wise work expressed as a multiple of hidden_size:
    #   RNN : 1  (sum of both projections)
    #   GRU : 1 (hadamard of r) + 3 (state additions) + 3 (final products/add)
    #   LSTM: 4 (state additions) + 3 (cell state update) + 3 (final hadamard)
    elementwise_cost = (
        ((nn.RNN, nn.RNNCell), 1),
        ((nn.GRU, nn.GRUCell), 7),
        ((nn.LSTM, nn.LSTMCell), 10),
    )
    for module_types, multiplier in elementwise_cost:
        if isinstance(rnn_module, module_types):
            flops += rnn_module.hidden_size * multiplier
            break
    return flops
def multihead_attention_counter_hook(multihead_attention_module, input, output):
    """Forward hook adding an approximate multi-head attention cost to ``__flops__``.

    Args:
        multihead_attention_module: attention module exposing ``num_heads``,
            ``kdim``, ``vdim``, ``in_proj_bias``, a ``__flops__`` counter and
            the embedding width as either ``embed_dim`` (the attribute used
            by ``torch.nn.MultiheadAttention``) or ``embed_dims``.
        input: positional inputs of the call. The first three entries are
            taken as query, key and value.
        output: result of the call; ``output[0]`` is used as a stand-in for
            query/key/value when fewer than three positional inputs exist.

    Tensors are indexed as (length, batch, features): dimension 0 is the
    sequence length and dimension 1 the batch size.
    NOTE(review): a ``batch_first`` layout is not handled here - confirm
    against the callers before relying on the numbers.
    """
    if len(input) >= 3:
        # Extra positional arguments (e.g. masks) are irrelevant for counting.
        q, k, v = input[:3]
    else:
        # Forward hooks only receive positional arguments, so a call made
        # with keyword arguments shows up with an empty ``input``. Fall back
        # to the attention output, which shares the query's shape.
        q = k = v = output[0]

    batch_size = q.shape[1]
    num_heads = multihead_attention_module.num_heads

    # torch.nn.MultiheadAttention names the attribute ``embed_dim``; other
    # wrappers use ``embed_dims``. Accept both instead of raising.
    embed_dims = getattr(multihead_attention_module, 'embed_dim', None)
    if embed_dims is None:
        embed_dims = multihead_attention_module.embed_dims

    kdim = multihead_attention_module.kdim
    vdim = multihead_attention_module.vdim
    if kdim is None:
        kdim = embed_dims
    if vdim is None:
        vdim = embed_dims

    # initial projections
    flops = q.shape[0] * q.shape[2] * embed_dims + \
        k.shape[0] * k.shape[2] * kdim + \
        v.shape[0] * v.shape[2] * vdim
    if multihead_attention_module.in_proj_bias is not None:
        flops += (q.shape[0] + k.shape[0] + v.shape[0]) * embed_dims

    # attention heads: scale, matmul, softmax, matmul
    head_dim = embed_dims // num_heads
    head_flops = q.shape[0] * head_dim + \
        head_dim * q.shape[0] * k.shape[0] + \
        q.shape[0] * k.shape[0] + \
        q.shape[0] * k.shape[0] * head_dim
    flops += num_heads * head_flops

    # final projection, bias is always enabled
    flops += q.shape[0] * embed_dims * (embed_dims + 1)

    flops *= batch_size
    multihead_attention_module.__flops__ += int(flops)
batch_flops = np.prod(input.shape) if (getattr(module, 'affine', False) or getattr(module, 'elementwise_affine', False)): batch_flops *= 2 module.__flops__ += int(batch_flops) MODULES_MAPPING = { # convolutions nn.Conv1d: conv_flops_counter_hook, nn.Conv2d: conv_flops_counter_hook, nn.Conv3d: conv_flops_counter_hook, # activations nn.ReLU: relu_flops_counter_hook, nn.PReLU: relu_flops_counter_hook, nn.ELU: relu_flops_counter_hook, nn.LeakyReLU: relu_flops_counter_hook, nn.ReLU6: relu_flops_counter_hook, # poolings nn.MaxPool1d: pool_flops_counter_hook, nn.AvgPool1d: pool_flops_counter_hook, nn.AvgPool2d: pool_flops_counter_hook, nn.MaxPool2d: pool_flops_counter_hook, nn.MaxPool3d: pool_flops_counter_hook, nn.AvgPool3d: pool_flops_counter_hook, nn.AdaptiveMaxPool1d: pool_flops_counter_hook, nn.AdaptiveAvgPool1d: pool_flops_counter_hook, nn.AdaptiveMaxPool2d: pool_flops_counter_hook, nn.AdaptiveAvgPool2d: pool_flops_counter_hook, nn.AdaptiveMaxPool3d: pool_flops_counter_hook, nn.AdaptiveAvgPool3d: pool_flops_counter_hook, # BNs nn.BatchNorm1d: bn_flops_counter_hook, nn.BatchNorm2d: bn_flops_counter_hook, nn.BatchNorm3d: bn_flops_counter_hook, nn.InstanceNorm1d: bn_flops_counter_hook, nn.InstanceNorm2d: bn_flops_counter_hook, nn.InstanceNorm3d: bn_flops_counter_hook, nn.GroupNorm: bn_flops_counter_hook, # normalizations # nn.BatchNorm1d: norm_flops_counter_hook, # nn.BatchNorm2d: norm_flops_counter_hook, # nn.BatchNorm3d: norm_flops_counter_hook, # nn.GroupNorm: norm_flops_counter_hook, # nn.InstanceNorm1d: norm_flops_counter_hook, # nn.InstanceNorm2d: norm_flops_counter_hook, # nn.InstanceNorm3d: norm_flops_counter_hook, nn.LayerNorm: norm_flops_counter_hook, # FC nn.Linear: linear_flops_counter_hook, # Upscale nn.Upsample: upsample_flops_counter_hook, # Deconvolution nn.ConvTranspose1d: conv_flops_counter_hook, nn.ConvTranspose2d: conv_flops_counter_hook, nn.ConvTranspose3d: conv_flops_counter_hook, # RNN nn.RNN: rnn_flops_counter_hook, nn.GRU: 
def is_supported_instance(module):
    """Tell whether a FLOPs hook is registered for the exact type of ``module``.

    The lookup uses ``type(module)`` directly, so subclasses of a supported
    type are not matched unless they are registered themselves.
    """
    module_type = type(module)
    return (module_type in MODULES_MAPPING
            or module_type in CUSTOM_MODULES_MAPPING)
def main():
    """Build the detector described by the config and print its FLOPs/params.

    The requested ``--shape`` (one value for a square input, or two values)
    is optionally padded up to a multiple of ``--size-divisor`` before being
    passed to ``get_model_complexity_info``. The model must provide a
    ``forward_dummy`` method, which replaces ``forward`` for the measurement.

    Raises:
        ValueError: if ``--shape`` has more than two values.
        NotImplementedError: if the model has no ``forward_dummy``.
    """
    args = parse_args()

    if len(args.shape) == 1:
        h = w = args.shape[0]
    elif len(args.shape) == 2:
        h, w = args.shape
    else:
        raise ValueError('invalid input shape')
    orig_shape = (3, h, w)

    # Round each side up to the next multiple of the divisor (if enabled).
    divisor = args.size_divisor
    if divisor > 0:
        h = int(np.ceil(h / divisor)) * divisor
        w = int(np.ceil(w / divisor)) * divisor
    input_shape = (3, h, w)

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # import modules from string list.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])

    model = build_detector(
        cfg.model,
        train_cfg=cfg.get('train_cfg'),
        test_cfg=cfg.get('test_cfg'))
    if torch.cuda.is_available():
        model.cuda()
    model.eval()

    if hasattr(model, 'forward_dummy'):
        model.forward = model.forward_dummy
    else:
        # Message fixed: it previously read "currently not currently".
        raise NotImplementedError(
            'FLOPs counter is currently not supported with {}'.format(
                model.__class__.__name__))

    flops, params = get_model_complexity_info(model, input_shape)
    split_line = '=' * 30

    if divisor > 0 and \
            input_shape != orig_shape:
        print(f'{split_line}\nUse size divisor set input shape '
              f'from {orig_shape} to {input_shape}\n')
    print(f'{split_line}\nInput shape: {input_shape}\n'
          f'Flops: {flops}\nParams: {params}\n{split_line}')
    print('!!!Please be cautious if you use the results in papers. '
          'You may need to check if all ops are supported and verify that the '
          'flops computation is correct.')
#!/usr/bin/env bash
# Run tools/test_step.py through srun.
#
# Usage:
#   [GPUS=1] [GPUS_PER_NODE=1] [CPUS_PER_TASK=5] [SRUN_ARGS="..."] \
#       slurm_test_step.sh PARTITION JOB_NAME CONFIG CHECKPOINT [PY_ARGS...]
#
# The python script is addressed relative to the current directory, so the
# launcher is expected to be started from the repository root.

set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
CHECKPOINT=$4
GPUS=${GPUS:-1}
GPUS_PER_NODE=${GPUS_PER_NODE:-1}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
# Keep the remaining arguments as an array so that values containing spaces
# or brackets (e.g. --cfg-options key="[a, b]") reach python unchanged.
PY_ARGS=("${@:5}")
SRUN_ARGS=${SRUN_ARGS:-""}

# SRUN_ARGS is intentionally left unquoted: it carries several srun options
# in one string and must be word-split.
PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
srun -p "${PARTITION}" \
    --job-name="${JOB_NAME}" \
    --gres=gpu:"${GPUS_PER_NODE}" \
    --ntasks="${GPUS}" \
    --ntasks-per-node="${GPUS_PER_NODE}" \
    --cpus-per-task="${CPUS_PER_TASK}" \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/test_step.py "${CONFIG}" "${CHECKPOINT}" --launcher="slurm" "${PY_ARGS[@]}"
#!/usr/bin/env bash
# Run tools/train.py through srun.
#
# Usage:
#   [GPUS=8] [GPUS_PER_NODE=8] [CPUS_PER_TASK=5] [SRUN_ARGS="..."] \
#       slurm_train.sh PARTITION JOB_NAME CONFIG WORK_DIR [PY_ARGS...]
#
# The python script is addressed relative to the current directory, so the
# launcher is expected to be started from the repository root.

set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
WORK_DIR=$4
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}
# Keep the remaining arguments as an array so that values containing spaces
# or brackets (e.g. --cfg-options key="[a, b]") reach python unchanged.
PY_ARGS=("${@:5}")

# SRUN_ARGS is intentionally left unquoted: it carries several srun options
# in one string and must be word-split.
PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
srun -p "${PARTITION}" \
    --job-name="${JOB_NAME}" \
    --gres=gpu:"${GPUS_PER_NODE}" \
    --ntasks="${GPUS}" \
    --ntasks-per-node="${GPUS_PER_NODE}" \
    --cpus-per-task="${CPUS_PER_TASK}" \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/train.py "${CONFIG}" --work-dir="${WORK_DIR}" --launcher="slurm" "${PY_ARGS[@]}"
pickle format') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') parser.add_argument( '--format-only', action='store_true', help='Format the output results without perform evaluation. It is' 'useful when you want to format the result to a specific format and ' 'submit it to the test server') parser.add_argument( '--eval', type=str, nargs='+', help='evaluation metrics, which depends on the dataset, e.g., "bbox",' ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') parser.add_argument('--show', action='store_true', help='show results') parser.add_argument( '--show-dir', help='directory where painted images will be saved') parser.add_argument( '--show-score-thr', type=float, default=0.3, help='score threshold (default: 0.3)') parser.add_argument( '--gpu-collect', action='store_true', help='whether to use gpu to collect results.') parser.add_argument( '--tmpdir', help='tmp directory used for collecting results from multiple ' 'workers, available when gpu-collect is not specified') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. 
def main():
    """Test (and optionally evaluate) a detector from a config and checkpoint.

    Steps, in order: validate the CLI arguments, load and patch the config
    for testing, build the test dataloader, build the model and load the
    checkpoint, run single- or multi-GPU inference, and on rank 0 dump,
    format and/or evaluate the outputs.

    Raises:
        ValueError: if ``--eval`` and ``--format-only`` are both given, or if
            ``--out`` does not end with ``.pkl``/``.pickle``.
    """
    args = parse_args()

    # At least one way of consuming the results has to be requested.
    assert args.out or args.eval or args.format_only or args.show \
        or args.show_dir, \
        ('Please specify at least one operation (save/eval/format/show the '
         'results / save the results) with the argument "--out", "--eval"'
         ', "--format-only", "--show" or "--show-dir"')

    if args.eval and args.format_only:
        raise ValueError('--eval and --format_only cannot be both specified')

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # import modules from string list.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # NOTE: the reset of the top-level ``pretrained`` entry is disabled here;
    # only the ``rfp_backbone.pretrained`` entries of the neck are cleared.
    # cfg.model.pretrained = None
    if cfg.model.get('neck'):
        if isinstance(cfg.model.neck, list):
            for neck_cfg in cfg.model.neck:
                if neck_cfg.get('rfp_backbone'):
                    if neck_cfg.rfp_backbone.get('pretrained'):
                        neck_cfg.rfp_backbone.pretrained = None
        elif cfg.model.neck.get('rfp_backbone'):
            if cfg.model.neck.rfp_backbone.get('pretrained'):
                cfg.model.neck.rfp_backbone.pretrained = None

    # in case the test dataset is concatenated
    samples_per_gpu = 1
    if isinstance(cfg.data.test, dict):
        cfg.data.test.test_mode = True
        # ``samples_per_gpu`` is removed from the dataset config and passed
        # to the dataloader instead.
        samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.test.pipeline = replace_ImageToTensor(
                cfg.data.test.pipeline)
    elif isinstance(cfg.data.test, list):
        for ds_cfg in cfg.data.test:
            ds_cfg.test_mode = True
        # With several datasets the largest requested batch size is used.
        samples_per_gpu = max(
            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
        if samples_per_gpu > 1:
            for ds_cfg in cfg.data.test:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # build the dataloader
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # build the model and load checkpoint
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    # old versions did not save class info in checkpoints, this walkaround is
    # for backward compatibility
    if 'CLASSES' in checkpoint.get('meta', {}):
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = dataset.CLASSES

    if not distributed:
        # Single process: the model is wrapped for device 0 only.
        model = MMDataParallel(model, device_ids=[0])
        outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
                                  args.show_score_thr)
    else:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        outputs = multi_gpu_test(model, data_loader, args.tmpdir,
                                 args.gpu_collect)

    # Only rank 0 writes, formats and evaluates the results.
    rank, _ = get_dist_info()
    if rank == 0:
        if args.out:
            print(f'\nwriting results to {args.out}')
            mmcv.dump(outputs, args.out)
        kwargs = {} if args.eval_options is None else args.eval_options
        if args.format_only:
            dataset.format_results(outputs, **kwargs)
        if args.eval:
            eval_kwargs = cfg.get('evaluation', {}).copy()
            # hard-code way to remove EvalHook args
            for key in [
                    'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
                    'rule', 'by_epoch'
            ]:
                eval_kwargs.pop(key, None)
            eval_kwargs.update(dict(metric=args.eval, **kwargs))
            print(dataset.evaluate(outputs, **eval_kwargs))
def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3,
                    with_semantic_input=False,
                    rescale_depth=False,
                    with_seq=False,
                    ):
    """Run inference frame by frame and dump the predictions as PNG files.

    For every sample the model output is unpacked into five entries
    (semantic map, tracking map, depth, two visualisations) and written
    below ``out_dir``:

    * ``panoptic/[seq]/<seq>_<img>_cat.png`` and ``..._ins.png`` (uint16),
    * ``depth/[seq]/<seq>_<img>.png`` (depth * 256 as uint16, only when the
      model returned a depth map),
    * ``vis/[seq]/<seq>_<img>.png`` (both visualisations stacked on axis 0).

    Args:
        model: wrapped model, called with ``return_loss=False``.
        data_loader: loader whose batches contain ``seq_id`` and ``img_id``.
        show: unused, kept for interface compatibility.
        out_dir: output root; defaults to ``'./out'``.
        show_score_thr: unused, kept for interface compatibility.
        with_semantic_input: read a semantic PNG from
            ``data/kitti-dvps/semantic/`` and pass it to the model.
        rescale_depth: write the depth map resized with
            ``mmcv.imresize(..., (300, 100))`` instead of the raw one.
        with_seq: place the files in a per-sequence sub-folder.

    Returns:
        Two lists that are never filled by this function (kept because the
        caller unpacks two values).
    """
    if out_dir is None:
        out_dir = './out'
    print("The output dir is {}".format(out_dir))
    model.eval()
    results = []
    dataset = data_loader.dataset
    prog_bar = mmcv.ProgressBar(len(dataset))
    pano_seg_2ch_list = []
    for data in data_loader:
        seq_id = data['seq_id'][0].item()
        img_id = data['img_id'][0].item()
        # Only 'seq_id' is removed; 'img_id' is still forwarded to the model.
        data.pop('seq_id')
        if with_semantic_input:
            semantic_input = mmcv.imread(
                os.path.join('data/kitti-dvps/semantic/',
                             "{:06d}_{:06d}_semantic.png".format(seq_id, img_id)),
                flag='unchanged')
            semantic_input = torch.tensor(semantic_input,
                                          device=data['img'][0].device)
        else:
            semantic_input = None
        with torch.no_grad():
            segm_results = model(return_loss=False, rescale=True,
                                 semantic_input=semantic_input, **data)
        sseg_results, track_maps, depth_final, vis_sem, vis_tracker = segm_results
        batch_size = 1

        # dump results
        seq_folder = str(seq_id) if with_seq else ""
        cat_path = os.path.join(out_dir, 'panoptic', seq_folder,
                                '{:06d}_{:06d}_cat.png'.format(seq_id, img_id))
        ins_path = os.path.join(out_dir, 'panoptic', seq_folder,
                                '{:06d}_{:06d}_ins.png'.format(seq_id, img_id))
        dep_path = os.path.join(out_dir, 'depth', seq_folder,
                                '{:06d}_{:06d}.png'.format(seq_id, img_id))
        vis_path = os.path.join(out_dir, 'vis', seq_folder,
                                '{:06d}_{:06d}.png'.format(seq_id, img_id))

        mmcv.imwrite(sseg_results.astype(np.uint16), cat_path)
        mmcv.imwrite(track_maps.astype(np.uint16), ins_path)
        if depth_final is not None:
            # Resize only when the rescaled map is what gets written; the
            # resize used to run for every frame and was then thrown away.
            if rescale_depth:
                depth_to_save = mmcv.imresize(depth_final, (300, 100),
                                              interpolation='bilinear')
            else:
                depth_to_save = depth_final
            mmcv.imwrite((depth_to_save * 256.).astype(np.uint16), dep_path)
        mmcv.imwrite(np.concatenate((vis_sem, vis_tracker), axis=0), vis_path)

        for _ in range(batch_size):
            prog_bar.update()
    return results, pano_seg_2ch_list
It is' 'useful when you want to format the result to a specific format and ' 'submit it to the test server') parser.add_argument( '--eval', type=str, nargs='+', help='evaluation metrics, which depends on the dataset, e.g., "bbox",' ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') parser.add_argument('--show', action='store_true', help='show results') parser.add_argument( '--show-dir', help='directory where painted images will be saved') parser.add_argument( '--show-score-thr', type=float, default=0.3, help='score threshold (default: 0.3)') parser.add_argument( '--gpu-collect', action='store_true', help='whether to use gpu to collect results.') parser.add_argument( '--tmpdir', help='tmp directory used for collecting results from multiple ' 'workers, available when gpu-collect is not specified') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function (deprecate), ' 'change to --eval-options instead.') parser.add_argument( '--eval-options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function') parser.add_argument( '--semantic', action='store_true', help="semantic input" ) parser.add_argument( '--rescale-depth', action='store_true', help="" ) parser.add_argument( '--with-seq', action='store_true', help="" ) parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) # parser.add_argument('--output_dir', default="./work_dirs/vps/vps_output", # help='output result file in pickle format to load') # parser.add_argument('--n_video', type=int, default=50, help="number of video clips") # parser.add_argument('--pan_im_json_file', type=str, default='data/cityscapes_vps/panoptic_im_val_city_vps.json') args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) if args.options and args.eval_options: raise ValueError( '--options and --eval-options cannot be both ' 'specified, --options is deprecated in favor of --eval-options') if args.options: warnings.warn('--options is deprecated in favor of --eval-options') args.eval_options = args.options return args def main(): args = parse_args() assert args.out or args.eval or args.format_only or args.show \ or args.show_dir, \ ('Please specify at least one operation (save/eval/format/show the ' 'results / save the results) with the argument "--out", "--eval"' ', "--format-only", "--show" or "--show-dir"') if args.eval and 
args.format_only: raise ValueError('--eval and --format_only cannot be both specified') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') print(args) cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None if cfg.model.get('neck'): if isinstance(cfg.model.neck, list): for neck_cfg in cfg.model.neck: if neck_cfg.get('rfp_backbone'): if neck_cfg.rfp_backbone.get('pretrained'): neck_cfg.rfp_backbone.pretrained = None elif cfg.model.neck.get('rfp_backbone'): if cfg.model.neck.rfp_backbone.get('pretrained'): cfg.model.neck.rfp_backbone.pretrained = None # in case the test dataset is concatenated samples_per_gpu = 1 if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) if samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor( cfg.data.test.pipeline) elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True samples_per_gpu = max( [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) if samples_per_gpu > 1: for ds_cfg in cfg.data.test: ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) # init distributed env first, since logger depends on the dist info. 
def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3,
                    with_semantic_input=False,):
    """Run inference sample by sample and write STEP style result images.

    Per sample three files are written below ``out_dir``:
    ``panoptic/<seq>/<seq>_<img>_cat.png`` (semantic map, uint16),
    ``panoptic/<seq>/<seq>_<img>_ins.png`` (tracking map, uint16) and
    ``final/<seq:04d>/<img:06d>.png``, a 3-channel uint8 image holding the
    semantic id and the high/low byte of the tracking id, channel-swapped
    with ``cv2.COLOR_RGB2BGR`` before saving.

    ``show``, ``show_score_thr`` and ``with_semantic_input`` are accepted
    but not used. The two returned lists are never filled here.
    """
    out_dir = './out' if out_dir is None else out_dir
    print("The output dir is {}".format(out_dir))

    model.eval()
    results, pano_seg_2ch_list = [], []
    prog_bar = mmcv.ProgressBar(len(data_loader.dataset))

    for data in data_loader:
        seq_id = data['seq_id'][0].item()
        img_id = data['img_id'][0].item()
        # 'seq_id' is dropped before the forward pass, 'img_id' is kept.
        data.pop('seq_id')

        with torch.no_grad():
            segm_results = model(return_loss=False, rescale=True, **data)
        sseg_results, track_maps, _, _, _ = segm_results

        # Output locations for this frame.
        panoptic_dir = osp.join(out_dir, 'panoptic', str(seq_id))
        cat_path = osp.join(panoptic_dir,
                            '{:06d}_{:06d}_cat.png'.format(seq_id, img_id))
        ins_path = osp.join(panoptic_dir,
                            '{:06d}_{:06d}_ins.png'.format(seq_id, img_id))
        final_path = osp.join(out_dir, 'final', '{:04d}'.format(seq_id),
                              '{:06d}.png'.format(img_id))

        # Pack semantic id plus the two bytes of the track id per pixel.
        semantic_byte = sseg_results.astype(np.uint8)
        track_high = (track_maps // 256).astype(np.uint8)
        track_low = (track_maps % 256).astype(np.uint8)
        final_map = np.stack([semantic_byte, track_high, track_low], axis=-1)
        # In-place channel swap (the destination is the source array).
        cv2.cvtColor(final_map, cv2.COLOR_RGB2BGR, final_map)

        mmcv.imwrite(sseg_results.astype(np.uint16), cat_path)
        mmcv.imwrite(track_maps.astype(np.uint16), ins_path)
        # final map for evaluation
        mmcv.imwrite(final_map, final_path)

        # One sample is processed per iteration.
        prog_bar.update()
    return results, pano_seg_2ch_list
file') parser.add_argument('--out', help='output result file in pickle format') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') parser.add_argument( '--format-only', action='store_true', help='Format the output results without perform evaluation. It is' 'useful when you want to format the result to a specific format and ' 'submit it to the test server') parser.add_argument( '--eval', type=str, nargs='+', help='evaluation metrics, which depends on the dataset, e.g., "bbox",' ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') parser.add_argument('--show', action='store_true', help='show results') parser.add_argument( '--show-dir', help='directory where painted images will be saved') parser.add_argument( '--show-score-thr', type=float, default=0.3, help='score threshold (default: 0.3)') parser.add_argument( '--gpu-collect', action='store_true', help='whether to use gpu to collect results.') parser.add_argument( '--tmpdir', help='tmp directory used for collecting results from multiple ' 'workers, available when gpu-collect is not specified') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. 
def main():
    """Build the test dataset and model from a config and run STEP inference.

    Steps: parse and validate CLI arguments, load the config (applying
    ``--cfg-options``), clear pretrained-weight entries, put the test
    dataset(s) into test mode, optionally initialise the distributed
    environment, build the dataloader and detector, load the checkpoint and
    call ``single_gpu_test``. Nothing is evaluated or written to ``--out``
    here; the predictions are dumped by ``single_gpu_test`` itself.

    Raises:
        AssertionError: If none of --out/--eval/--format-only/--show/
            --show-dir is given.
        ValueError: If --eval and --format-only are combined, or --out does
            not end in ``.pkl``/``.pickle``.
    """
    args = parse_args()

    requested_action = (args.out or args.eval or args.format_only
                        or args.show or args.show_dir)
    assert requested_action, \
        ('Please specify at least one operation (save/eval/format/show the '
         'results / save the results) with the argument "--out", "--eval"'
         ', "--format-only", "--show" or "--show-dir"')

    if args.eval and args.format_only:
        raise ValueError('--eval and --format_only cannot be both specified')

    out_file = args.out
    if out_file is not None and not out_file.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    print(args)

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # Import the modules listed in the config by name.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])

    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # Weights come from the checkpoint, so drop every pretrained entry.
    cfg.model.pretrained = None
    if cfg.model.get('neck'):
        neck = cfg.model.neck
        neck_cfgs = neck if isinstance(neck, list) else [neck]
        for neck_cfg in neck_cfgs:
            if (neck_cfg.get('rfp_backbone')
                    and neck_cfg.rfp_backbone.get('pretrained')):
                neck_cfg.rfp_backbone.pretrained = None

    # The test set may be a single dataset config or a list of them.
    test_cfg = cfg.data.test
    samples_per_gpu = 1
    if isinstance(test_cfg, dict):
        test_cfg.test_mode = True
        samples_per_gpu = test_cfg.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            test_cfg.pipeline = replace_ImageToTensor(test_cfg.pipeline)
    elif isinstance(test_cfg, list):
        for ds_cfg in test_cfg:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in test_cfg)
        if samples_per_gpu > 1:
            for ds_cfg in test_cfg:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    # The distributed environment is initialised before anything else that
    # may depend on the dist info.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)

    # Dataloader.
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # Model and checkpoint.
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    if cfg.get('fp16', None) is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)

    # Checkpoints without class info in their meta fall back to the
    # dataset's classes (backward compatibility).
    checkpoint_meta = checkpoint.get('meta', {})
    if 'CLASSES' in checkpoint_meta:
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = dataset.CLASSES

    model = MMDataParallel(model, device_ids=[0])

    # Run inference over the sequence; --show-dir doubles as the output
    # directory of single_gpu_test.
    outputs, pred_pans_2ch = single_gpu_test(
        model,
        data_loader,
        args.show,
        args.show_dir,
        args.show_score_thr,
        with_semantic_input=args.semantic)
    print("==>Inference STEP Done!")
    # Evaluation Part
def parse_args():
    """Parse the command-line arguments of the VPS test script.

    Besides the standard MMDet test options this adds ``--semantic``,
    ``--rescale-depth`` and ``--with-seq``. If the ``LOCAL_RANK`` environment
    variable is not set, it is set from ``--local_rank``. The deprecated
    ``--options`` is copied into ``eval_options`` with a warning.

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument('--out', help='output result file in pickle format')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        # The trailing space was missing, which rendered "increasethe".
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    parser.add_argument(
        '--format-only',
        action='store_true',
        # The trailing space was missing, which rendered "It isuseful".
        help='Format the output results without perform evaluation. It is '
        'useful when you want to format the result to a specific format and '
        'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where painted images will be saved')
    parser.add_argument(
        '--show-score-thr',
        type=float,
        default=0.3,
        help='score threshold (default: 0.3)')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
        'workers, available when gpu-collect is not specified')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecate), '
        'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--semantic',
        action='store_true',
        help="semantic input"
    )
    parser.add_argument(
        '--rescale-depth',
        action='store_true',
        help="rescale the predicted depth before saving (forwarded to "
             "single_gpu_test as rescale_depth)"
    )
    parser.add_argument(
        '--with-seq',
        action='store_true',
        help="write the panoptic outputs into one sub-folder per sequence id"
    )
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    # Only set LOCAL_RANK when the environment does not provide it already.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args
args.format_only: raise ValueError('--eval and --format_only cannot be both specified') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') print(args) cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None if cfg.model.get('neck'): if isinstance(cfg.model.neck, list): for neck_cfg in cfg.model.neck: if neck_cfg.get('rfp_backbone'): if neck_cfg.rfp_backbone.get('pretrained'): neck_cfg.rfp_backbone.pretrained = None elif cfg.model.neck.get('rfp_backbone'): if cfg.model.neck.rfp_backbone.get('pretrained'): cfg.model.neck.rfp_backbone.pretrained = None # in case the test dataset is concatenated samples_per_gpu = 1 if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) if samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor( cfg.data.test.pipeline) elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True samples_per_gpu = max( [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) if samples_per_gpu > 1: for ds_cfg in cfg.data.test: ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) # init distributed env first, since logger depends on the dist info. 
def parse_args():
    """Parse the command-line arguments of the training script.

    If the ``LOCAL_RANK`` environment variable is not set, it is set from
    ``--local_rank``. The deprecated ``--options`` is copied into
    ``cfg_options`` with a warning.

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both ``--options`` and ``--cfg-options`` are given.
    """
    parser = argparse.ArgumentParser(description='Train a detector')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    parser.add_argument(
        # The help text used to be a copy of --resume-from's ("resume from"),
        # although the value is stored separately as cfg.load_from.
        '--load-from', help='the checkpoint file to load weights from')
    parser.add_argument(
        '--no-validate',
        action='store_true',
        help='whether not to evaluate the checkpoint during training')
    group_gpus = parser.add_mutually_exclusive_group()
    group_gpus.add_argument(
        '--gpus',
        type=int,
        help='number of gpus to use '
        '(only applicable to non-distributed training)')
    group_gpus.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use '
        '(only applicable to non-distributed training)')
    parser.add_argument('--seed', type=int, default=None, help='random seed')
    parser.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    parser.add_argument(
        '--detect-anomaly',
        action='store_true',
        help='detect anomaly')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file (deprecate), '
        'change to --cfg-options instead.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    # Only set LOCAL_RANK when the environment does not provide it already.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.cfg_options:
        raise ValueError(
            '--options and --cfg-options cannot be both '
            'specified, --options is deprecated in favor of --cfg-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --cfg-options')
        args.cfg_options = args.options

    return args
if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) dist.barrier() # re-set gpu_ids with distributed training mode _, world_size = get_dist_info() cfg.gpu_ids = range(world_size) # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # dump config cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) # Added in PansegMM # Log the git hash info to video_knet_vis the experiments logger.info('The repo is : https://github.com/lxtGH/PanopticSegMM/tree/{}/'.format(get_git_hash())) logger.info('The config is : https://github.com/lxtGH/PanopticSegMM/tree/{}/{}'.format(get_git_hash(), args.config)) # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info meta['config'] = cfg.pretty_text # log some basic info logger.info(f'Distributed training: {distributed}') logger.info(f'Config:\n{cfg.pretty_text}') # set random seeds if args.seed is not None: logger.info(f'Set random seed to {args.seed}, ' f'deterministic: {args.deterministic}') set_random_seed(args.seed, deterministic=args.deterministic) else: set_random_seed(0, deterministic=args.deterministic) cfg.seed = args.seed meta['seed'] = args.seed meta['exp_name'] = osp.basename(args.config) model = build_detector( cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) model.init_weights() logger.info(f'Model:\n{model}') datasets = [build_dataset(cfg.data.train)] if len(cfg.workflow) == 2: 
class DSTQuality(STQuality):
    """Depth-aware Segmentation and Tracking Quality (DSTQ).

    Extends ``STQuality`` with a depth quality (DQ) term. For each configured
    threshold, DQ is the number of inlier pixels divided by the number of
    pixels with a positive ground-truth depth, where an inlier is a pixel with
    positive ground truth and positive prediction whose ratio
    ``max(pred / gt, gt / pred)`` does not exceed the threshold.
    """

    def __init__(
            self,
            num_classes: int,
            things_list: Sequence[int],
            ignore_label: int,
            label_bit_shift: int,
            offset: int,
            depth_threshold: Tuple[float] = (1.25, 1.1),
            name: str = 'dstq'
    ):
        """Set up the STQ state and the per-threshold depth counters.

        Args:
            num_classes: Forwarded to ``STQuality``.
            things_list: Forwarded to ``STQuality``.
            ignore_label: Forwarded to ``STQuality``.
            label_bit_shift: Forwarded to ``STQuality``.
            offset: Forwarded to ``STQuality``.
            depth_threshold: Non-empty tuple or list of inlier thresholds.
            name: Not used by this class.

        Raises:
            TypeError: If ``depth_threshold`` is neither a tuple nor a list.
            ValueError: If ``depth_threshold`` is empty.
        """
        super().__init__(
            num_classes=num_classes,
            things_list=things_list,
            ignore_label=ignore_label,
            label_bit_shift=label_bit_shift,
            offset=offset
        )
        if not isinstance(depth_threshold, (tuple, list)):
            raise TypeError('The type of depth_threshold must be tuple or list.')
        if not depth_threshold:
            raise ValueError('depth_threshold must be non-empty.')
        self._depth_threshold = tuple(depth_threshold)
        # sequence_id -> number of pixels with a valid ground-truth depth.
        self._depth_total_counts = collections.OrderedDict()
        # One mapping (sequence_id -> inlier count) per threshold, in the
        # same order as self._depth_threshold.
        self._depth_inlier_counts = [
            collections.OrderedDict() for _ in self._depth_threshold
        ]

    def update_state(
            self,
            y_true: np.ndarray,
            y_pred: np.ndarray,
            d_true: np.ndarray,
            d_pred: np.ndarray,
            sequence_id: int = 0
    ):
        """Accumulate segmentation, tracking and depth statistics for a frame.

        Args:
            y_true: The ground-truth panoptic label map for a particular
                video frame (defined as semantic_map *
                max_instances_per_category + instance_map).
            y_pred: The predicted panoptic label map for a particular video
                frame (same encoding as ``y_true``).
            d_true: The ground-truth depth map for this video frame.
            d_pred: The predicted depth map for this video frame.
            sequence_id: The optional ID of the sequence the frames belong
                to. When no sequence is given, all frames are considered to
                belong to the same sequence (default: 0).
        """
        super().update_state(y_true, y_pred, sequence_id)

        # Ground-truth depth is valid where it is positive; every such pixel
        # counts towards the denominator.
        gt_has_depth = d_true > 0
        num_labelled = np.sum(gt_has_depth.astype(int))

        # TODO: the original author flagged this masking as "hackable".
        # Pixels with a non-positive prediction are left out of the ratio
        # below but are still part of the total counted above.
        comparable = np.logical_and(gt_has_depth, d_pred > 0)
        gt_depth = d_true[comparable]
        pred_depth = d_pred[comparable]
        ratio = np.maximum(pred_depth / gt_depth, gt_depth / pred_depth)

        # Count the inliers for every threshold.
        for threshold_index, threshold in enumerate(self._depth_threshold):
            hits = np.sum((ratio <= threshold).astype(int))
            per_sequence = self._depth_inlier_counts[threshold_index]
            previous_hits = per_sequence.get(sequence_id, 0)
            per_sequence[sequence_id] = previous_hits + int(hits)

        # Count the valid depth labels of this sequence.
        previous_total = self._depth_total_counts.get(sequence_id, 0)
        self._depth_total_counts[sequence_id] = (
            previous_total + int(num_labelled))

    def result(self):
        """Compute the depth-aware segmentation and tracking quality.

        Returns:
            A dictionary containing every entry of ``STQuality.result()``
            plus:
              - 'DSTQ': The total DSTQ score, (STQ ** 2 * DQ) ** (1 / 3).
              - 'DSTQ@thres': The total DSTQ score for threshold thres.
              - 'DSTQ_per_seq@thres': A list of DSTQ scores per sequence for
                thres.
              - 'DQ': The total DQ score (geometric mean over thresholds).
              - 'DQ@thres': The total DQ score for threshold thres.
              - 'DQ_per_seq@thres': A list of DQ scores per sequence for
                thres.
        """
        # STQ part of the result.
        stq_results = super().result()

        # Depth quality per sequence and per threshold.
        dq_per_seq_at_threshold = {}
        dq_at_threshold = {}
        for threshold_index, threshold in enumerate(self._depth_threshold):
            inliers_by_sequence = self._depth_inlier_counts[threshold_index]
            per_sequence_dq = [0] * len(self._ground_truth)
            dq_per_seq_at_threshold[threshold] = per_sequence_dq
            total_count = 0
            inlier_count = 0
            # Sequences are visited in the order of _ground_truth so the
            # lists line up with the per-sequence STQ lists.
            for index, sequence_id in enumerate(self._ground_truth):
                sequence_inlier = inliers_by_sequence[sequence_id]
                sequence_total = self._depth_total_counts[sequence_id]
                if sequence_total > 0:
                    per_sequence_dq[index] = sequence_inlier / sequence_total
                total_count += sequence_total
                inlier_count += sequence_inlier
            dq_at_threshold[threshold] = (
                inlier_count / total_count if total_count != 0 else 0)

        # DQ is the geometric mean of the per-threshold DQ values.
        dq = 1
        for threshold in self._depth_threshold:
            dq *= dq_at_threshold[threshold]
        dq = dq ** (1 / len(self._depth_threshold))

        dq_results = {'DQ': dq}
        for threshold in self._depth_threshold:
            dq_results['DQ@{}'.format(threshold)] = dq_at_threshold[threshold]
            dq_results['DQ_per_seq@{}'.format(
                threshold)] = dq_per_seq_at_threshold[threshold]

        # Combine STQ and DQ into DSTQ.
        dstq_results = {'DSTQ': (stq_results['STQ'] ** 2 * dq) ** (1 / 3)}
        for threshold in self._depth_threshold:
            dstq_results['DSTQ@{}'.format(threshold)] = (
                stq_results['STQ'] ** 2 * dq_at_threshold[threshold]
            ) ** (1 / 3)
            per_sequence_dstq = []
            for stq_result, dq_result in zip(
                    stq_results['STQ_per_seq'],
                    dq_per_seq_at_threshold[threshold]):
                per_sequence_dstq.append(
                    (stq_result ** 2 * dq_result) ** (1 / 3))
            dstq_results['DSTQ_per_seq@{}'.format(threshold)] = (
                per_sequence_dstq)

        # Merge everything; STQ and DQ entries are added after the DSTQ ones.
        dstq_results.update(stq_results)
        dstq_results.update(dq_results)
        return dstq_results

    def reset_states(self):
        """Reset all accumulated state, including the depth counters."""
        super().reset_states()
        self._depth_total_counts = collections.OrderedDict()
        self._depth_inlier_counts = [
            collections.OrderedDict() for _ in self._depth_threshold
        ]
""" import collections from typing import Mapping, MutableMapping, Sequence, Text, Any import numpy as np _EPSILON = 1e-15 def _update_dict_stats(stat_dict: MutableMapping[int, np.ndarray], id_array: np.ndarray): """Updates a given dict with corresponding counts.""" ids, counts = np.unique(id_array, return_counts=True) for idx, count in zip(ids, counts): if idx in stat_dict: stat_dict[idx] += count else: stat_dict[idx] = count class STQuality(object): """Metric class for the Segmentation and Tracking Quality (STQ). Please see the following paper for more details about the metric: "STEP: Segmenting and Tracking Every Pixel", Weber et al., arXiv:2102.11859, 2021. The metric computes the geometric mean of two terms. - Association Quality: This term measures the quality of the video_knet_vis ID assignment for `thing` classes. It is formulated as a weighted IoU measure. - Segmentation Quality: This term measures the semantic segmentation quality. The standard class IoU measure is used for this. Example usage: stq_obj = segmentation_tracking_quality.STQuality(num_classes, things_list, ignore_label, label_bit_shift, offset) stq_obj.update_state(y_true_1, y_pred_1) stq_obj.update_state(y_true_2, y_pred_2) ... result = stq_obj.result() """ def __init__(self, num_classes: int, things_list: Sequence[int], ignore_label: int, label_bit_shift: int, offset: int): """Initialization of the STQ metric. Args: num_classes: Number of classes in the dataset as an integer. things_list: A sequence of class ids that belong to `things`. ignore_label: The class id to be ignored in evaluation as an integer or integer tensor. label_bit_shift: The number of bits the class label is shifted as an integer -> (class_label << bits) + trackingID offset: The maximum number of unique labels as an integer or integer tensor. 
""" self._num_classes = num_classes self._ignore_label = ignore_label self._things_list = things_list self._label_bit_shift = label_bit_shift self._bit_mask = (2 ** label_bit_shift) - 1 if ignore_label >= num_classes: self._confusion_matrix_size = num_classes + 1 self._include_indices = np.arange(self._num_classes) else: self._confusion_matrix_size = num_classes self._include_indices = np.array( [i for i in range(num_classes) if i != self._ignore_label]) self._iou_confusion_matrix_per_sequence = collections.OrderedDict() self._predictions = collections.OrderedDict() self._ground_truth = collections.OrderedDict() self._intersections = collections.OrderedDict() self._sequence_length = collections.OrderedDict() self._offset = offset lower_bound = num_classes << self._label_bit_shift if offset < lower_bound: raise ValueError('The provided offset %d is too small. No guarantess ' 'about the correctness of the results can be made. ' 'Please choose an offset that is higher than num_classes' ' * max_instances_per_category = %d' % lower_bound) def get_semantic(self, y: np.ndarray) -> np.ndarray: """Returns the semantic class from a panoptic label map.""" return y >> self._label_bit_shift def update_state(self, y_true: np.ndarray, y_pred: np.ndarray, sequence_id=0): """Accumulates the segmentation and tracking quality statistics. IMPORTANT: When encoding the parameters y_true and y_pred, please be aware that the `+` operator binds higher than the label shift `<<` operator. Args: y_true: The ground-truth panoptic label map for a particular video frame (defined as (semantic_map << label_bit_shift) + instance_map). y_pred: The predicted panoptic label map for a particular video frame (defined as (semantic_map << label_bit_shift) + instance_map). sequence_id: The optional ID of the sequence the frames belong to. When no sequence is given, all frames are considered to belong to the same sequence (default: 0). 
""" y_true = y_true.astype(np.int64) y_pred = y_pred.astype(np.int64) semantic_label = self.get_semantic(y_true) semantic_prediction = self.get_semantic(y_pred) # Check if the ignore value is outside the range [0, num_classes]. If yes, # map `_ignore_label` to `_num_classes`, so it can be used to create the # confusion matrix. if self._ignore_label > self._num_classes: semantic_label = np.where(semantic_label != self._ignore_label, semantic_label, self._num_classes) semantic_prediction = np.where(semantic_prediction != self._ignore_label, semantic_prediction, self._num_classes) if sequence_id in self._iou_confusion_matrix_per_sequence: idxs = (np.reshape(semantic_label, [-1]) << self._label_bit_shift) + np.reshape(semantic_prediction, [-1]) unique_idxs, counts = np.unique(idxs, return_counts=True) self._iou_confusion_matrix_per_sequence[sequence_id][ unique_idxs >> self._label_bit_shift, unique_idxs & self._bit_mask] += counts self._sequence_length[sequence_id] += 1 else: self._iou_confusion_matrix_per_sequence[sequence_id] = np.zeros( (self._confusion_matrix_size, self._confusion_matrix_size), dtype=np.int64) idxs = np.stack([ np.reshape(semantic_label, [-1]), np.reshape(semantic_prediction, [-1]) ], axis=0) np.add.at(self._iou_confusion_matrix_per_sequence[sequence_id], tuple(idxs), 1) self._predictions[sequence_id] = {} self._ground_truth[sequence_id] = {} self._intersections[sequence_id] = {} self._sequence_length[sequence_id] = 1 instance_label = y_true & self._bit_mask # 0xFFFF == 2 ^ 16 - 1 label_mask = np.zeros_like(semantic_label, dtype=np.bool) prediction_mask = np.zeros_like(semantic_prediction, dtype=np.bool) for things_class_id in self._things_list: label_mask = np.logical_or(label_mask, semantic_label == things_class_id) prediction_mask = np.logical_or(prediction_mask, semantic_prediction == things_class_id) # Select the `crowd` region of the current class. This region is encoded # instance id `0`. 
is_crowd = np.logical_and(instance_label == 0, label_mask) # Select the non-crowd region of the corresponding class as the `crowd` # region is ignored for the tracking term. label_mask = np.logical_and(label_mask, np.logical_not(is_crowd)) # Do not punish id assignment for regions that are annotated as `crowd` in # the ground-truth. prediction_mask = np.logical_and(prediction_mask, np.logical_not(is_crowd)) seq_preds = self._predictions[sequence_id] seq_gts = self._ground_truth[sequence_id] seq_intersects = self._intersections[sequence_id] # Compute and update areas of ground-truth, predictions and intersections. _update_dict_stats(seq_preds, y_pred[prediction_mask]) _update_dict_stats(seq_gts, y_true[label_mask]) non_crowd_intersection = np.logical_and(label_mask, prediction_mask) intersection_ids = ( y_true[non_crowd_intersection] * self._offset + y_pred[non_crowd_intersection]) _update_dict_stats(seq_intersects, intersection_ids) def result(self) -> Mapping[Text, Any]: """Computes the segmentation and tracking quality. Returns: A dictionary containing: - 'STQ': The total STQ score. - 'AQ': The total association quality (AQ) score. - 'IoU': The total mean IoU. - 'STQ_per_seq': A list of the STQ score per sequence. - 'AQ_per_seq': A list of the AQ score per sequence. - 'IoU_per_seq': A list of mean IoU per sequence. - 'Id_per_seq': A list of string-type sequence Ids to map list index to sequence. - 'Length_per_seq': A list of the length of each sequence. 
""" # Compute association quality (AQ) num_tubes_per_seq = [0] * len(self._ground_truth) aq_per_seq = [0] * len(self._ground_truth) iou_per_seq = [0] * len(self._ground_truth) id_per_seq = [''] * len(self._ground_truth) for index, sequence_id in enumerate(self._ground_truth): outer_sum = 0.0 predictions = self._predictions[sequence_id] ground_truth = self._ground_truth[sequence_id] intersections = self._intersections[sequence_id] num_tubes_per_seq[index] = len(ground_truth) id_per_seq[index] = sequence_id for gt_id, gt_size in ground_truth.items(): inner_sum = 0.0 for pr_id, pr_size in predictions.items(): tpa_key = self._offset * gt_id + pr_id if tpa_key in intersections: tpa = intersections[tpa_key] fpa = pr_size - tpa fna = gt_size - tpa inner_sum += tpa * (tpa / (tpa + fpa + fna)) outer_sum += 1.0 / gt_size * inner_sum aq_per_seq[index] = outer_sum aq_mean = np.sum(aq_per_seq) / np.maximum( np.sum(num_tubes_per_seq), _EPSILON) aq_per_seq = aq_per_seq / np.maximum(num_tubes_per_seq, _EPSILON) # Compute IoU scores. # The rows correspond to ground-truth and the columns to predictions. # Remove fp from confusion matrix for the void/ignore class. total_confusion = np.zeros( (self._confusion_matrix_size, self._confusion_matrix_size), dtype=np.int64) for index, confusion in enumerate( self._iou_confusion_matrix_per_sequence.values()): removal_matrix = np.zeros_like(confusion) removal_matrix[self._include_indices, :] = 1.0 confusion *= removal_matrix total_confusion += confusion # `intersections` corresponds to true positives. intersections = confusion.diagonal() fps = confusion.sum(axis=0) - intersections fns = confusion.sum(axis=1) - intersections unions = intersections + fps + fns num_classes = np.count_nonzero(unions) ious = ( intersections.astype(np.double) / np.maximum(unions, 1e-15).astype(np.double)) iou_per_seq[index] = np.sum(ious) / num_classes # `intersections` corresponds to true positives. 
intersections = total_confusion.diagonal() fps = total_confusion.sum(axis=0) - intersections fns = total_confusion.sum(axis=1) - intersections unions = intersections + fps + fns num_classes = np.count_nonzero(unions) ious = ( intersections.astype(np.double) / np.maximum(unions, _EPSILON).astype(np.double)) iou_mean = np.sum(ious) / num_classes st_quality = np.sqrt(aq_mean * iou_mean) st_quality_per_seq = np.sqrt(aq_per_seq * iou_per_seq) return { 'STQ': st_quality, 'AQ': aq_mean, 'IoU': float(iou_mean), 'STQ_per_seq': st_quality_per_seq, 'AQ_per_seq': aq_per_seq, 'IoU_per_seq': iou_per_seq, 'ID_per_seq': id_per_seq, 'Length_per_seq': list(self._sequence_length.values()), } def reset_states(self): """Resets all states that accumulated data.""" self._iou_confusion_matrix_per_sequence = collections.OrderedDict() self._predictions = collections.OrderedDict() self._ground_truth = collections.OrderedDict() self._intersections = collections.OrderedDict() self._sequence_length = collections.OrderedDict() ================================================ FILE: tools/utils/cityscapesvps_eval.py ================================================ from __future__ import print_function import argparse import os import os.path as osp import torch.multiprocessing as multiprocessing import numpy as np import json from PIL import Image import pickle from torch.utils.data import Dataset class CityscapesVps(Dataset): def __init__(self): super(CityscapesVps, self).__init__() self.nframes_per_video = 6 self.lambda_ = 5 self.labeled_fid = 20 def _save_image_single_core(self, proc_id, images_set, names_set, colors = None): def colorize(gray, palette): # gray: numpy array of the label and 1*3N size list palette color = Image.fromarray(gray.astype(np.uint8)).convert('P') color.putpalette(palette) return color for working_idx, (image, name) in enumerate(zip(images_set, names_set)): if colors is not None: image = colorize(image, colors) else: image = Image.fromarray(image) 
os.makedirs(os.path.dirname(name), exist_ok=True) image.save(name) def inference_panoptic_video(self, pred_pans_2ch, output_dir, categories, names, n_video=0): from panopticapi.utils import IdGenerator # Sample only frames with GT annotations. if len(pred_pans_2ch) != len(names): pred_pans_2ch = pred_pans_2ch[(self.labeled_fid // self.lambda_)::self.lambda_] categories = {el['id']: el for el in categories} color_generator = IdGenerator(categories) def get_pred_large(pan_2ch_all, vid_num, nframes_per_video=6): vid_num = len(pan_2ch_all) // nframes_per_video # 10 cpu_num = multiprocessing.cpu_count() // 2 # 32 --> 16 nprocs = min(vid_num, cpu_num) # 10 max_nframes = cpu_num * nframes_per_video nsplits = (len(pan_2ch_all) - 1) // max_nframes + 1 annotations, pan_all = [], [] for i in range(0, len(pan_2ch_all), max_nframes): print('==> Read and convert VPS output - split %d/%d' % ((i // max_nframes) + 1, nsplits)) pan_2ch_part = pan_2ch_all[i:min( i + max_nframes, len(pan_2ch_all))] pan_2ch_split = np.array_split(pan_2ch_part, nprocs) workers = multiprocessing.Pool(processes=nprocs) processes = [] for proc_id, pan_2ch_set in enumerate(pan_2ch_split): p = workers.apply_async( self.converter_2ch_track_core, (proc_id, pan_2ch_set, color_generator)) processes.append(p) workers.close() workers.join() for p in processes: p = p.get() annotations.extend(p[0]) pan_all.extend(p[1]) pan_json = {'annotations': annotations} return pan_all, pan_json def save_image(images, save_folder, names, colors=None): os.makedirs(save_folder, exist_ok=True) names = [osp.join(save_folder, name.replace('_leftImg8bit', '').replace('_newImg8bit', '').replace('jpg', 'png').replace( 'jpeg', 'png')) for name in names] cpu_num = multiprocessing.cpu_count() // 2 images_split = np.array_split(images, cpu_num) names_split = np.array_split(names, cpu_num) workers = multiprocessing.Pool(processes=cpu_num) for proc_id, (images_set, names_set) in enumerate(zip(images_split, names_split)): 
workers.apply_async(self._save_image_single_core, (proc_id, images_set, names_set, colors)) workers.close() workers.join() # inference_panoptic_video pred_pans, pred_json = get_pred_large(pred_pans_2ch, vid_num=n_video) print('--------------------------------------') print('==> Saving VPS output png files') os.makedirs(output_dir, exist_ok=True) save_image(pred_pans_2ch, osp.join(output_dir, 'pan_2ch'), names) save_image(pred_pans, osp.join(output_dir, 'pan_pred'), names) print('==> Saving pred.jsons file') json.dump(pred_json, open(osp.join(output_dir, 'pred.json'), 'w')) print('--------------------------------------') return pred_pans, pred_json def converter_2ch_track_core(self, proc_id, pan_2ch_set, color_generator): from panopticapi.utils import rgb2id OFFSET = 1000 VOID = 255 annotations, pan_all = [], [] # reference dict to used color inst2color = {} for idx in range(len(pan_2ch_set)): pan_2ch = np.uint32(pan_2ch_set[idx]) # pan_2ch: ss-seg maps[:,:,0], id-seg maps[:,:,1] pan = OFFSET * pan_2ch[:, :, 0] + pan_2ch[:, :, 1] pan_format = np.zeros((pan_2ch.shape[0], pan_2ch.shape[1], 3), dtype=np.uint8) l = np.unique(pan) segm_info = {} for el in l: sem = el // OFFSET if sem == VOID: continue mask = pan == el #### handling used color for inst id if el % OFFSET > 0: # if el > OFFSET: # things class if el in inst2color: color = inst2color[el] else: color = color_generator.get_color(sem) inst2color[el] = color else: # stuff class color = color_generator.get_color(sem) pan_format[mask] = color index = np.where(mask) x = index[1].min() y = index[0].min() width = index[1].max() - x height = index[0].max() - y dt = {"category_id": sem.item(), "iscrowd": 0, "id": int(rgb2id(color)), "bbox": [x.item(), y.item(), width.item(), height.item()], "area": mask.sum().item()} segment_id = int(rgb2id(color)) segm_info[segment_id] = dt # annotations.append({"segments_info": segm_info}) pan_all.append(pan_format) gt_pan = np.uint32(pan_format) # rgb2id for evaluation pan_gt = 
def single_gpu_test(model,
                    data_loader,
                    out_dir=None,
                    ):
    """Runs the model over every batch of `data_loader` without gradients.

    The keys `'instance_map'`, `'segments_info'` and `'depth_final'` are read
    from each model output and then discarded; nothing is written to
    `out_dir`, which is only reported on stdout.

    Args:
        model: Callable module invoked as
            `model(return_loss=False, rescale=True, **data)`.
        data_loader: Iterable of batches exposing a `dataset` attribute.
        out_dir: Output directory name; `'logger/blackhole'` when omitted.

    Returns:
        Always `None`.
    """
    target_dir = 'logger/blackhole' if out_dir is None else out_dir
    print("The output dir is {}".format(target_dir))

    model.eval()
    progress = mmcv.ProgressBar(len(data_loader.dataset))
    for batch in data_loader:
        with torch.no_grad():
            outputs = model(return_loss=False, rescale=True, **batch)
        # The lookups are kept so that a missing key still raises KeyError.
        _instance_map = outputs['instance_map']
        _segments_info = outputs['segments_info']
        _depth = outputs['depth_final']
        progress.update()
    return None
help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') parser.add_argument( '--format-only', action='store_true', help='Format the output results without perform evaluation. It is' 'useful when you want to format the result to a specific format and ' 'submit it to the test server') parser.add_argument( '--eval', type=str, nargs='+', help='evaluation metrics, which depends on the dataset, e.g., "bbox",' ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') parser.add_argument('--show', action='store_true', help='show results') parser.add_argument( '--show-dir', help='directory where painted images will be saved') parser.add_argument( '--show-score-thr', type=float, default=0.3, help='score threshold (default: 0.3)') parser.add_argument( '--gpu-collect', action='store_true', help='whether to use gpu to collect results.') parser.add_argument( '--tmpdir', help='tmp directory used for collecting results from multiple ' 'workers, available when gpu-collect is not specified') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function (deprecate), ' 'change to --eval-options instead.') parser.add_argument( '--eval-options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) if args.options and args.eval_options: raise ValueError( '--options and --eval-options cannot be both ' 'specified, --options is deprecated in favor of --eval-options') if args.options: warnings.warn('--options is deprecated in favor of --eval-options') args.eval_options = args.options return args def main(): args = parse_args() print(args) cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. 
if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None if cfg.model.get('neck'): if isinstance(cfg.model.neck, list): for neck_cfg in cfg.model.neck: if neck_cfg.get('rfp_backbone'): if neck_cfg.rfp_backbone.get('pretrained'): neck_cfg.rfp_backbone.pretrained = None elif cfg.model.neck.get('rfp_backbone'): if cfg.model.neck.rfp_backbone.get('pretrained'): cfg.model.neck.rfp_backbone.pretrained = None # in case the test dataset is concatenated samples_per_gpu = 1 if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) if samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor( cfg.data.test.pipeline) elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True samples_per_gpu = max( [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) if samples_per_gpu > 1: for ds_cfg in cfg.data.test: ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) # init distributed env first, since logger depends on the dist info. 
def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3):
    """Runs single-GPU inference and returns one flat list of results.

    Args:
        model: Module invoked as
            `model(return_loss=False, rescale=True, **data)`; each call is
            expected to return a list with one list of results per sample.
        data_loader: Iterable of batches exposing a `dataset` attribute.
        show: Unused; kept so existing callers keep working.
        out_dir: Unused; kept so existing callers keep working.
        show_score_thr: Unused; kept so existing callers keep working.

    Returns:
        list: The per-sample result lists concatenated in order.
    """
    model.eval()
    results = []
    dataset = data_loader.dataset
    prog_bar = mmcv.ProgressBar(len(dataset))
    for data in data_loader:
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)

        batch_size = len(result)
        # encode mask results
        for idx, sample_result in enumerate(result):
            if isinstance(sample_result[0], tuple):
                result[idx] = [
                    (bbox_results, encode_mask_results(mask_results))
                    for bbox_results, mask_results in sample_result
                ]
        results.extend(result)

        for _ in range(batch_size):
            prog_bar.update()

    # Flatten in one linear pass; `sum(results, [])` copies the accumulated
    # list on every addition and is quadratic in the number of results.
    return [item for sample_result in results for item in sample_result]
def collect_results_cpu(result_part, size, tmpdir=None):
    """Gathers per-rank results on rank 0 through files in a shared directory.

    Every rank dumps its part to `<tmpdir>/part_<rank>.pkl`. After a barrier,
    rank 0 loads all parts, interleaves them element by element across ranks,
    truncates to `size` (the dataloader may pad some samples) and deletes the
    directory.

    Args:
        result_part: Results produced by the calling rank.
        size: Number of results to keep after merging.
        tmpdir: Directory shared by all ranks. When `None`, rank 0 creates one
            under `.dist_test` and broadcasts its path to the other ranks.

    Returns:
        The merged list on rank 0, `None` on every other rank.
    """
    rank, world_size = get_dist_info()

    if tmpdir is not None:
        mmcv.mkdir_or_exist(tmpdir)
    else:
        # The path travels as a fixed-size byte buffer padded with spaces
        # (ASCII 32); the padding is stripped again after decoding.
        max_len = 512
        path_buffer = torch.full((max_len, ),
                                 32,
                                 dtype=torch.uint8,
                                 device='cuda')
        if rank == 0:
            mmcv.mkdir_or_exist('.dist_test')
            created_dir = tempfile.mkdtemp(dir='.dist_test')
            encoded_dir = torch.tensor(
                bytearray(created_dir.encode()),
                dtype=torch.uint8,
                device='cuda')
            path_buffer[:len(encoded_dir)] = encoded_dir
        dist.broadcast(path_buffer, 0)
        tmpdir = path_buffer.cpu().numpy().tobytes().decode().rstrip()

    # Each rank writes its own part, then waits until all parts exist.
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()

    if rank != 0:
        return None

    parts = [
        mmcv.load(osp.join(tmpdir, f'part_{part_rank}.pkl'))
        for part_rank in range(world_size)
    ]
    # Take one result from every rank in turn to restore the dataset order.
    merged = [item for group in zip(*parts) for item in group]
    merged = merged[:size]
    shutil.rmtree(tmpdir)
    return merged
torch.tensor( bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') # gather all result part tensor shape shape_tensor = torch.tensor(part_tensor.shape, device='cuda') shape_list = [shape_tensor.clone() for _ in range(world_size)] dist.all_gather(shape_list, shape_tensor) # padding result part tensor to max length shape_max = torch.tensor(shape_list).max() part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') part_send[:shape_tensor[0]] = part_tensor part_recv_list = [ part_tensor.new_zeros(shape_max) for _ in range(world_size) ] # gather all result part dist.all_gather(part_recv_list, part_send) if rank == 0: part_list = [] for recv, shape in zip(part_recv_list, shape_list): part_list.append( pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] return ordered_results ================================================ FILE: tools_vis/dist_test_whole_video.sh ================================================ #!/usr/bin/env bash CONFIG=$1 CHECKPOINT=$2 GPUS=$3 PORT=${PORT:-$((29500 + $RANDOM % 29))} if command -v torchrun &> /dev/null then echo "Using torchrun mode." PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ torchrun --nproc_per_node=$GPUS --master_port=$PORT \ $(dirname "$0")/test_whole_video.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} else echo "Using launch mode." 
#!/usr/bin/env bash
# Launches test_whole_video.py through srun.
# Usage: slurm_test_vis.sh PARTITION JOB_NAME CONFIG CHECKPOINT [PY_ARGS...]
# Optional environment: GPUS, GPUS_PER_NODE, CPUS_PER_TASK, SRUN_ARGS.

set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
CHECKPOINT=$4
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
PY_ARGS=${@:5}
SRUN_ARGS=${SRUN_ARGS:-""}

# The entry point is resolved next to this script (as in
# dist_test_whole_video.sh) instead of the stale "tools2/" path.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u "$(dirname "$0")/test_whole_video.py" ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}
def parse_args():
    """Parses the command line of the test script.

    Besides parsing, this exports `LOCAL_RANK` to the environment when it is
    not set yet and folds the deprecated `--options` into `--eval-options`.

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both `--options` and `--eval-options` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')

    # (flags, keyword arguments), registered in this order.
    argument_specs = [
        (('config', ), dict(help='test config file path')),
        (('checkpoint', ), dict(help='checkpoint file')),
        (('--work-dir', ), dict(
            help='the directory to save the file containing evaluation '
            'metrics')),
        (('--out', ), dict(help='output result file in pickle format')),
        (('--fuse-conv-bn', ), dict(
            action='store_true',
            help='Whether to fuse conv and bn, this will slightly increase'
            'the inference speed')),
        (('--gpu-ids', ), dict(
            type=int,
            nargs='+',
            help='ids of gpus to use '
            '(only applicable to non-distributed testing)')),
        (('--format-only', ), dict(
            action='store_true',
            help='Format the output results without perform evaluation. It is'
            'useful when you want to format the result to a specific format '
            'and submit it to the test server')),
        (('--eval', ), dict(
            type=str,
            nargs='+',
            help='evaluation metrics, which depends on the dataset, e.g., '
            '"bbox", "segm", "proposal" for COCO, and "mAP", "recall" for '
            'PASCAL VOC')),
        (('--show', ), dict(action='store_true', help='show results')),
        (('--show-dir', ), dict(
            help='directory where painted images will be saved')),
        (('--show-score-thr', ), dict(
            type=float,
            default=0.3,
            help='score threshold (default: 0.3)')),
        (('--gpu-collect', ), dict(
            action='store_true',
            help='whether to use gpu to collect results.')),
        (('--tmpdir', ), dict(
            help='tmp directory used for collecting results from multiple '
            'workers, available when gpu-collect is not specified')),
        (('--cfg-options', ), dict(
            nargs='+',
            action=DictAction,
            help='override some settings in the used config, the key-value '
            'pair in xxx=yyy format will be merged into config file. If the '
            'value to be overwritten is a list, it should be like '
            'key="[a,b]" or key=a,b It also allows nested list/tuple values, '
            'e.g. key="[(a,b),(c,d)]" Note that the quotation marks are '
            'necessary and that no white space is allowed.')),
        (('--options', ), dict(
            nargs='+',
            action=DictAction,
            help='custom options for evaluation, the key-value pair in '
            'xxx=yyy format will be kwargs for dataset.evaluate() function '
            '(deprecate), change to --eval-options instead.')),
        (('--eval-options', ), dict(
            nargs='+',
            action=DictAction,
            help='custom options for evaluation, the key-value pair in '
            'xxx=yyy format will be kwargs for dataset.evaluate() function')),
        (('--launcher', ), dict(
            choices=['none', 'pytorch', 'slurm', 'mpi'],
            default='none',
            help='job launcher')),
        (('--local_rank', ), dict(type=int, default=0)),
    ]
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()
    # Only fills LOCAL_RANK in when the launcher has not exported it already.
    os.environ.setdefault('LOCAL_RANK', str(args.local_rank))

    if args.options:
        if args.eval_options:
            raise ValueError(
                '--options and --eval-options cannot be both '
                'specified, --options is deprecated in favor of --eval-options')
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args
neck_cfg.get('rfp_backbone'): if neck_cfg.rfp_backbone.get('pretrained'): neck_cfg.rfp_backbone.pretrained = None elif cfg.model.neck.get('rfp_backbone'): if cfg.model.neck.rfp_backbone.get('pretrained'): cfg.model.neck.rfp_backbone.pretrained = None # in case the test dataset is concatenated samples_per_gpu = 1 if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) if samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor( cfg.data.test.pipeline) elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True samples_per_gpu = max( [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) if samples_per_gpu > 1: for ds_cfg in cfg.data.test: ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids else: cfg.gpu_ids = range(1) # init distributed env first, since logger depends on the dist info. 
if args.launcher == 'none': distributed = False if len(cfg.gpu_ids) > 1: warnings.warn( f'We treat {cfg.gpu_ids} as gpu-ids, and reset to ' f'{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential error in ' 'non-distribute testing time.') cfg.gpu_ids = cfg.gpu_ids[0:1] else: distributed = True init_dist(args.launcher, **cfg.dist_params) rank, _ = get_dist_info() # allows not to create if args.work_dir is not None and rank == 0: mmcv.mkdir_or_exist(osp.abspath(args.work_dir)) timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) json_file = osp.join(args.work_dir, f'eval_{timestamp}.json') # build the dataloader dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, samples_per_gpu=samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint cfg.model.train_cfg = None model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_conv_bn(model) # old versions did not save class info in checkpoints, this walkaround is # for backward compatibility if 'CLASSES' in checkpoint.get('meta', {}): model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES if not distributed: model = MMDataParallel(model, device_ids=cfg.gpu_ids) outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, args.show_score_thr) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) kwargs = {} if args.eval_options is None else args.eval_options if args.format_only: dataset.format_results(outputs, **kwargs) if 
def parse_args():
    """Parse the command-line arguments of the whole-video test script.

    Besides building the namespace, this function:

    * exports ``LOCAL_RANK`` into ``os.environ`` (from ``--local_rank``)
      when the variable is not already set;
    * maps the deprecated ``--options`` onto ``eval_options`` and emits a
      warning when it is used.

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--work-dir',
        help='the directory to save the file containing evaluation metrics')
    parser.add_argument('--out', help='output result file in pickle format')
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment that continues a sentence must end with a space.
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    parser.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use '
        '(only applicable to non-distributed testing)')
    parser.add_argument(
        '--format-only',
        action='store_true',
        help='Format the output results without perform evaluation. It is '
        'useful when you want to format the result to a specific format and '
        'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where painted images will be saved')
    parser.add_argument(
        '--show-score-thr',
        type=float,
        default=0.3,
        help='score threshold (default: 0.3)')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
        'workers, available when gpu-collect is not specified')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecate), '
        'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    # Only fill in LOCAL_RANK when the launcher has not exported it already.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args
neck_cfg.get('rfp_backbone'): if neck_cfg.rfp_backbone.get('pretrained'): neck_cfg.rfp_backbone.pretrained = None elif cfg.model.neck.get('rfp_backbone'): if cfg.model.neck.rfp_backbone.get('pretrained'): cfg.model.neck.rfp_backbone.pretrained = None # in case the test dataset is concatenated samples_per_gpu = 1 if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) if samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor( cfg.data.test.pipeline) elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True samples_per_gpu = max( [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) if samples_per_gpu > 1: for ds_cfg in cfg.data.test: ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids else: cfg.gpu_ids = range(1) # init distributed env first, since logger depends on the dist info. 
if args.launcher == 'none': distributed = False if len(cfg.gpu_ids) > 1: warnings.warn( f'We treat {cfg.gpu_ids} as gpu-ids, and reset to ' f'{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential error in ' 'non-distribute testing time.') cfg.gpu_ids = cfg.gpu_ids[0:1] else: distributed = True init_dist(args.launcher, **cfg.dist_params) rank, _ = get_dist_info() # allows not to create if args.work_dir is not None and rank == 0: mmcv.mkdir_or_exist(osp.abspath(args.work_dir)) timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) json_file = osp.join(args.work_dir, f'eval_{timestamp}.json') # build the dataloader dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, samples_per_gpu=samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint cfg.model.train_cfg = None model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_conv_bn(model) # old versions did not save class info in checkpoints, this walkaround is # for backward compatibility if 'CLASSES' in checkpoint.get('meta', {}): model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES if not distributed: model = MMDataParallel(model, device_ids=cfg.gpu_ids) outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, args.show_score_thr) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) kwargs = {} if args.eval_options is None else args.eval_options kwargs['resfile_path'] = args.checkpoint.replace('.pth', '_results') if 
class BaseTrack:
    """Common state and interface shared by every track type.

    Sub-classes must implement ``activate``, ``predict`` and ``update``;
    the base versions only raise ``NotImplementedError``.
    """

    # Global id counter; always accessed through ``BaseTrack`` itself (see
    # ``next_id``), so ids are unique across all sub-classes.
    _count = 0

    # Identity / lifecycle defaults.
    track_id = 0
    is_activated = False
    state = TrackState.New

    # NOTE: these two are class-level objects, i.e. shared by every
    # instance that does not rebind the attribute on itself.
    history = OrderedDict()
    features = []

    curr_feature = None
    score = 0

    # Frame bookkeeping.
    start_frame = 0
    frame_id = 0
    time_since_update = 0

    # multi-camera
    location = (np.inf, np.inf)

    @property
    def end_frame(self):
        """Return ``frame_id`` of the track."""
        return self.frame_id

    @staticmethod
    def next_id():
        """Increment the global counter and return the new value."""
        BaseTrack._count += 1
        return BaseTrack._count

    def activate(self, *args):
        """Start the track; must be provided by the sub-class."""
        raise NotImplementedError

    def predict(self):
        """Advance the track state; must be provided by the sub-class."""
        raise NotImplementedError

    def update(self, *args, **kwargs):
        """Update the track; must be provided by the sub-class."""
        raise NotImplementedError

    def mark_lost(self):
        """Set the track state to ``TrackState.Lost``."""
        self.state = TrackState.Lost

    def mark_removed(self):
        """Set the track state to ``TrackState.Removed``."""
        self.state = TrackState.Removed
STrack(BaseTrack): shared_kalman = KalmanFilter() def __init__(self, tlwh, score, temp_feat, buffer_size=30, mask=None, pose=None, ac=False, category=-1, use_kalman=True): # wait activate self._tlwh = np.asarray(tlwh, dtype=np.float) self.kalman_filter = None self.mean, self.covariance = None, None self.use_kalman = use_kalman if not use_kalman: ac=True self.is_activated = ac self.score = score self.category = category self.tracklet_len = 0 self.smooth_feat = None self.update_features(temp_feat) self.features = deque([], maxlen=buffer_size) self.alpha = 0.9 self.mask = mask self.pose = pose def update_features(self, feat): self.curr_feat = feat if self.smooth_feat is None: self.smooth_feat = feat elif self.smooth_feat.shape == feat.shape: self.smooth_feat = self.alpha *self.smooth_feat + (1-self.alpha) * feat else: pass def predict(self): mean_state = self.mean.copy() if self.state != TrackState.Tracked: mean_state[7] = 0 self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance) @staticmethod def multi_predict(stracks): if len(stracks) > 0: multi_mean = np.asarray([st.mean.copy() for st in stracks]) multi_covariance = np.asarray([st.covariance for st in stracks]) for i,st in enumerate(stracks): if st.state != TrackState.Tracked: multi_mean[i][7] = 0 multi_mean, multi_covariance = STrack.shared_kalman.multi_predict(multi_mean, multi_covariance) for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): stracks[i].mean = mean stracks[i].covariance = cov def activate(self, kalman_filter, frame_id): """Start a new tracklet""" self.kalman_filter = kalman_filter self.track_id = self.next_id() self.mean, self.covariance = self.kalman_filter.initiate(tlwh_to_xyah(self._tlwh)) self.tracklet_len = 0 self.state = TrackState.Tracked if frame_id == 1: self.is_activated = True #self.is_activated = True self.frame_id = frame_id self.start_frame = frame_id def re_activate(self, new_track, frame_id, new_id=False, update_feature=True): if 
self.use_kalman: self.mean, self.covariance = self.kalman_filter.update( self.mean, self.covariance, tlwh_to_xyah(new_track.tlwh) ) else: self.mean, self.covariance = None, None self._tlwh = np.asarray(new_track.tlwh, dtype=np.float) if update_feature: self.update_features(new_track.curr_feat) self.tracklet_len = 0 self.state = TrackState.Tracked self.is_activated = True self.frame_id = frame_id if new_id: self.track_id = self.next_id() if not new_track.mask is None: self.mask = new_track.mask def update(self, new_track, frame_id, update_feature=True): """ Update a matched track :type new_track: STrack :type frame_id: int :type update_feature: bool :return: """ self.frame_id = frame_id self.tracklet_len += 1 new_tlwh = new_track.tlwh if self.use_kalman: self.mean, self.covariance = self.kalman_filter.update( self.mean, self.covariance, tlwh_to_xyah(new_tlwh)) else: self.mean, self.covariance = None, None self._tlwh = np.asarray(new_tlwh, dtype=np.float) self.state = TrackState.Tracked self.is_activated = True self.score = new_track.score ''' For TAO dataset ''' self.category = new_track.category if update_feature: self.update_features(new_track.curr_feat) if not new_track.mask is None: self.mask = new_track.mask if not new_track.pose is None: self.pose = new_track.pose @property def tlwh(self): """Get current position in bounding box format `(top left x, top left y, width, height)`. """ if self.mean is None: return self._tlwh.copy() ret = self.mean[:4].copy() ret[2] *= ret[3] ret[:2] -= ret[2:] / 2 return ret @property def tlbr(self): """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., `(top left, bottom right)`. 
def joint_stracks(tlista, tlistb):
    """Concatenate two track lists, skipping ids of `tlistb` already seen.

    Every element of `tlista` is kept (in order); an element of `tlistb`
    is appended only if its ``track_id`` has not appeared before, either in
    `tlista` or earlier in `tlistb`.
    """
    merged = list(tlista)
    seen_ids = {track.track_id for track in merged}
    for track in tlistb:
        if track.track_id in seen_ids:
            continue
        seen_ids.add(track.track_id)
        merged.append(track)
    return merged
def merge_matches(m1, m2, shape):
    """Chain two index matchings through their shared middle axis.

    Args:
        m1: Sequence of ``(o, p)`` index pairs (may be empty).
        m2: Sequence of ``(p, q)`` index pairs (may be empty).
        shape: ``(O, P, Q)``, the sizes of the three index ranges.

    Returns:
        tuple: ``(match, unmatched_O, unmatched_Q)`` where ``match`` is a
        list of ``(o, q)`` pairs linked by a common ``p``, and the other two
        are tuples of the indices of ``range(O)`` / ``range(Q)`` that do not
        occur in ``match``.
    """
    O, P, Q = shape
    # reshape(-1, 2) keeps an empty match list two-dimensional, so the
    # column indexing below works instead of raising IndexError.
    m1 = np.asarray(m1, dtype=int).reshape(-1, 2)
    m2 = np.asarray(m2, dtype=int).reshape(-1, 2)

    M1 = scipy.sparse.coo_matrix((np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P))
    M2 = scipy.sparse.coo_matrix((np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q))

    # Matrix product: entry (o, q) is non-zero iff some p links o to q.
    mask = M1 @ M2
    rows, cols = mask.nonzero()
    match = list(zip(rows, cols))
    unmatched_O = tuple(set(range(O)) - {i for i, _ in match})
    unmatched_Q = tuple(set(range(Q)) - {j for _, j in match})

    return match, unmatched_O, unmatched_Q
def embedding_distance(tracks, detections, metric='cosine'):
    """Pairwise appearance distance between tracks and detections.

    :param tracks: list[STrack], read through ``smooth_feat``
    :param detections: list[BaseTrack], read through ``curr_feat``
    :param metric: accepted for interface compatibility; it is NOT passed
        on to ``cdist``, which is therefore called with its default metric
    :return: cost_matrix np.ndarray of shape (len(tracks), len(detections))
    """
    # Builtin ``float`` replaces the ``np.float`` alias, which newer NumPy
    # releases no longer provide.
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
    if cost_matrix.size == 0:
        return cost_matrix
    det_features = np.asarray([track.curr_feat for track in detections], dtype=float)
    track_features = np.asarray([track.smooth_feat for track in tracks], dtype=float)
    # Clamp at zero to guard against tiny negative values.
    cost_matrix = np.maximum(0.0, cdist(track_features, det_features))  # Normalized features
    return cost_matrix
def center_emb_distance(tracks, detections, metric='cosine'):
    """Cosine distance between track and detection embeddings.

    :param tracks: list[STrack], read through ``smooth_feat`` (torch tensor)
    :param detections: list[BaseTrack], read through ``curr_feat``
        (torch tensor)
    :param metric: unused; the distance is always ``1 - cosine similarity``
    :return: cost_matrix np.ndarray of shape (len(tracks), len(detections))
    """
    # Builtin ``float`` replaces the ``np.float`` alias, which newer NumPy
    # releases no longer provide.
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
    if cost_matrix.size == 0:
        return cost_matrix
    det_features = torch.stack([track.curr_feat.squeeze() for track in detections])
    track_features = torch.stack([track.smooth_feat.squeeze() for track in tracks])
    # L2-normalise each row so the matrix product below yields cosine
    # similarities.
    normed_det = F.normalize(det_features)
    normed_track = F.normalize(track_features)
    cost_matrix = torch.mm(normed_track, normed_det.T)
    cost_matrix = 1 - cost_matrix.detach().cpu().numpy()
    return cost_matrix
def get_track_feat(tracks, feat_flag='curr'):
    """Stack per-track features into one zero-padded tensor.

    The feature of each track (``curr_feat`` for ``feat_flag='curr'``,
    ``smooth_feat`` for ``'smooth'``) has its leading dimension squeezed;
    features with more than two dimensions are flattened to
    ``(fdim, -1)``. The result has shape ``(len(tracks), fdim, max_len)``
    and lives on the device of the first feature; shorter features are
    padded with zeros on the right.

    Raises:
        NotImplementedError: For any other ``feat_flag``.
    """
    attr_by_flag = {'curr': 'curr_feat', 'smooth': 'smooth_feat'}
    attr_name = attr_by_flag.get(feat_flag)
    if attr_name is None:
        raise NotImplementedError

    feats = [getattr(track, attr_name).squeeze(0) for track in tracks]

    first = feats[0]
    fdim = first.shape[0]
    if first.dim() > 2:
        # Collapse all trailing dimensions into one.
        feats = [feat.view(fdim, -1) for feat in feats]

    lengths = [feat.shape[1] for feat in feats]
    padded = torch.zeros(len(tracks), fdim, max(lengths)).to(feats[0].device)
    for slot, (feat, length) in enumerate(zip(feats, lengths)):
        padded[slot, :, :length] = feat
    return padded
class KalmanFilter(object):
    """
    Constant-velocity Kalman filter for bounding boxes in image space.

    The state has eight components

        x, y, a, h, vx, vy, va, vh

    i.e. the box centre (x, y), aspect ratio a, height h and one velocity
    per component. The box (x, y, a, h) is observed directly (linear
    observation model).

    """

    def __init__(self):
        box_dim, step = 4, 1.

        # Transition matrix: identity plus `step` times the velocity added
        # to each box component.
        transition = np.eye(2 * box_dim, 2 * box_dim)
        for axis in range(box_dim):
            transition[axis, box_dim + axis] = step
        self._motion_mat = transition

        # Observation matrix: selects the four box components.
        self._update_mat = np.eye(box_dim, 2 * box_dim)

        # Noise levels are expressed relative to the box height of the
        # current estimate; these weights scale them.
        self._std_weight_position = 1. / 20
        self._std_weight_velocity = 1. / 160

    def initiate(self, measurement):
        """Build the initial state for a new track.

        Parameters
        ----------
        measurement : ndarray
            Box (x, y, a, h): centre position, aspect ratio and height.

        Returns
        -------
        (ndarray, ndarray)
            The 8 dimensional mean (velocities set to zero) and the 8x8
            diagonal covariance of the new track.

        """
        height = measurement[3]
        pos_std = 2 * self._std_weight_position * height
        vel_std = 10 * self._std_weight_velocity * height
        std = [
            pos_std, pos_std, 1e-2, pos_std,
            vel_std, vel_std, 1e-5, vel_std]

        mean = np.r_[measurement, np.zeros_like(measurement)]
        return mean, np.diag(np.square(std))

    def predict(self, mean, covariance):
        """Propagate one state a single time step forward.

        Parameters
        ----------
        mean : ndarray
            The 8 dimensional state mean at the previous time step.
        covariance : ndarray
            The 8x8 state covariance at the previous time step.

        Returns
        -------
        (ndarray, ndarray)
            Mean and covariance of the predicted state.

        """
        height = mean[3]
        pos_std = self._std_weight_position * height
        vel_std = self._std_weight_velocity * height
        std_pos = [pos_std, pos_std, 1e-2, pos_std]
        std_vel = [vel_std, vel_std, 1e-5, vel_std]
        process_noise = np.diag(np.square(np.r_[std_pos, std_vel]))

        predicted_mean = np.dot(mean, self._motion_mat.T)
        predicted_cov = np.linalg.multi_dot((
            self._motion_mat, covariance, self._motion_mat.T)) + process_noise
        return predicted_mean, predicted_cov

    def project(self, mean, covariance):
        """Map a state distribution into measurement space.

        Parameters
        ----------
        mean : ndarray
            The 8 dimensional state mean.
        covariance : ndarray
            The 8x8 state covariance.

        Returns
        -------
        (ndarray, ndarray)
            The projected 4 dimensional mean and the projected 4x4
            covariance with the measurement noise added.

        """
        pos_std = self._std_weight_position * mean[3]
        std = [pos_std, pos_std, 1e-1, pos_std]
        measurement_noise = np.diag(np.square(std))

        projected_mean = np.dot(self._update_mat, mean)
        projected_cov = np.linalg.multi_dot((
            self._update_mat, covariance, self._update_mat.T))
        return projected_mean, projected_cov + measurement_noise

    def multi_predict(self, mean, covariance):
        """Propagate N states one time step forward at once.

        Parameters
        ----------
        mean : ndarray
            The Nx8 matrix of state means at the previous time step.
        covariance : ndarray
            The Nx8x8 stack of state covariances at the previous time step.

        Returns
        -------
        (ndarray, ndarray)
            Means and covariances of the predicted states.

        """
        heights = mean[:, 3]
        pos_std = self._std_weight_position * heights
        vel_std = self._std_weight_velocity * heights
        std_pos = [
            pos_std, pos_std, 1e-2 * np.ones_like(heights), pos_std]
        std_vel = [
            vel_std, vel_std, 1e-5 * np.ones_like(heights), vel_std]

        # One row of eight variances per state.
        variances = np.square(np.r_[std_pos, std_vel]).T
        process_noise = np.asarray(
            [np.diag(variances[idx]) for idx in range(len(mean))])

        predicted_mean = np.dot(mean, self._motion_mat.T)
        # np.dot puts the batch axis second; move it back to the front.
        left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2))
        predicted_cov = np.dot(left, self._motion_mat.T) + process_noise
        return predicted_mean, predicted_cov

    def update(self, mean, covariance, measurement):
        """Correct a predicted state with a measurement.

        Parameters
        ----------
        mean : ndarray
            The predicted 8 dimensional state mean.
        covariance : ndarray
            The 8x8 state covariance.
        measurement : ndarray
            The measured box (x, y, a, h): centre position, aspect ratio
            and height.

        Returns
        -------
        (ndarray, ndarray)
            Mean and covariance of the corrected state.

        """
        projected_mean, projected_cov = self.project(mean, covariance)

        # Solve for the Kalman gain through a Cholesky factorisation of the
        # projected covariance rather than an explicit inverse.
        factor, is_lower = scipy.linalg.cho_factor(
            projected_cov, lower=True, check_finite=False)
        gain = scipy.linalg.cho_solve(
            (factor, is_lower), np.dot(covariance, self._update_mat.T).T,
            check_finite=False).T

        residual = measurement - projected_mean
        corrected_mean = mean + np.dot(residual, gain.T)
        corrected_cov = covariance - np.linalg.multi_dot((
            gain, projected_cov, gain.T))
        return corrected_mean, corrected_cov

    def gating_distance(self, mean, covariance, measurements,
                        only_position=False, metric='maha'):
        """Distance between a state distribution and N measurements.

        For the Mahalanobis metric a gating threshold can be taken from
        `chi2inv95` (4 degrees of freedom, or 2 when `only_position` is
        True).

        Parameters
        ----------
        mean : ndarray
            The 8 dimensional state mean.
        covariance : ndarray
            The 8x8 state covariance.
        measurements : ndarray
            An Nx4 matrix of boxes (x, y, a, h).
        only_position : Optional[bool]
            If True, only the centre position (x, y) is compared.
        metric : str
            'maha' for the squared Mahalanobis distance, 'gaussian' for
            the squared Euclidean distance.

        Returns
        -------
        ndarray
            Array of length N holding the distance of each measurement.

        Raises
        ------
        ValueError
            If `metric` is neither 'maha' nor 'gaussian'.

        """
        mean, covariance = self.project(mean, covariance)
        if only_position:
            mean, covariance = mean[:2], covariance[:2, :2]
            measurements = measurements[:, :2]

        delta = measurements - mean
        if metric == 'gaussian':
            return np.sum(delta * delta, axis=1)
        if metric != 'maha':
            raise ValueError('invalid distance metric')

        # Whiten the differences with the Cholesky factor; the squared norm
        # of each column is then the squared Mahalanobis distance.
        chol = np.linalg.cholesky(covariance)
        whitened = scipy.linalg.solve_triangular(
            chol, delta.T, lower=True, check_finite=False,
            overwrite_b=True)
        return np.sum(whitened * whitened, axis=0)
propagate_box(temp_feats, obs, img, model) elif format == 'mask': return propagate_box(temp_feats, obs, img, model) elif format == 'pose': return propagate_pose(temp_feats, obs, img, model) else: raise ValueError('Observation format not supported.') ================================================ FILE: unitrack/core/propagation/propagate_box.py ================================================ ################################################################### # File Name: propagate_box.py # Author: Zhongdao Wang # mail: wcd17@mails.tsinghua.edu.cn # Created Time: Mon Jan 18 16:01:46 2021 ################################################################### from __future__ import print_function from __future__ import division from __future__ import absolute_import def propagate_box(temp_feats, box, img, model): pass ================================================ FILE: unitrack/core/propagation/propagate_mask.py ================================================ ################################################################### # File Name: propagate_box.py # Author: Zhongdao Wang # mail: wcd17@mails.tsinghua.edu.cn # Created Time: Mon Jan 18 16:01:46 2021 ################################################################### from __future__ import print_function from __future__ import division from __future__ import absolute_import def propagate_mask(temp_feats, mask, img, model): pass ================================================ FILE: unitrack/core/propagation/propagate_pose.py ================================================ ################################################################### # File Name: propagate_box.py # Author: Zhongdao Wang # mail: wcd17@mails.tsinghua.edu.cn # Created Time: Mon Jan 18 16:01:46 2021 ################################################################### from __future__ import print_function from __future__ import division from __future__ import absolute_import def propagate_pose(temp_feats, pose, img, model): pass 
================================================ FILE: unitrack/mask.py ================================================ ################################################################### # File Name: mask.py # Author: Zhongdao Wang # mail: wcd17@mails.tsinghua.edu.cn # Created Time: Fri Jan 29 15:16:53 2021 ################################################################### import numpy as np import torch import torch.nn.functional as F from unitrack.utils.box import * from unitrack.utils.mask import * from .basetrack import * from .multitracker import AssociationTracker class MaskAssociationTracker(AssociationTracker): def __init__(self, opt): super(MaskAssociationTracker, self).__init__(opt) def extract_emb(self, img, obs): img = img.to(self.opt.device).float() with torch.no_grad(): feat = self.app_model(img) _, d, h, w = feat.shape obs = torch.from_numpy(obs).to(self.opt.device).float() obs = F.interpolate(obs.unsqueeze(1), size=(h,w), mode='nearest') template_scale = np.prod(self.opt.feat_size) embs = [] for ob in obs: obfeat = ob*feat scale = ob.sum() if scale > 0: if scale > self.opt.max_mask_area: scale_factor = np.sqrt(self.opt.max_mask_area/scale.item()) else: scale_factor = 1 norm_obfeat = F.interpolate(obfeat, scale_factor=scale_factor, mode='bilinear') norm_mask = F.interpolate(ob.unsqueeze(1), scale_factor=scale_factor, mode='nearest') emb = norm_obfeat[:,:, norm_mask.squeeze(0).squeeze(0).ge(0.5)] # print("embedding", emb.shape) embs.append(emb.cpu()) else: embs.append(torch.randn(d, template_scale)) return obs, embs def prepare_obs(self, img, img0, obs, embs=None): ''' Step 1: Network forward, get detections & embeddings''' if obs.shape[0] > 0: masks, embs = self.extract_emb(img, obs) boxes = mask2box(masks) keep_idx = remove_duplicated_box(boxes, iou_th=0.7) boxes, masks, obs = boxes[keep_idx], masks[keep_idx], obs[keep_idx] embs = [embs[k] for k in keep_idx] detections = [STrack(tlbr_to_tlwh(tlbrs), 1, f, self.buffer_size, mask, ac=True) \ for 
(tlbrs,mask,f) in zip(boxes, obs, embs)] else: detections = [] return detections ================================================ FILE: unitrack/mask_with_train_embs.py ================================================ ################################################################### # File Name: mask.py # Author: Zhongdao Wang # mail: wcd17@mails.tsinghua.edu.cn # Created Time: Fri Jan 29 15:16:53 2021 ################################################################### import time import numpy as np import torch import torch.nn.functional as F from unitrack.utils.box import * from unitrack.utils.mask import * from .basetrack import * from unitrack.model import AppearanceModel class AssociationTrackerWithTrainedEmbed(object): def __init__(self, opt): self.opt = opt self.tracked_stracks = [] # type: list[STrack] self.lost_stracks = [] # type: list[STrack] self.removed_stracks = [] # type: list[STrack] self.frame_id = 0 self.det_thresh = opt.conf_thres self.buffer_size = opt.track_buffer self.max_time_lost = self.buffer_size self.kalman_filter = KalmanFilter() # self.app_model = AppearanceModel(opt).to(opt.device) # self.app_model.eval() if not self.opt.asso_with_motion: self.opt.motion_lambda = 1 self.opt.motion_gated = False def extract_emb(self, img, obs): raise NotImplementedError def prepare_obs(self, img, img0, obs, embs=None): raise NotImplementedError def update(self, img, img0, obs, embs=None): torch.cuda.empty_cache() self.frame_id += 1 activated_stracks = [] refind_stracks = [] lost_stracks = [] removed_stracks = [] t1 = time.time() detections = self.prepare_obs(img, img0, obs, embs=None) ''' Add newly detected tracklets to tracked_stracks''' unconfirmed = [] tracked_stracks = [] # type: list[STrack] for track in self.tracked_stracks: if not track.is_activated: unconfirmed.append(track) else: tracked_stracks.append(track) ''' Step 2: First association, with embedding''' tracks = joint_stracks(tracked_stracks, self.lost_stracks) dists, recons_ftrk = 
matching.center_emb_distance(tracks, detections) if self.opt.use_kalman: # Predict the current location with KF STrack.multi_predict(tracks) dists = matching.fuse_motion(self.kalman_filter, dists, tracks, detections, lambda_=self.opt.motion_lambda, gate=self.opt.motion_gated) if obs.shape[1] == 6: dists = matching.category_gate(dists, tracks, detections) matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) for itracked, idet in matches: track = tracks[itracked] det = detections[idet] if track.state == TrackState.Tracked: track.update(detections[idet], self.frame_id) activated_stracks.append(track) else: track.re_activate(det, self.frame_id, new_id=False) refind_stracks.append(track) if self.opt.use_kalman: '''(optional) Step 3: Second association, with IOU''' tracks = [tracks[i] for i in u_track if tracks[i].state == TrackState.Tracked] detections = [detections[i] for i in u_detection] dists = matching.iou_distance(tracks, detections) matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.5) for itracked, idet in matches: track = tracks[itracked] det = detections[idet] if track.state == TrackState.Tracked: track.update(det, self.frame_id) activated_stracks.append(track) else: track.re_activate(det, self.frame_id, new_id=False) refind_stracks.append(track) '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' detections = [detections[i] for i in u_detection] dists = matching.iou_distance(unconfirmed, detections) matches, u_unconfirmed, u_detection = matching.linear_assignment( dists, thresh=self.opt.confirm_iou_thres) for itracked, idet in matches: unconfirmed[itracked].update(detections[idet], self.frame_id) activated_stracks.append(unconfirmed[itracked]) for it in u_unconfirmed: track = unconfirmed[it] track.mark_removed() removed_stracks.append(track) for it in u_track: track = tracks[it] if not track.state == TrackState.Lost: track.mark_lost() lost_stracks.append(track) """ Step 4: Init new 
stracks""" for inew in u_detection: track = detections[inew] if track.score < self.det_thresh: continue track.activate(self.kalman_filter, self.frame_id) activated_stracks.append(track) """ Step 5: Update state""" for track in self.lost_stracks: if self.frame_id - track.end_frame > self.max_time_lost: track.mark_removed() removed_stracks.append(track) self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_stracks) self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) self.lost_stracks.extend(lost_stracks) self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) self.removed_stracks.extend(removed_stracks) self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks( self.tracked_stracks, self.lost_stracks, ioudist=self.opt.dup_iou_thres) # get scores of lost tracks output_stracks = [track for track in self.tracked_stracks if track.is_activated] return output_stracks def reset_all(self, ): self.tracked_stracks = [] # type: list[STrack] self.lost_stracks = [] # type: list[STrack] self.removed_stracks = [] # type: list[STrack] self.frame_id = 0 class MaskAssociationTracker(AssociationTrackerWithTrainedEmbed): def __init__(self, opt): super(MaskAssociationTracker, self).__init__(opt) def extract_emb(self, img, obs, embs): img = img.to(self.opt.device).float() obs = obs.to(self.opt.device).float() embs = embs.to(self.opt.device).float().unsqueeze(-1) # print(img.shape) # print("obs", obs.shape) # print("embs", embs.shape) # exit() # obs = F.interpolate(obs.unsqueeze(1), size=(h,w), mode='nearest') # template_scale = np.prod(self.opt.feat_size) embs_list = [] for emb in embs: # obfeat = ob embs_list.append(emb.cpu()) # scale = ob.sum() # if scale > 0: # if scale > self.opt.max_mask_area: # scale_factor = np.sqrt(self.opt.max_mask_area/scale.item()) # 
else: # scale_factor = 1 # norm_obfeat = F.interpolate(obfeat, scale_factor=scale_factor, mode='bilinear') # norm_mask = F.interpolate(ob.unsqueeze(1), scale_factor=scale_factor, mode='nearest') # emb = norm_obfeat[:,:, norm_mask.squeeze(0).squeeze(0).ge(0.5)] # embs.append(emb.cpu()) # else: # embs.append(torch.randn(d, template_scale)) return obs, embs_list def prepare_obs(self, img, img0, obs, embs=None): ''' Step 1: Network forward, get detections & embeddings''' if obs.shape[0] > 0: if embs is not None: masks, embs = self.extract_emb(img, obs, embs) boxes = mask2box(masks) keep_idx = remove_duplicated_box(boxes, iou_th=0.7) boxes, masks, obs = boxes[keep_idx], masks[keep_idx], obs[keep_idx] embs = [embs[k] for k in keep_idx] detections = [STrack(tlbr_to_tlwh(tlbrs), 1, f, self.buffer_size, mask, ac=True) \ for (tlbrs,mask,f) in zip(boxes, obs, embs)] else: detections = [] return detections ================================================ FILE: unitrack/model/__init__.py ================================================ ################################################################### # File Name: __init__.py # Author: Zhongdao Wang # mail: wcd17@mails.tsinghua.edu.cn # Created Time: Thu Dec 24 14:24:44 2020 ################################################################### from __future__ import print_function from __future__ import division from __future__ import absolute_import from .model import * from .resnet import * ================================================ FILE: unitrack/model/functional.py ================================================ ################################################################### # File Name: functional.py # Author: Zhongdao Wang # mail: wcd17@mails.tsinghua.edu.cn # Created Time: Mon Jun 21 21:04:09 2021 ################################################################### from __future__ import print_function from __future__ import division from __future__ import absolute_import import torch import torch.nn as nn 
import torch.nn.functional as F def hard_prop(pred): pred_max = pred.max(axis=0)[0] pred[pred < pred_max] = 0 pred[pred >= pred_max] = 1 pred /= pred.sum(0)[None] return pred def context_index_bank(n_context, long_mem, N): ''' Construct bank of source frames indices, for each target frame ''' ll = [] # "long term" context (i.e. first frame) for t in long_mem: assert 0 <= t < N, 'context frame out of bounds' idx = torch.zeros(N, 1).long() if t > 0: idx += t + (n_context+1) idx[:n_context+t+1] = 0 ll.append(idx) # "short" context ss = [(torch.arange(n_context)[None].repeat(N, 1) + \ torch.arange(N)[:, None])[:, :]] return ll + ss def mem_efficient_batched_affinity( query, keys, mask, temperature, topk, long_mem, device): ''' Mini-batched computation of affinity, for memory efficiency ''' bsize, pbsize = 10, 100 #keys.shape[2] // 2 Ws, Is = [], [] for b in range(0, keys.shape[2], bsize): _k, _q = keys[:, :, b:b+bsize].to(device), query[:, :, b:b+bsize].to(device) w_s, i_s = [], [] for pb in range(0, _k.shape[-1], pbsize): A = torch.einsum('ijklm,ijkn->iklmn', _k, _q[..., pb:pb+pbsize]) A[0, :, len(long_mem):] += mask[..., pb:pb+pbsize].to(device) _, N, T, h1w1, hw = A.shape A = A.view(N, T*h1w1, hw) A /= temperature weights, ids = torch.topk(A, topk, dim=-2) weights = F.softmax(weights, dim=-2) w_s.append(weights.cpu()) i_s.append(ids.cpu()) weights = torch.cat(w_s, dim=-1) ids = torch.cat(i_s, dim=-1) Ws += [w for w in weights] Is += [ii for ii in ids] return Ws, Is def batched_affinity(query, keys, mask, temperature, topk, long_mem, device): ''' Mini-batched computation of affinity, for memory efficiency (less aggressively mini-batched) ''' bsize = 2 Ws, Is = [], [] for b in range(0, keys.shape[2], bsize): _k, _q = keys[:, :, b:b+bsize].to(device), query[:, :, b:b+bsize].to(device) w_s, i_s = [], [] A = torch.einsum('ijklmn,ijkop->iklmnop', _k, _q) / temperature # Mask A[0, :, len(long_mem):] += mask.to(device) _, N, T, h1w1, hw = A.shape A = A.view(N, T*h1w1, hw) A 
/= temperature weights, ids = torch.topk(A, topk, dim=-2) weights = F.softmax(weights, dim=-2) Ws += [w for w in weights] Is += [ii for ii in ids] return Ws, Is def process_pose(pred, lbl_set, topk=3): # generate the coordinates: pred = pred[..., 1:] flatlbls = pred.flatten(0,1) topk = min(flatlbls.shape[0], topk) vals, ids = torch.topk(flatlbls, k=topk, dim=0) vals /= vals.sum(0)[None] xx, yy = ids % pred.shape[1], ids // pred.shape[1] current_coord = torch.stack([(xx * vals).sum(0), (yy * vals).sum(0)], dim=0) current_coord[:, flatlbls.sum(0) == 0] = -1 pred_val_sharp = np.zeros((*pred.shape[:2], 3)) for t in range(len(lbl_set) - 1): x = int(current_coord[0, t]) y = int(current_coord[1, t]) if x >=0 and y >= 0: pred_val_sharp[y, x, :] = lbl_set[t + 1] return current_coord.cpu(), pred_val_sharp class MaskedAttention(nn.Module): ''' A module that implements masked attention based on spatial locality TODO implement in a more efficient way (torch sparse or correlation filter) ''' def __init__(self, radius, flat=True): super(MaskedAttention, self).__init__() self.radius = radius self.flat = flat self.masks = {} self.index = {} def mask(self, H, W): if not ('%s-%s' %(H,W) in self.masks): self.make(H, W) return self.masks['%s-%s' %(H,W)] def index(self, H, W): if not ('%s-%s' %(H,W) in self.index): self.make_index(H, W) return self.index['%s-%s' %(H,W)] def make(self, H, W): if self.flat: H = int(H**0.5) W = int(W**0.5) gx, gy = torch.meshgrid(torch.arange(0, H), torch.arange(0, W)) D = ( (gx[None, None, :, :] - gx[:, :, None, None])**2 + (gy[None, None, :, :] - gy[:, :, None, None])**2 ).float() ** 0.5 D = (D < self.radius)[None].float() if self.flat: D = self.flatten(D) self.masks['%s-%s' %(H,W)] = D return D def flatten(self, D): return torch.flatten(torch.flatten(D, 1, 2), -2, -1) def make_index(self, H, W, pad=False): mask = self.mask(H, W).view(1, -1).byte() idx = torch.arange(0, mask.numel())[mask[0]][None] self.index['%s-%s' %(H,W)] = idx return idx def 
forward(self, x): H, W = x.shape[-2:] sid = '%s-%s' % (H,W) if sid not in self.masks: self.masks[sid] = self.make(H, W).to(x.device) mask = self.masks[sid] return x * mask[0] ================================================ FILE: unitrack/model/hrnet.py ================================================ # ------------------------------------------------------------------------------ # Copyright (c) Microsoft # Licensed under the MIT License. # Written by Bin Xiao (Bin.Xiao@microsoft.com) # Modified by Ke Sun (sunk@mail.ustc.edu.cn) # Modified by Zhongdao Wang(wcd17@mails.tsinghua.edu.cn) # ------------------------------------------------------------------------------ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import pdb import logging import functools import numpy as np import torch import torch.nn as nn import torch._utils import torch.nn.functional as F BN_MOMENTUM = 0.1 logger = logging.getLogger(__name__) def conv3x3(in_planes, out_planes, stride=1): """3x3 convolution with padding""" return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) class BasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None): super(BasicBlock, self).__init__() self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, inplanes, planes, stride=1, downsample=None): super(Bottleneck, self).__init__() self.conv1 = 
nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False) self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=BN_MOMENTUM) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out class HighResolutionModule(nn.Module): def __init__(self, num_branches, blocks, num_blocks, num_inchannels, num_channels, fuse_method, multi_scale_output=True): super(HighResolutionModule, self).__init__() self._check_branches( num_branches, blocks, num_blocks, num_inchannels, num_channels) self.num_inchannels = num_inchannels self.fuse_method = fuse_method self.num_branches = num_branches self.multi_scale_output = multi_scale_output self.branches = self._make_branches( num_branches, blocks, num_blocks, num_channels) self.fuse_layers = self._make_fuse_layers() self.relu = nn.ReLU(False) def _check_branches(self, num_branches, blocks, num_blocks, num_inchannels, num_channels): if num_branches != len(num_blocks): error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format( num_branches, len(num_blocks)) logger.error(error_msg) raise ValueError(error_msg) if num_branches != len(num_channels): error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format( num_branches, len(num_channels)) logger.error(error_msg) raise ValueError(error_msg) if num_branches != len(num_inchannels): error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format( num_branches, len(num_inchannels)) 
logger.error(error_msg) raise ValueError(error_msg) def _make_one_branch(self, branch_index, block, num_blocks, num_channels, stride=1): downsample = None if stride != 1 or \ self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion: downsample = nn.Sequential( nn.Conv2d(self.num_inchannels[branch_index], num_channels[branch_index] * block.expansion, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(num_channels[branch_index] * block.expansion, momentum=BN_MOMENTUM), ) layers = [] layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index], stride, downsample)) self.num_inchannels[branch_index] = \ num_channels[branch_index] * block.expansion for i in range(1, num_blocks[branch_index]): layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index])) return nn.Sequential(*layers) def _make_branches(self, num_branches, block, num_blocks, num_channels): branches = [] for i in range(num_branches): branches.append( self._make_one_branch(i, block, num_blocks, num_channels)) return nn.ModuleList(branches) def _make_fuse_layers(self): if self.num_branches == 1: return None num_branches = self.num_branches num_inchannels = self.num_inchannels fuse_layers = [] for i in range(num_branches if self.multi_scale_output else 1): fuse_layer = [] for j in range(num_branches): if j > i: fuse_layer.append(nn.Sequential( nn.Conv2d(num_inchannels[j], num_inchannels[i], 1, 1, 0, bias=False), nn.BatchNorm2d(num_inchannels[i], momentum=BN_MOMENTUM), nn.Upsample(scale_factor=2**(j-i), mode='nearest'))) elif j == i: fuse_layer.append(None) else: conv3x3s = [] for k in range(i-j): if k == i - j - 1: num_outchannels_conv3x3 = num_inchannels[i] conv3x3s.append(nn.Sequential( nn.Conv2d(num_inchannels[j], num_outchannels_conv3x3, 3, 2, 1, bias=False), nn.BatchNorm2d(num_outchannels_conv3x3, momentum=BN_MOMENTUM))) else: num_outchannels_conv3x3 = num_inchannels[j] conv3x3s.append(nn.Sequential( nn.Conv2d(num_inchannels[j], 
num_outchannels_conv3x3, 3, 2, 1, bias=False), nn.BatchNorm2d(num_outchannels_conv3x3, momentum=BN_MOMENTUM), nn.ReLU(False))) fuse_layer.append(nn.Sequential(*conv3x3s)) fuse_layers.append(nn.ModuleList(fuse_layer)) return nn.ModuleList(fuse_layers) def get_num_inchannels(self): return self.num_inchannels def forward(self, x): if self.num_branches == 1: return [self.branches[0](x[0])] for i in range(self.num_branches): x[i] = self.branches[i](x[i]) x_fuse = [] for i in range(len(self.fuse_layers)): y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) for j in range(1, self.num_branches): if i == j: y = y + x[j] else: fused = self.fuse_layers[i][j](x[j]) fh, fw = fused.shape[-2:] yh, yw = y.shape[-2:] if fh > yh: fused = fused[:,:,(fh-yh)//2:-(fh-yh)//2,:] if fw > yw: fused = fused[:,:,:,(fw-yw)//2:-(fw-yw)//2] y = y + fused x_fuse.append(self.relu(y)) return x_fuse blocks_dict = { 'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck } class HighResolutionNet(nn.Module): def __init__(self, cfg, **kwargs): super(HighResolutionNet, self).__init__() self.cfg = cfg self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) self.relu = nn.ReLU(inplace=True) self.stage1_cfg = cfg['MODEL']['EXTRA']['STAGE1'] num_channels = self.stage1_cfg['NUM_CHANNELS'][0] block = blocks_dict[self.stage1_cfg['BLOCK']] num_blocks = self.stage1_cfg['NUM_BLOCKS'][0] self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) stage1_out_channel = block.expansion*num_channels self.stage2_cfg = cfg['MODEL']['EXTRA']['STAGE2'] num_channels = self.stage2_cfg['NUM_CHANNELS'] block = blocks_dict[self.stage2_cfg['BLOCK']] num_channels = [ num_channels[i] * block.expansion for i in range(len(num_channels))] self.transition1 = self._make_transition_layer( [stage1_out_channel], num_channels) 
self.stage2, pre_stage_channels = self._make_stage( self.stage2_cfg, num_channels) self.stage3_cfg = cfg['MODEL']['EXTRA']['STAGE3'] num_channels = self.stage3_cfg['NUM_CHANNELS'] block = blocks_dict[self.stage3_cfg['BLOCK']] num_channels = [ num_channels[i] * block.expansion for i in range(len(num_channels))] self.transition2 = self._make_transition_layer( pre_stage_channels, num_channels) self.stage3, pre_stage_channels = self._make_stage( self.stage3_cfg, num_channels) self.stage4_cfg = cfg['MODEL']['EXTRA']['STAGE4'] num_channels = self.stage4_cfg['NUM_CHANNELS'] block = blocks_dict[self.stage4_cfg['BLOCK']] num_channels = [ num_channels[i] * block.expansion for i in range(len(num_channels))] self.transition3 = self._make_transition_layer( pre_stage_channels, num_channels) self.stage4, pre_stage_channels = self._make_stage( self.stage4_cfg, num_channels, multi_scale_output=True) # Classification Head self.incre_modules, self.downsamp_modules, \ self.final_layer = self._make_head(pre_stage_channels) self.classifier = nn.Linear(2048, 1000) def _make_head(self, pre_stage_channels): head_block = Bottleneck head_channels = [32, 64, 128, 256] # Increasing the #channels on each resolution # from C, 2C, 4C, 8C to 128, 256, 512, 1024 incre_modules = [] for i, channels in enumerate(pre_stage_channels): incre_module = self._make_layer(head_block, channels, head_channels[i], 1, stride=1) incre_modules.append(incre_module) incre_modules = nn.ModuleList(incre_modules) # downsampling modules downsamp_modules = [] for i in range(len(pre_stage_channels)-1): in_channels = head_channels[i] * head_block.expansion out_channels = head_channels[i+1] * head_block.expansion downsamp_module = nn.Sequential( nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=2, padding=1), nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM), nn.ReLU(inplace=True) ) downsamp_modules.append(downsamp_module) downsamp_modules = nn.ModuleList(downsamp_modules) final_layer = 
nn.Sequential( nn.Conv2d( in_channels=head_channels[3] * head_block.expansion, out_channels=2048, kernel_size=1, stride=1, padding=0 ), nn.BatchNorm2d(2048, momentum=BN_MOMENTUM), nn.ReLU(inplace=True) ) return incre_modules, downsamp_modules, final_layer def _make_transition_layer( self, num_channels_pre_layer, num_channels_cur_layer): num_branches_cur = len(num_channels_cur_layer) num_branches_pre = len(num_channels_pre_layer) transition_layers = [] for i in range(num_branches_cur): if i < num_branches_pre: if num_channels_cur_layer[i] != num_channels_pre_layer[i]: transition_layers.append(nn.Sequential( nn.Conv2d(num_channels_pre_layer[i], num_channels_cur_layer[i], 3, 1, 1, bias=False), nn.BatchNorm2d( num_channels_cur_layer[i], momentum=BN_MOMENTUM), nn.ReLU(inplace=True))) else: transition_layers.append(None) else: conv3x3s = [] for j in range(i+1-num_branches_pre): inchannels = num_channels_pre_layer[-1] outchannels = num_channels_cur_layer[i] \ if j == i-num_branches_pre else inchannels conv3x3s.append(nn.Sequential( nn.Conv2d( inchannels, outchannels, 3, 2, 1, bias=False), nn.BatchNorm2d(outchannels, momentum=BN_MOMENTUM), nn.ReLU(inplace=True))) transition_layers.append(nn.Sequential(*conv3x3s)) return nn.ModuleList(transition_layers) def _make_layer(self, block, inplanes, planes, blocks, stride=1): downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = nn.Sequential( nn.Conv2d(inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), ) layers = [] layers.append(block(inplanes, planes, stride, downsample)) inplanes = planes * block.expansion for i in range(1, blocks): layers.append(block(inplanes, planes)) return nn.Sequential(*layers) def _make_stage(self, layer_config, num_inchannels, multi_scale_output=True): num_modules = layer_config['NUM_MODULES'] num_branches = layer_config['NUM_BRANCHES'] num_blocks = layer_config['NUM_BLOCKS'] 
num_channels = layer_config['NUM_CHANNELS'] block = blocks_dict[layer_config['BLOCK']] fuse_method = layer_config['FUSE_METHOD'] modules = [] for i in range(num_modules): # multi_scale_output is only used last module if not multi_scale_output and i == num_modules - 1: reset_multi_scale_output = False else: reset_multi_scale_output = True modules.append( HighResolutionModule(num_branches, block, num_blocks, num_inchannels, num_channels, fuse_method, reset_multi_scale_output) ) num_inchannels = modules[-1].get_num_inchannels() return nn.Sequential(*modules), num_inchannels def forward(self, x): x = self.conv1(x) x = self.bn1(x) x = self.relu(x) x = self.conv2(x) x = self.bn2(x) x = self.relu(x) x = self.layer1(x) x_list = [] for i in range(self.stage2_cfg['NUM_BRANCHES']): if self.transition1[i] is not None: x_list.append(self.transition1[i](x)) else: x_list.append(x) y_list = self.stage2(x_list) x_list = [] for i in range(self.stage3_cfg['NUM_BRANCHES']): if self.transition2[i] is not None: x_list.append(self.transition2[i](y_list[-1])) else: x_list.append(y_list[i]) y_list = self.stage3(x_list) x_list = [] for i in range(self.stage4_cfg['NUM_BRANCHES']): if self.transition3[i] is not None: x_list.append(self.transition3[i](y_list[-1])) else: x_list.append(y_list[i]) y_list = self.stage4(x_list) # Classification Head y_list_out = {} y_list_out[0] = self.incre_modules[0](y_list[0]) for i in range(len(self.downsamp_modules)): y_list_out[i+1] = self.incre_modules[i+1](y_list[i+1]) + \ self.downsamp_modules[i](y_list_out[i]) #y = self.final_layer(y) ret = y_list_out[self.cfg['MODEL']['RETURN_STAGE']] ret_size = y_list_out[1].shape[-2:] ret = F.interpolate(ret, ret_size, mode='bilinear') return ret def init_weights(self, pretrained='',): print('=> init weights from normal distribution') for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_( m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, nn.BatchNorm2d): 
def get_cls_net(c, **kwargs):
    """Build an HRNet backbone from the named entry of ``config``.

    Args:
        c: key into the module-level ``config`` table (e.g. ``'hrnet_w18'``).
        **kwargs: must contain ``return_stage`` (stored in the model config as
            ``MODEL.RETURN_STAGE``) and ``pretrained`` (passed to
            ``init_weights``). All keyword arguments are also forwarded to
            ``HighResolutionNet``.

    Returns:
        The constructed model, after ``init_weights`` has run.

    Raises:
        KeyError: if ``c`` is not in ``config`` or a required keyword is
            missing.
    """
    # Function-scope import keeps this block self-contained.
    import copy

    # Work on a private copy: writing RETURN_STAGE into the shared template
    # would change the stage read by ``forward`` for every model that still
    # references the same dict (presumably all earlier models of this
    # architecture -- the constructor is expected to keep ``cfg``).
    cfg = copy.deepcopy(config[c])
    cfg['MODEL']['RETURN_STAGE'] = kwargs['return_stage']

    model = HighResolutionNet(cfg, **kwargs)
    model.init_weights(pretrained=kwargs['pretrained'])
    return model
class AppearanceModel(nn.Module):
    """Wrapper module around the encoder returned by ``make_encoder``.

    The encoder is built from ``args`` and moved to ``args.device``; the
    forward pass simply delegates to it.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        encoder = make_encoder(args)
        self.model = encoder.to(args.device)

    def forward(self, x):
        """Return the encoder output for ``x``."""
        return self.model(x)
def make_encoder(args):
    """Build the feature encoder selected by ``args.model_type``.

    Reads ``args.model_type``, ``args.resume`` (checkpoint path, used by the
    branches that load weights), ``args.return_stage`` (HRNet only),
    ``args.remove_layers`` and ``args.infer2D``.

    Returns:
        The network; it is wrapped in ``From3D`` when its string
        representation mentions ``Conv2d`` and ``args.infer2D`` is false.

    Raises:
        ValueError: if ``args.model_type`` matches none of the branches.
    """
    # Checkpoints for these types are passed to partial_load as-is, i.e.
    # their keys are expected to already match a ResNet-50 state dict.
    SSL_MODELS = ['byol', 'deepcluster-v2', 'infomin', 'insdis', 'moco-v1', 'moco-v2',
                  'pcl-v1', 'pcl-v2','pirl', 'sela-v2', 'swav', 'simclr-v1', 'simclr-v2',
                  'pixpro', 'detco', 'barlowtwins']
    model_type = args.model_type
    if model_type == 'crw':
        net = resnet.resnet18()
        # Without a checkpoint file the ResNet-18 keeps its initial weights.
        if osp.isfile(args.resume):
            ckpt = torch.load(args.resume)
            state = {}
            for k, v in ckpt['model'].items():
                # Rename '<conv>.1.weight' entries to '<conv>.weight'.
                if 'conv1.1.weight' in k or 'conv2.1.weight' in k:
                    state[k.replace('.1.weight', '.weight')] = v
                # Strip the 'encoder.model.' prefix; other keys are kept.
                # NOTE(review): a key matched by the rename above is also
                # stored again by this if/else -- confirm this is intended.
                if 'encoder.model' in k:
                    state[k.replace('encoder.model.', '')] = v
                else:
                    state[k] = v
            # Keys containing 'head' are not loaded.
            partial_load(state, net, skip_keys=['head',])
            del ckpt
    elif model_type == 'random18':
        net = resnet.resnet18(pretrained=False)
    elif model_type == 'random50':
        net = resnet.resnet50(pretrained=False)
    elif model_type == 'imagenet18':
        net = resnet.resnet18(pretrained=True)
    elif model_type == 'imagenet50':
        net = resnet.resnet50(pretrained=True)
    elif model_type == 'imagenet101':
        net = resnet.resnet101(pretrained=True)
    elif model_type == 'imagenet_resnext50':
        net = resnet.resnext50_32x4d(pretrained=True)
    elif model_type == 'imagenet_resnext101':
        net = resnet.resnext101_32x8d(pretrained=True)
    elif model_type == 'mocov2':
        net = resnet.resnet50(pretrained=False)
        net_ckpt = torch.load(args.resume)
        # Keep only the query-encoder weights and drop their prefix.
        net_state = {k.replace('module.encoder_q.', ''):v for k,v in net_ckpt['state_dict'].items() \
                     if 'module.encoder_q' in k}
        partial_load(net_state, net)
    elif model_type == 'uvc':
        net = load_uvc_model(args.resume)
    elif model_type == 'timecycle':
        net = load_tc_model(args.resume)
    elif model_type in SSL_MODELS:
        net = resnet.resnet50(pretrained=False)
        net_ckpt = torch.load(args.resume)
        partial_load(net_ckpt, net)
    elif 'hrnet' in model_type:
        # model_type doubles as the HRNet config key.
        net = hrnet.get_cls_net(model_type, return_stage=args.return_stage, pretrained=args.resume)
    elif model_type == 'random':
        net = random_feat_generator.RandomFeatGenerator(args)
    else:
        raise ValueError('Invalid model_type.')
    # Only networks that expose ``modify`` (the ResNet wrapper in this
    # package does) get their layers adjusted / removed.
    if hasattr(net, 'modify'):
        net.modify(remove_layers=args.remove_layers)
    # RandomFeatGenerator overrides __str__ to return '', so it is never
    # wrapped here.
    if 'Conv2d' in str(net) and not args.infer2D:
        net = From3D(net)
    return net
class RandomFeatGenerator(nn.Module):
    """Stand-in encoder that returns uniformly random feature maps.

    The output has ``args.dim`` channels and a spatial size equal to the
    input's divided by ``args.down_factor`` (rounded).
    """

    def __init__(self, args):
        super(RandomFeatGenerator, self).__init__()
        self.df = args.down_factor
        self.dim = args.dim
        # Never used in forward.
        # NOTE(review): presumably kept so the module owns a parameter -- confirm.
        self.dummy = nn.Linear(2, 3)

    def forward(self, x):
        """Return random features shaped after ``x``.

        Args:
            x: 4-D ``(N, C, H, W)`` or 5-D ``(N, C, T, H, W)`` input; only
                its shape and device are used.

        Returns:
            Tensor of shape ``(N, dim, h, w)`` or ``(N, dim, T, h, w)`` on
            the same device as ``x``.

        Raises:
            ValueError: if ``x`` is neither 4-D nor 5-D.
        """
        ndim = len(x.shape)
        if ndim == 4:
            n, _, height, width = x.shape
            time_dims = ()
        elif ndim == 5:
            n, _, frames, height, width = x.shape
            time_dims = (frames,)
        else:
            raise ValueError(
                'expected a 4-D or 5-D input, got {} dimensions'.format(ndim))

        h, w = round(height / self.df), round(width / self.df)

        # Follow the input's device instead of hard-coding CUDA, so the module
        # also works on CPU-only hosts. Non-tensor inputs keep the historical
        # behaviour of producing a CUDA tensor.
        device = x.device if torch.is_tensor(x) else 'cuda'
        # Sample on the CPU first (as before) and then move the result.
        return torch.rand(n, self.dim, *time_dims, h, w).to(device)

    def __str__(self):
        # Deliberately empty: make_encoder looks for 'Conv2d' in str(net)
        # before wrapping a network in From3D.
        return ''
def modify(self, remove_layers=None, padding=''):
    """Adapt the network in place for dense feature extraction.

    Args:
        remove_layers: iterable of attribute names to set to ``None``.
            ``'fc'`` and ``'avgpool'`` are always removed in addition. The
            argument itself is never modified.
        padding: ``''`` leaves padding untouched; ``'no'`` sets the padding of
            every padded ``Conv2d`` to ``(0, 0)``; any other value is assigned
            as ``padding_mode`` of every padded ``Conv2d``.
    """
    def present(names):
        # Keep only attributes that have not already been removed.
        return [name for name in names if getattr(self, name) is not None]

    # Set stride of layer3 and layer 4 to 1 (from 2)
    for name in present(['layer3', 'layer4']):
        for m in getattr(self, name).modules():
            if isinstance(m, torch.nn.Conv2d):
                m.stride = tuple(1 for _ in m.stride)

    # Set padding (zeros or reflect, doesn't change much;
    # zeros requires lower temperature)
    if padding != '' and padding != 'no':
        for m in self.modules():
            if isinstance(m, torch.nn.Conv2d) and sum(m.padding) > 0:
                m.padding_mode = padding
    elif padding == 'no':
        for m in self.modules():
            if isinstance(m, torch.nn.Conv2d) and sum(m.padding) > 0:
                m.padding = (0, 0)

    # Remove extraneous layers. Build a fresh list: an in-place ``+=`` would
    # grow the caller's list (and a shared default) on every call.
    to_remove = list(remove_layers or []) + ['fc', 'avgpool']
    for name in present(to_remove):
        setattr(self, name, None)
def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
    r"""ResNeXt-50 32x4d model from
    "Aggregated Residual Transformation for Deep Neural Networks"
    (https://arxiv.org/abs/1611.05431).

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    # The grouped-convolution settings are fixed for this variant and
    # override anything the caller passed.
    kwargs.update(groups=32, width_per_group=4)
    stage_depths = [3, 4, 6, 3]
    return _resnet('resnext50_32x4d', Bottleneck, stage_depths,
                   pretrained, progress, **kwargs)
class AssociationTracker(object):
    """Base multi-object tracker that associates detections with tracks.

    Subclasses must implement ``extract_emb`` and ``prepare_obs``; ``update``
    drives one frame of association and track bookkeeping.
    """

    def __init__(self, opt):
        """Set up track lists, the Kalman filter and the appearance model.

        Reads ``opt.conf_thres``, ``opt.track_buffer``, ``opt.device`` and
        ``opt.asso_with_motion``. When motion association is disabled,
        ``opt.motion_lambda`` and ``opt.motion_gated`` are overwritten on
        ``opt`` itself.
        """
        self.opt = opt
        self.tracked_stracks = []  # type: list[STrack]
        self.lost_stracks = []  # type: list[STrack]
        self.removed_stracks = []  # type: list[STrack]

        self.frame_id = 0
        self.det_thresh = opt.conf_thres
        self.buffer_size = opt.track_buffer
        # Number of frames a lost track is kept before being removed.
        self.max_time_lost = self.buffer_size

        self.kalman_filter = KalmanFilter()

        self.app_model = AppearanceModel(opt).to(opt.device)
        self.app_model.eval()

        if not self.opt.asso_with_motion:
            self.opt.motion_lambda = 1
            self.opt.motion_gated = False

    def extract_emb(self, img, obs):
        """Hook for subclasses; not implemented in the base class."""
        raise NotImplementedError

    def prepare_obs(self, img, img0, obs, embs=None):
        """Hook for subclasses; ``update`` uses the result as its detections."""
        raise NotImplementedError

    def update(self, img, img0, obs, embs=None):
        """Process one frame and return the currently activated tracks.

        Args:
            img, img0, obs: forwarded unchanged to ``prepare_obs``;
                ``obs.shape[1] == 6`` additionally enables category gating.
            embs: accepted but currently not forwarded (see note below).

        Returns:
            List of tracks in ``self.tracked_stracks`` whose ``is_activated``
            flag is set.
        """
        torch.cuda.empty_cache()
        self.frame_id += 1
        # Per-frame scratch lists, merged into the persistent lists in Step 5.
        activated_stracks = []
        refind_stracks = []
        lost_stracks = []
        removed_stracks = []

        # NOTE(review): t1 is never read again in this method.
        t1 = time.time()
        # NOTE(review): ``embs=None`` is passed regardless of the ``embs``
        # argument -- confirm whether ``embs=embs`` was intended.
        detections = self.prepare_obs(img, img0, obs, embs=None)

        ''' Add newly detected tracklets to tracked_stracks'''
        # Split existing tracks into not-yet-activated and activated ones.
        unconfirmed = []
        tracked_stracks = []  # type: list[STrack]
        for track in self.tracked_stracks:
            if not track.is_activated:
                unconfirmed.append(track)
            else:
                tracked_stracks.append(track)

        ''' Step 2: First association, with embedding'''
        tracks = joint_stracks(tracked_stracks, self.lost_stracks)
        # NOTE(review): recons_ftrk is not used afterwards.
        dists, recons_ftrk = matching.reconsdot_distance(tracks, detections)
        if self.opt.use_kalman:
            # Predict the current location with KF
            STrack.multi_predict(tracks)
            dists = matching.fuse_motion(self.kalman_filter, dists, tracks, detections,
                                         lambda_=self.opt.motion_lambda, gate=self.opt.motion_gated)
        if obs.shape[1] == 6:
            dists = matching.category_gate(dists, tracks, detections)
        matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7)

        for itracked, idet in matches:
            track = tracks[itracked]
            det = detections[idet]
            if track.state == TrackState.Tracked:
                track.update(detections[idet], self.frame_id)
                activated_stracks.append(track)
            else:
                # A track that was not in Tracked state is revived with its
                # existing id.
                track.re_activate(det, self.frame_id, new_id=False)
                refind_stracks.append(track)

        if self.opt.use_kalman:
            '''(optional) Step 3: Second association, with IOU'''
            # From here on ``tracks`` / ``detections`` are re-indexed to the
            # leftovers, so ``u_track`` / ``u_detection`` refer to these lists.
            tracks = [tracks[i] for i in u_track if tracks[i].state==TrackState.Tracked]
            detections = [detections[i] for i in u_detection]
            dists = matching.iou_distance(tracks, detections)
            matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.5)

            for itracked, idet in matches:
                track = tracks[itracked]
                det = detections[idet]
                if track.state == TrackState.Tracked:
                    track.update(det, self.frame_id)
                    activated_stracks.append(track)
                else:
                    track.re_activate(det, self.frame_id, new_id=False)
                    refind_stracks.append(track)

        '''Deal with unconfirmed tracks, usually tracks with only one beginning frame'''
        detections = [detections[i] for i in u_detection]
        dists = matching.iou_distance(unconfirmed, detections)
        matches, u_unconfirmed, u_detection = matching.linear_assignment(
            dists, thresh=self.opt.confirm_iou_thres)
        for itracked, idet in matches:
            unconfirmed[itracked].update(detections[idet], self.frame_id)
            activated_stracks.append(unconfirmed[itracked])
        # Unconfirmed tracks without a matching detection are dropped.
        for it in u_unconfirmed:
            track = unconfirmed[it]
            track.mark_removed()
            removed_stracks.append(track)

        # Tracks left unmatched by the last association over ``tracks``
        # become lost (unless they already are).
        for it in u_track:
            track = tracks[it]
            if not track.state == TrackState.Lost:
                track.mark_lost()
                lost_stracks.append(track)

        """ Step 4: Init new stracks"""
        for inew in u_detection:
            track = detections[inew]
            # Detections scoring below the threshold never start a track.
            if track.score < self.det_thresh:
                continue
            track.activate(self.kalman_filter, self.frame_id)
            activated_stracks.append(track)

        """ Step 5: Update state"""
        for track in self.lost_stracks:
            if self.frame_id - track.end_frame > self.max_time_lost:
                track.mark_removed()
                removed_stracks.append(track)

        self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked]
        self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_stracks)
        self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks)
        self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks)
        self.lost_stracks.extend(lost_stracks)
        # NOTE(review): the subtraction runs before this frame's removals are
        # appended to self.removed_stracks -- confirm the ordering is intended.
        self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks)
        self.removed_stracks.extend(removed_stracks)
        self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(
            self.tracked_stracks, self.lost_stracks, ioudist=self.opt.dup_iou_thres)

        # get scores of lost tracks
        output_stracks = [track for track in self.tracked_stracks if track.is_activated]

        return output_stracks

    def reset_all(self, ):
        """Clear all track lists and restart the frame counter."""
        self.tracked_stracks = []  # type: list[STrack]
        self.lost_stracks = []  # type: list[STrack]
        self.removed_stracks = []  # type: list[STrack]
        self.frame_id = 0
def xywh2xyxy(x):
    """Convert boxes from ``[cx, cy, w, h]`` to ``[x1, y1, x2, y2]``.

    Args:
        x: 2-D torch tensor or NumPy array with one box per row; the first
            two columns are the box centre.

    Returns:
        A new object of the same type as ``x``; the input is not modified.
    """
    # Pick the copy method by container type. Testing the dtype against
    # torch.float32 sent every other torch dtype to ``.copy()``, which
    # tensors do not have.
    y = x.clone() if torch.is_tensor(x) else x.copy()
    y[:, 0] = (x[:, 0] - x[:, 2] / 2)
    y[:, 1] = (x[:, 1] - x[:, 3] / 2)
    y[:, 2] = (x[:, 0] + x[:, 2] / 2)
    y[:, 3] = (x[:, 1] + x[:, 3] / 2)
    return y


def tlwh2xyxy(x):
    """Convert boxes from ``[x1, y1, w, h]`` (top-left) to ``[x1, y1, x2, y2]``.

    Args:
        x: 2-D torch tensor or NumPy array with one box per row.

    Returns:
        A new object of the same type as ``x``; the input is not modified.
    """
    y = x.clone() if torch.is_tensor(x) else x.copy()
    # The top-left corner is already in place; only the far corner changes.
    y[:, 2] = (x[:, 0] + x[:, 2])
    y[:, 3] = (x[:, 1] + x[:, 3])
    return y
def int_box(box):
    """Round box coordinates to the nearest integer.

    Args:
        box: array-like of numeric coordinates.

    Returns:
        Integer NumPy array with the same shape as ``box``; halves are
        rounded to the nearest even value, as ``np.round`` does.
    """
    # ``np.float`` / ``np.int`` were aliases of the builtins and no longer
    # exist in current NumPy; the builtins select the same dtypes.
    box = np.asarray(box, dtype=float)
    box = np.round(box)
    return np.asarray(box, dtype=int)
def _ensure_parent_dir(filename):
    """Create the directory that will contain ``filename``, if it names one."""
    parent = os.path.dirname(filename)
    # A bare file name has no directory part; os.makedirs('') would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)


def write_mots_results(filename, results, data_type='mot'):
    """Write segmentation tracking results, one text line per mask.

    Args:
        filename: output path; nothing is written when it is empty.
        results: iterable of ``(frame_id, tlwhs, rles, track_ids)``; each RLE
            is a mapping with ``'counts'`` and ``'size'`` (``imh, imw``).
        data_type: only ``'mot'`` is supported.

    Raises:
        ValueError: for any other ``data_type``.
    """
    if not filename:
        return
    _ensure_parent_dir(filename)

    # ``in ('mot')`` was a substring test against the string 'mot', so values
    # such as 'm' or '' were accepted; compare for equality instead.
    if data_type == 'mot':
        save_format = '{frame} {id} {cid} {imh} {imw} {rle}\n'
    else:
        raise ValueError(data_type)

    with open(filename, 'w') as f:
        for frame_id, tlwhs, rles, track_ids in results:
            for rle, track_id in zip(rles, track_ids):
                # Negative ids are skipped.
                if track_id < 0:
                    continue
                rle_str = rle['counts']
                imh, imw = rle['size']
                # Ids are written with an offset of 2000 and class id 2.
                line = save_format.format(frame=frame_id, id=track_id + 2000, cid=2,
                                          imh=imh, imw=imw, rle=rle_str)
                f.write(line)
    logger.info('Save results to {}'.format(filename))


def write_mot_results(filename, results, data_type='mot'):
    """Write box tracking results in MOT or KITTI text format.

    Args:
        filename: output path; nothing is written when it is empty.
        results: iterable of ``(frame_id, tlwhs, track_ids)`` with boxes given
            as ``(x1, y1, w, h)``.
        data_type: ``'mot'``, ``'mcmot'`` or ``'lab'`` for the comma-separated
            MOT layout, ``'kitti'`` for the KITTI layout.

    Raises:
        ValueError: for any other ``data_type``.
    """
    if not filename:
        return
    _ensure_parent_dir(filename)

    if data_type in ('mot', 'mcmot', 'lab'):
        save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n'
    elif data_type == 'kitti':
        save_format = '{frame} {id} pedestrian -1 -1 -10 {x1} {y1} {x2} {y2} -1 -1 -1 -1000 -1000 -1000 -10 {score}\n'
    else:
        raise ValueError(data_type)

    with open(filename, 'w') as f:
        for frame_id, tlwhs, track_ids in results:
            if data_type == 'kitti':
                # KITTI frame numbers are written one lower than the input.
                frame_id -= 1
            for tlwh, track_id in zip(tlwhs, track_ids):
                if track_id < 0:
                    continue
                x1, y1, w, h = tlwh
                x2, y2 = x1 + w, y1 + h
                # ``results`` carries no confidence, so a constant fills the
                # ``{score}`` field (the MOT layout likewise writes a constant
                # 1); without it the KITTI template raised KeyError.
                line = save_format.format(frame=frame_id, id=track_id,
                                          x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h,
                                          score=1.0)
                f.write(line)
    logger.info('Save results to {}'.format(filename))
def get_logger(name='root'):
    """Return the logger ``name`` with a timestamped stream handler attached.

    The logger level is set to DEBUG on every call. The handler is attached
    at most once per logger, so calling this again for the same name does not
    make each message appear several times.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # Handlers created here carry a marker attribute; handlers installed by
    # other code are ignored, so the first call always adds ours.
    configured = any(getattr(h, '_from_get_logger', False)
                     for h in logger.handlers)
    if not configured:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            fmt='%(asctime)s [%(levelname)s]: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
        handler._from_get_logger = True
        logger.addHandler(handler)
    return logger
def coords2bbox(coords, extend=2):
    """Build a box around the centroid of a set of 2-D points.

    INPUTS:
     - coords: (N, 2) tensor of pixel coordinates in the next frame
     - extend: multiplier applied to the mean absolute deviation per axis

    The half-size along each axis is the mean absolute deviation from the
    centroid (at least 1) times ``extend``. Column 0 drives left/right and
    column 1 drives top/bottom.

    Returns a tuple of floats ``(top, left, bottom, right)``.
    """
    centroid = torch.mean(coords, dim=0).view(1, 2)

    def half_extent(axis):
        # sqrt(d ** 2) is the absolute deviation from the centroid.
        deviation = coords[:, axis] - centroid[:, axis]
        spread = torch.mean(torch.sqrt(torch.pow(deviation, 2)), dim=0)
        return max(spread.detach(), 1)

    half_x = half_extent(0)
    half_y = half_extent(1)
    centre_x, centre_y = centroid[:, 0], centroid[:, 1]

    top = centre_y - half_y * extend
    left = centre_x - half_x * extend
    bottom = centre_y + half_y * extend
    right = centre_x + half_x * extend
    return (top.item(), left.item(), bottom.item(), right.item())
def bboxlist2roi(bbox_list):
    """Convert a list of bboxes to roi format.

    Args:
        bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
            of images.

    Returns:
        Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
    """
    def tag_with_index(batch_idx, boxes):
        # Prefix each box with the index of the image it belongs to; only the
        # first four columns of ``boxes`` are kept.
        count = boxes.size(0)
        if count == 0:
            return boxes.new_zeros((0, 5))
        index_col = boxes.new_full((count, 1), batch_idx)
        return torch.cat([index_col, boxes[:, :4]], dim=-1)

    per_image = [tag_with_index(idx, boxes)
                 for idx, boxes in enumerate(bbox_list)]
    return torch.cat(per_image, 0)
def bbox2roi(bbox_list):
    """Convert a list of bboxes to roi format.

    Args:
        bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
            of images.

    Returns:
        Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]. An empty
        ``bbox_list`` returns an empty (0, 5) tensor.
    """
    rois_list = []
    for img_id, bboxes in enumerate(bbox_list):
        if bboxes.size(0) > 0:
            img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
            rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1)
        else:
            rois = bboxes.new_zeros((0, 5))
        rois_list.append(rois)
    if not rois_list:
        # torch.cat rejects an empty list
        return torch.zeros((0, 5))
    return torch.cat(rois_list, 0)


def temp_interp_mask(maskseq, T):
    '''
    maskseq: list of elements (RLE_mask, timestamp)
    return list of RLE_mask, length of list is T

    Timestamps missing from ``maskseq`` are filled with an all-zero mask of
    the same size as the first mask. Each filler is its own dict, so
    modifying one returned entry does not affect the others.
    '''
    size = maskseq[0][0]['size']
    blank_mask = mask_utils.encode(
        np.asfortranarray(np.zeros(size, dtype=np.uint8)))
    # store counts as str rather than the bytes returned by encode()
    blank_mask['counts'] = blank_mask['counts'].decode('ascii')
    ret = [dict(blank_mask, size=list(blank_mask['size'])) for _ in range(T)]
    for m, t in maskseq:
        ret[t] = m
    return ret


def mask_seq_jac(sa, sb):
    '''
    Pairwise temporal IoU between two collections of mask sequences.

    sa, sb: lists of sequences of RLE masks.
    return array of shape (len(sa), len(sb)); entry (i, j) is the mean of
    the per-frame IoUs of sa[i] and sb[j], paired frame by frame with zip
    (so the longer sequence is truncated).
    '''
    j = np.zeros((len(sa), len(sb)))
    for ia, a in enumerate(sa):
        for ib, b in enumerate(sb):
            ious = [mask_utils.iou([at], [bt], [False])
                    for at, bt in zip(a, b)]
            j[ia, ib] = np.mean(ious)
    return j


def skltn2mask(skltn, size):
    '''
    Rasterise a skeleton into a mask.

    skltn: iterable of keypoints; each item provides 'id', 'x' and 'y',
           whose first element is read.
    size:  (h, w) of the output mask.
    return float array of shape (h, w); 1 where the trunk polygon or a limb
    segment was drawn, 0 elsewhere. An empty skeleton gives an all-zero mask.
    '''
    h, w = size
    mask = np.zeros((h, w))
    dskltn = dict()
    for s in skltn:
        dskltn[s['id'][0]] = (int(s['x'][0]), int(s['y'][0]))
    if len(dskltn) == 0:
        return mask

    # Fill the trunk from whichever of these joints are present (the ids
    # below are 1-based, dskltn is keyed 0-based).
    trunk_polygon = []
    for k in (3, 4, 10, 13, 9):
        p = dskltn.get(k - 1)
        if p is not None:
            trunk_polygon.append(p)
    trunk_polygon = np.asarray(trunk_polygon, 'int32')
    if len(trunk_polygon) > 2:
        cv2.fillConvexPoly(mask, trunk_polygon, 1)

    xs = [p[0] for p in dskltn.values()]
    ys = [p[1] for p in dskltn.values()]
    # limb thickness: 1/20 of the larger skeleton extent, at least 8 pixels
    line_width = max(int(max(max(xs) - min(xs), max(ys) - min(ys), 0) / 20), 8)

    skeleton = [[10, 11], [11, 12], [9, 8], [8, 7],
                [10, 13], [9, 13], [13, 15], [10, 4],
                [4, 5], [5, 6], [9, 3], [3, 2], [2, 1]]
    for a, b in skeleton:
        st = dskltn.get(a - 1)
        ed = dskltn.get(b - 1)
        if st is None or ed is None:
            continue
        cv2.line(mask, st, ed, color=1, thickness=line_width)
    return mask
def pts2array(pts, num_joints=15):
    """Pack a list of keypoint records into a dense array.

    Args:
        pts: iterable of records providing 'id', 'x', 'y' and 'score';
            the first element of each field is read.
        num_joints (int): number of rows of the output (default 15).

    Returns:
        np.ndarray of shape (num_joints, 3). Row ``id`` holds
        ``[int(x), int(y), score]``; rows without a record stay zero.
    """
    arr = np.zeros((num_joints, 3))
    for s in pts:
        arr[s['id'][0]] = (int(s['x'][0]), int(s['y'][0]), s['score'][0])
    return arr


class Timer(object):
    """A simple timer.

    Attributes:
        total_time: sum of all measured intervals, in seconds.
        calls: number of completed tic/toc pairs.
        start_time: clock reading taken by the last ``tic``.
        diff: length of the most recent interval.
        average_time: ``total_time / calls``.
        duration: value returned by the last ``toc``.
    """

    def __init__(self):
        self.clear()

    def tic(self):
        """Start (or restart) an interval."""
        # perf_counter is monotonic, so an interval cannot come out wrong
        # or negative because the system clock was adjusted in between.
        self.start_time = time.perf_counter()

    def toc(self, average=True):
        """Close the current interval and return a duration in seconds.

        Args:
            average (bool): when True return the running average over all
                intervals, otherwise the length of the interval just closed.
        """
        self.diff = time.perf_counter() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        self.duration = self.average_time if average else self.diff
        return self.duration

    def clear(self):
        """Reset every statistic to its initial value."""
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.
        self.duration = 0.
import numpy as np

# Colour table, one "R G B" triple per line. The first 22 rows are distinct
# colours; every later row i is the grey level (i, i, i).
palette_str = '''0 0 0
128 0 0
0 128 0
128 128 0
0 0 128
128 0 128
0 128 128
128 128 128
64 0 0
191 0 0
64 128 0
191 128 0
64 0 128
191 0 128
64 128 128
191 128 128
0 64 0
128 64 0
0 191 0
128 191 0
0 64 128
128 64 128
22 22 22
23 23 23
24 24 24
25 25 25
26 26 26
27 27 27
28 28 28
29 29 29
30 30 30
31 31 31
32 32 32
33 33 33
34 34 34
35 35 35
36 36 36
37 37 37
38 38 38
39 39 39
40 40 40
41 41 41
42 42 42
43 43 43
44 44 44
45 45 45
46 46 46
47 47 47
48 48 48
49 49 49
50 50 50
51 51 51
52 52 52
53 53 53
54 54 54
55 55 55
56 56 56
57 57 57
58 58 58
59 59 59
60 60 60
61 61 61
62 62 62
63 63 63
64 64 64
65 65 65
66 66 66
67 67 67
68 68 68
69 69 69
70 70 70
71 71 71
72 72 72
73 73 73
74 74 74
75 75 75
76 76 76
77 77 77
78 78 78
79 79 79
80 80 80
81 81 81
82 82 82
83 83 83
84 84 84
85 85 85
86 86 86
87 87 87
88 88 88
89 89 89
90 90 90
91 91 91
92 92 92
93 93 93
94 94 94
95 95 95
96 96 96
97 97 97
98 98 98
99 99 99
100 100 100
101 101 101
102 102 102
103 103 103
104 104 104
105 105 105
106 106 106
107 107 107
108 108 108
109 109 109
110 110 110
111 111 111
112 112 112
113 113 113
114 114 114
115 115 115
116 116 116
117 117 117
118 118 118
119 119 119
120 120 120
121 121 121
122 122 122
123 123 123
124 124 124
125 125 125
126 126 126
127 127 127
128 128 128
129 129 129
130 130 130
131 131 131
132 132 132
133 133 133
134 134 134
135 135 135
136 136 136
137 137 137
138 138 138
139 139 139
140 140 140
141 141 141
142 142 142
143 143 143
144 144 144
145 145 145
146 146 146
147 147 147
148 148 148
149 149 149
150 150 150
151 151 151
152 152 152
153 153 153
154 154 154
155 155 155
156 156 156
157 157 157
158 158 158
159 159 159
160 160 160
161 161 161
162 162 162
163 163 163
164 164 164
165 165 165
166 166 166
167 167 167
168 168 168
169 169 169
170 170 170
171 171 171
172 172 172
173 173 173
174 174 174
175 175 175
176 176 176
177 177 177
178 178 178
179 179 179
180 180 180
181 181 181
182 182 182
183 183 183
184 184 184
185 185 185
186 186 186
187 187 187
188 188 188
189 189 189
190 190 190
191 191 191
192 192 192
193 193 193
194 194 194
195 195 195
196 196 196
197 197 197
198 198 198
199 199 199
200 200 200
201 201 201
202 202 202
203 203 203
204 204 204
205 205 205
206 206 206
207 207 207
208 208 208
209 209 209
210 210 210
211 211 211
212 212 212
213 213 213
214 214 214
215 215 215
216 216 216
217 217 217
218 218 218
219 219 219
220 220 220
221 221 221
222 222 222
223 223 223
224 224 224
225 225 225
226 226 226
227 227 227
228 228 228
229 229 229
230 230 230
231 231 231
232 232 232
233 233 233
234 234 234
235 235 235
236 236 236
237 237 237
238 238 238
239 239 239
240 240 240
241 241 241
242 242 242
243 243 243
244 244 244
245 245 245
246 246 246
247 247 247
248 248 248
249 249 249
250 250 250
251 251 251
252 252 252
253 253 253
254 254 254
255 255 255'''

# Integer array with one row per palette entry and one column per channel.
tensor = np.array([int(tok) for tok in palette_str.split()]).reshape(-1, 3)
Label heatmaps for visualization ''' lbl_set = palette.tensor.astype(np.uint8) sz = img.shape[:-1] # Upsample predicted soft label maps # pred_dist = pred.copy() pred_dist = cv2.resize(pred, sz[::-1])[:] # Argmax to get the hard label for index pred_lbl = np.argmax(pred_dist, axis=-1) pred_lbl = np.array(lbl_set, dtype=np.int32)[pred_lbl] mask = np.float32(pred_lbl.sum(2) > 0)[:,:,None] alpha = 0.5 img_with_label = mask * (np.float32(img) * alpha + \ np.float32(pred_lbl) * (1-alpha)) + (1-mask) * np.float32(img) # Visualize label distribution for object 1 (debugging/analysis) pred_soft = pred_dist[..., 1] pred_soft = cv2.resize(pred_soft, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_NEAREST) pred_soft = cm.jet(pred_soft)[..., :3] * 255.0 img_with_heatmap1 = np.float32(img) * 0.5 + np.float32(pred_soft) * 0.5 # Save blend image for visualization io.imwrite('%s_blend.jpg' % prefix, np.uint8(img_with_label)) if prefix[-4] != '.': # Super HACK-y imname2 = prefix + '_mask.png' else: imname2 = prefix.replace('jpg','png') # Save predicted labels for evaluation io.imwrite(imname2, np.uint8(pred_lbl)) return img_with_label, pred_lbl, img_with_heatmap1 def make_gif(video, outname='/tmp/test.gif', sz=256): if hasattr(video, 'shape'): video = video.cpu() if video.shape[0] == 3: video = video.transpose(0, 1) video = video.numpy().transpose(0, 2, 3, 1) video = (video*255).astype(np.uint8) video = [cv2.resize(vv, (sz, sz)) for vv in video] if outname is None: return np.stack(video) io.mimsave(outname, video, duration = 0.2) def get_color(idx): idx = idx * 17 color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) return color def plot_tracking(image, obs, obj_ids, scores=None, frame_id=0, fps=0.): im = np.ascontiguousarray(np.copy(image)) im_h, im_w = im.shape[:2] text_scale = max(1, image.shape[1] / 1600.) 
def plot_tracking(image, obs, obj_ids, scores=None, frame_id=0, fps=0.):
    '''
    Draw tracked observations on a copy of image.

    image:   input image; it is copied, never modified.
    obs:     observations, each either a 4-sequence (x1, y1, w, h) drawn as
             a rectangle with its id, or a dict decoded by
             pycocotools.mask.decode and blended in as a coloured mask.
    obj_ids: one id per observation; selects the colour.
    scores, frame_id, fps: accepted but unused.

    return the annotated image. NOTE: once a mask has been blended the
    result is a float array, because of the alpha arithmetic.
    raise ValueError for any other observation format.
    '''
    im = np.ascontiguousarray(np.copy(image))
    im_h, im_w = im.shape[:2]

    text_scale = max(1, image.shape[1] / 1600.)
    text_thickness = 1
    line_thickness = max(1, int(image.shape[1] / 150.))
    alpha = 0.4  # weight of the original pixels inside a mask

    for i, ob in enumerate(obs):
        obj_id = int(obj_ids[i])
        # Test for a dict first so that a mask dict that happens to have
        # four keys is not mistaken for a box.
        is_mask = isinstance(ob, dict)
        if not is_mask and len(ob) != 4:
            raise ValueError('Observation format not supported.')
        color = get_color(obj_id)
        if is_mask:
            mask = mask_utils.decode(ob)
            mask = cv2.resize(mask, (im_w, im_h),
                              interpolation=cv2.INTER_LINEAR)
            mask = (mask > 0.5).astype(np.uint8)[:, :, None]
            mask_color = mask * color
            im = (1 - mask) * im + \
                mask * (alpha * im + (1 - alpha) * mask_color)
        else:
            x1, y1, w, h = ob
            intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h)))
            cv2.rectangle(im, intbox[0:2], intbox[2:4],
                          color=color, thickness=line_thickness)
            cv2.putText(im, '{}'.format(obj_id),
                        (intbox[0], intbox[1] + 30),
                        cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255),
                        thickness=text_thickness)
    return im


def vis_pose(oriImg, points):
    '''
    Draw a 15-joint pose onto oriImg (in place) as coloured segments.

    oriImg: image to draw on; also the return value.
    points: array whose row 0 holds x and row 1 holds y per joint. A
            segment is skipped when either endpoint has a negative
            coordinate.
    '''
    # parents[n] is the joint that joint n is connected to
    parents = (1, 0, 0, 0, 0, 1, 1, 3, 4, 5, 6, 7, 8, 9, 10)
    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0],
              [170, 255, 0], [85, 255, 0], [0, 255, 0], [0, 255, 85],
              [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255],
              [0, 0, 255], [85, 0, 255], [170, 0, 255], [255, 0, 255]]
    canvas = oriImg
    x = points[0, :]
    y = points[1, :]

    for n in range(len(x)):
        pair_id = parents[n]
        x1 = int(x[pair_id])
        y1 = int(y[pair_id])
        x2 = int(x[n])
        y2 = int(y[n])
        if x1 >= 0 and y1 >= 0 and x2 >= 0 and y2 >= 0:
            cv2.line(canvas, (x1, y1), (x2, y2), colors[n], 8)

    return canvas
def draw_skeleton(aa, kp, color, show_skeleton_labels=False, dataset= "PoseTrack"):
    '''
    Draw limbs and keypoints of one pose onto aa, in place.

    aa:      image to draw on.
    kp:      per-joint sequences [x, y] or [x, y, score], ordered as the
             dataset's joint list (17 joints for "COCO", 15 for
             "PoseTrack").
    color:   colour of the limb segments.
    show_skeleton_labels: also write each joint's name next to it.
    dataset: "COCO" or "PoseTrack"; anything else raises ValueError.

    Joints with a negative coordinate are skipped. When a score is present,
    limbs and labels need a score above 0.1; keypoints are drawn in
    (0, 0, 255) above 1.1 and in (255, 0, 0) above 0.1. Joints without a
    score are always drawn, in (0, 0, 255).
    '''
    if dataset == "COCO":
        skeleton = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], [7, 13], [6, 7], [6, 8],
                    [7, 9], [8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]
        kp_names = ['nose', 'l_eye', 'r_eye', 'l_ear', 'r_ear', 'l_shoulder',
                    'r_shoulder', 'l_elbow', 'r_elbow', 'l_wrist', 'r_wrist',
                    'l_hip', 'r_hip', 'l_knee', 'r_knee', 'l_ankle', 'r_ankle']
    elif dataset == "PoseTrack":
        skeleton = [[10, 11], [11, 12], [9, 8], [8, 7],
                    [10, 13], [9, 13], [13, 15], [10, 4],
                    [4, 5], [5, 6], [9, 3], [3, 2], [2, 1]]
        kp_names = ['right_ankle', 'right_knee', 'right_pelvis',
                    'left_pelvis', 'left_knee', 'left_ankle',
                    'right_wrist', 'right_elbow', 'right_shoulder',
                    'left_shoulder', 'left_elbow', 'left_wrist',
                    'upper_neck', 'nose', 'head']
    else:
        raise ValueError('Unsupported dataset: {!r}'.format(dataset))

    limb_thickness = max(1, int(aa.shape[1] / 150.))

    # skeleton indices are 1-based
    for i, j in skeleton:
        a, b = kp[i - 1], kp[j - 1]
        if a[0] < 0 or a[1] < 0 or b[0] < 0 or b[1] < 0:
            continue
        if len(a) <= 2 or (a[2] > 0.1 and b[2] > 0.1):
            st = (int(a[0]), int(a[1]))
            ed = (int(b[0]), int(b[1]))
            cv2.line(aa, st, ed, color, limb_thickness)

    for j in range(len(kp)):
        point = kp[j]
        if point[0] < 0 or point[1] < 0:
            continue
        pt = (int(point[0]), int(point[1]))
        unscored = len(point) <= 2
        if unscored or point[2] > 1.1:
            cv2.circle(aa, pt, 2, (0, 0, 255), 2)
        elif point[2] > 0.1:
            cv2.circle(aa, pt, 2, (255, 0, 0), 2)

        # NOTE(review): labels are assumed to be drawn only for joints that
        # passed the coordinate check above - confirm against the original
        # indentation.
        if show_skeleton_labels and (unscored or point[2] > 0.1):
            # cv2.putText needs integer pixel coordinates, so use pt
            cv2.putText(aa, kp_names[j], pt,
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0))