Repository: SysCV/MaskFreeVIS Branch: main Commit: 0e7018b7fe61 Files: 215 Total size: 1.3 MB Directory structure: gitextract_tlc1nw96/ ├── DATASET_prepare.md ├── LICENSE ├── README.md ├── configs/ │ ├── coco/ │ │ └── instance-segmentation/ │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ └── maskformer2_R50_bs16_50ep.yaml │ └── youtubevis_2019/ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ ├── Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml │ ├── Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml │ ├── swin/ │ │ └── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ ├── video_maskformer2_R101_bs16_8ep.yaml │ ├── video_maskformer2_R50_bs16_8ep.yaml │ └── video_maskformer2_R50_bs16_8ep_swin.yaml ├── demo/ │ ├── README.md │ ├── demo.py │ └── predictor.py ├── demo_video/ │ ├── README.md │ ├── demo.py │ ├── predictor.py │ └── visualizer.py ├── mask2former/ │ ├── __init__.py │ ├── config.py │ ├── data/ │ │ ├── __init__.py │ │ ├── dataset_mappers/ │ │ │ ├── __init__.py │ │ │ ├── __init__.py.new │ │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ │ ├── mask_former_instance_dataset_mapper.py │ │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ │ └── mask_former_semantic_dataset_mapper.py │ │ └── datasets/ │ │ ├── __init__.py │ │ ├── register_ade20k_full.py │ │ ├── register_ade20k_instance.py │ │ ├── register_ade20k_panoptic.py │ │ ├── register_coco_panoptic_annos_semseg.py │ │ ├── register_coco_stuff_10k.py │ │ ├── register_mapillary_vistas.py │ │ └── register_mapillary_vistas_panoptic.py │ ├── evaluation/ │ │ ├── __init__.py │ │ ├── __init__.py.new │ │ └── instance_evaluation.py │ ├── maskformer_model.py │ ├── modeling/ │ │ ├── __init__.py │ │ ├── backbone/ │ │ │ ├── __init__.py │ │ │ ├── __init__.py.new │ │ │ └── swin.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ ├── meta_arch/ │ │ │ ├── __init__.py │ │ │ ├── __init__.py.new │ │ │ ├── mask_former_head.py │ │ │ └── per_pixel_baseline.py │ │ ├── pixel_decoder/ │ │ │ ├── __init__.py │ │ │ ├── __init__.py.new │ │ │ ├── fpn.py │ │ │ ├── msdeformattn.py │ │ │ └── ops/ │ │ │ ├── functions/ │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ ├── make.sh │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ ├── setup.py │ │ │ ├── src/ │ │ │ │ ├── cpu/ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda/ │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ └── test.py │ │ └── transformer_decoder/ │ │ ├── __init__.py │ │ ├── mask2former_transformer_decoder.py │ │ ├── maskformer_transformer_decoder.py │ │ ├── position_encoding.py │ │ └── transformer.py │ ├── test_time_augmentation.py │ └── utils/ │ ├── __init__.py │ ├── __init__.py.new │ └── misc.py ├── mask2former_video/ │ ├── __init__.py │ ├── config.py │ ├── data_video/ │ │ ├── __init__.py │ │ ├── augmentation.py │ │ ├── build.py │ │ ├── combined_loader.py │ │ ├── dataset_mapper.py │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── builtin.py │ │ │ ├── ytvis.py │ │ │ └── ytvis_api/ │ │ │ ├── __init__.py │ │ │ ├── ytvos.py │ │ │ └── ytvoseval.py │ │ └── ytvis_eval.py │ ├── modeling/ │ │ ├── __init__.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ └── transformer_decoder/ │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ └── video_mask2former_transformer_decoder.py │ ├── utils/ │ │ ├── __init__.py │ │ ├── __init__.py.new │ │ └── 
memory.py │ └── video_maskformer_model.py ├── mfvis_nococo/ │ ├── __init__.py │ ├── configs/ │ │ └── youtubevis_2019/ │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── video_maskformer2_R101_bs16_8ep_coco.yaml │ │ ├── video_maskformer2_R50_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep_coco.yaml │ ├── mask2former/ │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ ├── dataset_mappers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── __init__.py.new │ │ │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ │ │ ├── mask_former_instance_dataset_mapper.py │ │ │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ │ │ └── mask_former_semantic_dataset_mapper.py │ │ │ └── datasets/ │ │ │ ├── __init__.py │ │ │ ├── register_ade20k_full.py │ │ │ ├── register_ade20k_instance.py │ │ │ ├── register_ade20k_panoptic.py │ │ │ ├── register_coco_panoptic_annos_semseg.py │ │ │ ├── register_coco_stuff_10k.py │ │ │ ├── register_mapillary_vistas.py │ │ │ └── register_mapillary_vistas_panoptic.py │ │ ├── evaluation/ │ │ │ ├── __init__.py │ │ │ ├── __init__.py.new │ │ │ └── instance_evaluation.py │ │ ├── maskformer_model.py │ │ ├── modeling/ │ │ │ ├── __init__.py │ │ │ ├── backbone/ │ │ │ │ ├── __init__.py │ │ │ │ ├── __init__.py.new │ │ │ │ └── swin.py │ │ │ ├── criterion.py │ │ │ ├── matcher.py │ │ │ ├── meta_arch/ │ │ │ │ ├── __init__.py │ │ │ │ ├── __init__.py.new │ │ │ │ ├── mask_former_head.py │ │ │ │ └── per_pixel_baseline.py │ │ │ ├── pixel_decoder/ │ │ │ │ ├── __init__.py │ │ │ │ ├── __init__.py.new │ │ │ │ ├── fpn.py │ │ │ │ ├── msdeformattn.py │ │ │ │ └── ops/ │ │ │ │ ├── functions/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── make.sh │ │ │ │ ├── modules/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── setup.py │ │ │ │ ├── src/ │ │ │ │ │ ├── cpu/ │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ │ ├── cuda/ │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ │ ├── ms_deform_attn.h │ │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ │ └── transformer_decoder/ │ │ │ ├── __init__.py │ │ │ ├── mask2former_transformer_decoder.py │ │ │ ├── maskformer_transformer_decoder.py │ │ │ ├── position_encoding.py │ │ │ └── transformer.py │ │ ├── test_time_augmentation.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── __init__.py.new │ │ └── misc.py │ ├── mask2former_video/ │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_video/ │ │ │ ├── __init__.py │ │ │ ├── augmentation.py │ │ │ ├── build.py │ │ │ ├── dataset_mapper.py │ │ │ ├── datasets/ │ │ │ │ ├── __init__.py │ │ │ │ ├── builtin.py │ │ │ │ ├── ytvis.py │ │ │ │ └── ytvis_api/ │ │ │ │ ├── __init__.py │ │ │ │ ├── ytvos.py │ │ │ │ └── ytvoseval.py │ │ │ └── ytvis_eval.py │ │ ├── modeling/ │ │ │ ├── __init__.py │ │ │ ├── criterion.py │ │ │ ├── matcher.py │ │ │ └── transformer_decoder/ │ │ │ ├── __init__.py │ │ │ ├── position_encoding.py │ │ │ └── video_mask2former_transformer_decoder.py │ │ ├── utils/ │ │ │ ├── __init__.py │ │ │ └── memory.py │ │ └── video_maskformer_model.py │ ├── scripts/ │ │ ├── eval_8gpu_mask2former_r101_video.sh │ │ ├── train_8gpu_mask2former_r101_video_coco.sh │ │ ├── train_8gpu_mask2former_r50_video.sh │ │ ├── train_8gpu_mask2former_r50_video_coco.sh │ │ ├── visual_video_r101.sh │ │ └── visual_video_r50.sh │ └── train_net_video.py ├── requirements.txt ├── scripts/ │ ├── 
eval_8gpu_mask2former_r101_video.sh │ ├── eval_8gpu_mask2former_r50_video.sh │ ├── eval_8gpu_mask2former_swinl_video.sh │ ├── train_8gpu_mask2former_r101_video.sh │ ├── train_8gpu_mask2former_r50_video.sh │ ├── train_8gpu_mask2former_swinl_video.sh │ └── visual_video.sh ├── tools/ │ ├── README.md │ ├── analyze_model.py │ ├── convert-pretrained-swin-model-to-d2.py │ ├── convert-torchvision-to-d2.py │ ├── evaluate_coco_boundary_ap.py │ └── evaluate_pq_for_semantic_segmentation.py ├── train_net.py ├── train_net_video.py └── util/ ├── __init__.py ├── box_ops.py ├── misc.py └── plot_utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: DATASET_prepare.md ================================================ # Prepare Datasets for MaskFreeVIS A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc). This document explains how to setup the builtin datasets so they can be used by the above APIs. [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`, and how to add new datasets to them. MaskFreeVIS has builtin support for a few datasets. The datasets are assumed to exist in a directory specified by the environment variable `DETECTRON2_DATASETS`. You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. If left unset, the default is `./datasets` relative to your current working directory. The model zoo contains configs and models that use these builtin datasets. We will convert each object mask to box when after reading the corresponding instance annotation. ## Expected dataset structure for [COCO](https://cocodataset.org/#download): ``` coco/ annotations/ instances_{train,val}2017.json panoptic_{train,val}2017.json {train,val}2017/ # image files that are mentioned in the corresponding json panoptic_{train,val}2017/ # png annotations panoptic_semseg_{train,val}2017/ # generated by the script mentioned below ``` Install panopticapi by: ``` pip install git+https://github.com/cocodataset/panopticapi.git ``` Then, run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py`, to extract semantic annotations from panoptic annotations (only used for evaluation). ## Expected dataset structure for [YouTubeVIS 2019](https://competitions.codalab.org/competitions/20128): ``` ytvis_2019/ {train,valid,test}.json {train,valid,test}/ Annotations/ JPEGImages/ ``` ## Expected dataset structure for [YouTubeVIS 2021](https://competitions.codalab.org/competitions/28988): ``` ytvis_2021/ {train,valid,test}.json {train,valid,test}/ Annotations/ JPEGImages/ ``` ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # MaskFreeVIS Mask-Free Video Instance Segmentation [CVPR 2023]. 
This is the official PyTorch implementation of [MaskFreeVIS](https://github.com/SysCV/MaskFreeVis/) built on the open-source detectron2. We aim to **remove the necessity for expensive video masks and even image masks** for training VIS models. Our project website contains more information, including the visual video comparison: [vis.xyz/pub/maskfreevis](https://www.vis.xyz/pub/maskfreevis/). > [**Mask-Free Video Instance Segmentation**](https://arxiv.org/abs/2303.15904) > Lei Ke, Martin Danelljan, Henghui Ding, Yu-Wing Tai, Chi-Keung Tang, Fisher Yu \ > CVPR 2023 Highlights ----------------- - **High-performing** video instance segmentation **without using any video mask or even image mask** labels. Using SwinL and built on Mask2Former, MaskFreeVIS achieves 56.0 AP on YTVIS without using any video mask labels. Using ResNet-101, MaskFreeVIS achieves 49.1 AP without using video masks, and 47.3 AP when only using a COCO mask-initialized model. - **Novelty:** a new **parameter-free** Temporal KNN-patch Loss (TK-Loss), which leverages temporal mask consistency using unsupervised one-to-k patch correspondence. - **Simple:** the TK-Loss can be flexibly integrated into state-of-the-art transformer-based VIS models, with no trainable parameters. Visualization results of MaskFreeVIS -----------------
Introduction ----------------- The recent advancement in Video Instance Segmentation (VIS) has largely been driven by the use of deeper and increasingly data-hungry transformer-based models. However, video masks are tedious and expensive to annotate, limiting the scale and diversity of existing VIS datasets. In this work, we aim to remove the mask-annotation requirement. We propose MaskFreeVIS, achieving highly competitive VIS performance, while only using bounding box annotations for the object state. We leverage the rich temporal mask consistency constraints in videos by introducing the Temporal KNN-patch Loss (TK-Loss), providing strong mask supervision without any labels. Our TK-Loss finds one-to-many matches across frames, through an efficient patch-matching step followed by a K-nearest neighbor selection. A consistency loss is then enforced on the found matches. Our mask-free objective is simple to implement, has no trainable parameters, is computationally efficient, yet outperforms baselines employing, e.g., state-of-the-art optical flow to enforce temporal mask consistency. We validate MaskFreeVIS on the YouTube-VIS 2019/2021, OVIS and BDD100K MOTS benchmarks. The results clearly demonstrate the efficacy of our method by drastically narrowing the gap between fully and weakly-supervised VIS performance. (A simplified sketch of the TK-Loss idea is shown after the first model zoo table below.) Methods ----------------- (method overview figure) ### **Installation** Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. ### Requirements - Linux or macOS with Python 3.6 - PyTorch 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that your PyTorch version matches the one required by Detectron2. - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). - OpenCV is optional but needed by the demo and visualization - `pip install -r requirements.txt` ### CUDA kernel for MSDeformAttn After preparing the required environment, run the following command to compile the CUDA kernel for MSDeformAttn: `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit. ```bash cd mask2former/modeling/pixel_decoder/ops sh make.sh ``` #### Building on another system To build on a system that does not have a GPU device but provides the drivers: ```bash TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install ``` ### Example conda environment setup ```bash conda create --name maskfreevis python=3.8 -y conda activate maskfreevis conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia pip install -U opencv-python # under your working directory git clone git@github.com:facebookresearch/detectron2.git cd detectron2 pip install -e . cd .. git clone https://github.com/SysCV/MaskFreeVIS.git cd MaskFreeVIS pip install -r requirements.txt cd mask2former/modeling/pixel_decoder/ops sh make.sh ``` ### **Dataset preparation** Please see the document [here](DATASET_prepare.md). ### **Model Zoo** ## Video Instance Segmentation (YouTubeVIS) Using COCO image masks **without YTVIS video masks** during training:

| Config Name | Backbone | AP | download | Training Script | COCO Init Weight |
| --- | --- | --- | --- | --- | --- |
| MaskFreeVIS | R50 | 46.6 | model | script | Init |
| MaskFreeVIS | R101 | 49.1 | model | script | Init |
| MaskFreeVIS | Swin-L | 56.0 | model | script | Init |
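The Introduction above describes the TK-Loss as an efficient patch-matching step across frames, followed by K-nearest-neighbor match selection and a consistency loss on the matched locations. Below is a deliberately simplified, schematic PyTorch sketch of that idea for a single frame pair; it is not the repository's actual criterion (the real implementation lives in the `mask2former_video` modeling code and differs in its patch distance, match validation and loss terms), and the function name, thresholds and wrap-around shift handling are illustrative assumptions.

```python
# Schematic TK-Loss-style sketch (illustrative only, not the repo's implementation).
import torch
import torch.nn.functional as F

def tk_style_consistency_loss(img_t, img_tp1, mask_t, mask_tp1,
                              patch_size=3, radius=2, k=5, match_thresh=0.05):
    """img_*: (3, H, W) frames in [0, 1]; mask_*: (H, W) predicted mask probabilities."""
    H, W = img_t.shape[-2:]

    def patches(img):
        # Describe every pixel by its local patch: (3 * patch_size**2, H * W).
        return F.unfold(img[None], kernel_size=patch_size, padding=patch_size // 2)[0]

    pt, ptp1 = patches(img_t), patches(img_tp1)
    c, hw = pt.shape

    # Candidate matches: every spatial shift within the search radius
    # (torch.roll wraps around at the border -- a simplification).
    dists, cand_masks = [], []
    for dy in range(-radius, radius + 1):
        for dx in range(-radius, radius + 1):
            shifted_patch = torch.roll(ptp1.view(c, H, W), shifts=(dy, dx), dims=(1, 2)).reshape(c, hw)
            shifted_mask = torch.roll(mask_tp1, shifts=(dy, dx), dims=(0, 1)).reshape(hw)
            dists.append(((pt - shifted_patch) ** 2).mean(0))  # patch distance per location
            cand_masks.append(shifted_mask)
    dists = torch.stack(dists, 0)        # (num_shifts, H*W)
    cand_masks = torch.stack(cand_masks, 0)

    # One-to-k correspondence: keep the K most similar patches, drop poor matches.
    topk_d, topk_idx = dists.topk(k, dim=0, largest=False)
    topk_m = cand_masks.gather(0, topk_idx)
    valid = (topk_d < match_thresh).float()

    # Temporal consistency: matched locations should agree on the mask probability.
    diff = (mask_t.reshape(1, hw) - topk_m).abs()
    return (diff * valid).sum() / valid.sum().clamp(min=1.0)
```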
**For the two training settings below, which do not use pseudo COCO image masks** for joint video training, please change to the folder: ``` cd mfvis_nococo ``` 1) Only using a **COCO mask-initialized model without YTVIS video masks** during training:

| Config Name | Backbone | AP | download | Training Script | COCO Init Weight |
| --- | --- | --- | --- | --- | --- |
| MaskFreeVIS | R50 | 43.8 | model | script | Init |
| MaskFreeVIS | R101 | 47.3 | model | script | Init |
2) Only using a **COCO box-initialized model without YTVIS video masks** during training:

| Config Name | Backbone | AP | download | Training Script | COCO Box Init Weight |
| --- | --- | --- | --- | --- | --- |
| MaskFreeVIS | R50 | 42.5 | model | script | Init |
Please see our script folder. ## Inference & Evaluation First download the provided trained model from our model zoo table and put them into the mfvis_models. ``` mkdir mfvis_models ``` Refer to our [scripts folder](./scripts) for more commands: Example evaluation scripts: ``` bash scripts/eval_8gpu_mask2former_r50_video.sh bash scripts/eval_8gpu_mask2former_r101_video.sh bash scripts/eval_8gpu_mask2former_swinl_video.sh ``` ## Results Visualization Example visualization script: ``` bash scripts/visual_video.sh ``` Citation --------------- If you find MaskFreeVIS useful in your research or refer to the provided baseline results, please star :star: this repository and consider citing :pencil:: ``` @inproceedings{maskfreevis, author={Ke, Lei and Danelljan, Martin and Ding, Henghui and Tai, Yu-Wing and Tang, Chi-Keung and Yu, Fisher}, title={Mask-Free Video Instance Segmentation}, booktitle = {CVPR}, year = {2023} } ``` ## Acknowledgments - Thanks [BoxInst](https://github.com/aim-uofa/AdelaiDet/blob/master/configs/BoxInst/README.md) image-based instance segmentation losses. - Thanks [Mask2Former](https://github.com/facebookresearch/Mask2Former) and [VMT](https://github.com/SysCV/vmt) for providing useful inference and evaluation toolkits. ================================================ FILE: configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml ================================================ MODEL: BACKBONE: FREEZE_AT: 0 NAME: "build_resnet_backbone" WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] RESNETS: DEPTH: 50 STEM_TYPE: "basic" # not used STEM_OUT_CHANNELS: 64 STRIDE_IN_1X1: False OUT_FEATURES: ["res2", "res3", "res4", "res5"] # NORM: "SyncBN" RES5_MULTI_GRID: [1, 1, 1] # not used DATASETS: TRAIN: ("coco_2017_train",) TEST: ("coco_2017_val",) SOLVER: IMS_PER_BATCH: 16 BASE_LR: 0.0001 STEPS: (327778, 355092) MAX_ITER: 368750 WARMUP_FACTOR: 1.0 WARMUP_ITERS: 10 WEIGHT_DECAY: 0.05 OPTIMIZER: "ADAMW" BACKBONE_MULTIPLIER: 0.1 CLIP_GRADIENTS: ENABLED: True CLIP_TYPE: "full_model" CLIP_VALUE: 0.01 NORM_TYPE: 2.0 AMP: ENABLED: True INPUT: IMAGE_SIZE: 1024 MIN_SCALE: 0.1 MAX_SCALE: 2.0 FORMAT: "RGB" DATASET_MAPPER_NAME: "coco_instance_lsj" TEST: EVAL_PERIOD: 5000 DATALOADER: FILTER_EMPTY_ANNOTATIONS: True NUM_WORKERS: 4 VERSION: 2 ================================================ FILE: configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml ================================================ _BASE_: Base-COCO-InstanceSegmentation.yaml OUTPUT_DIR: './output/' MODEL: META_ARCHITECTURE: "MaskFormer" SEM_SEG_HEAD: NAME: "MaskFormerHead" IGNORE_VALUE: 255 NUM_CLASSES: 80 LOSS_WEIGHT: 1.0 CONVS_DIM: 256 MASK_DIM: 256 NORM: "GN" # pixel decoder PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" IN_FEATURES: ["res2", "res3", "res4", "res5"] DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] COMMON_STRIDE: 4 TRANSFORMER_ENC_LAYERS: 6 MASK_FORMER: TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" DEEP_SUPERVISION: True NO_OBJECT_WEIGHT: 0.1 CLASS_WEIGHT: 2.0 MASK_WEIGHT: 5.0 DICE_WEIGHT: 5.0 HIDDEN_DIM: 256 NUM_OBJECT_QUERIES: 100 NHEADS: 8 DROPOUT: 0.0 DIM_FEEDFORWARD: 2048 ENC_LAYERS: 0 PRE_NORM: False ENFORCE_INPUT_PROJ: False SIZE_DIVISIBILITY: 32 DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query TRAIN_NUM_POINTS: 12544 OVERSAMPLE_RATIO: 3.0 IMPORTANCE_SAMPLE_RATIO: 0.75 TEST: 
SEMANTIC_ON: False INSTANCE_ON: True PANOPTIC_ON: False OVERLAP_THRESHOLD: 0.8 OBJECT_MASK_THRESHOLD: 0.8 ================================================ FILE: configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml ================================================ MODEL: BACKBONE: FREEZE_AT: 0 NAME: "build_resnet_backbone" WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] MASK_ON: True RESNETS: DEPTH: 50 STEM_TYPE: "basic" # not used STEM_OUT_CHANNELS: 64 STRIDE_IN_1X1: False OUT_FEATURES: ["res2", "res3", "res4", "res5"] # NORM: "SyncBN" RES5_MULTI_GRID: [1, 1, 1] # not used DATASETS: TRAIN: ("ytvis_2019_train", "coco_2017_train_fake",) TEST: ("ytvis_2019_val",) SOLVER: IMS_PER_BATCH: 16 BASE_LR: 0.0001 STEPS: (4000,) MAX_ITER: 6000 WARMUP_FACTOR: 1.0 WARMUP_ITERS: 10 WEIGHT_DECAY: 0.05 OPTIMIZER: "ADAMW" BACKBONE_MULTIPLIER: 0.1 CLIP_GRADIENTS: ENABLED: True CLIP_TYPE: "full_model" CLIP_VALUE: 0.01 NORM_TYPE: 2.0 AMP: ENABLED: True INPUT: MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" RANDOM_FLIP: "flip_by_clip" AUGMENTATIONS: [] MIN_SIZE_TRAIN: (360, 480) MIN_SIZE_TEST: 360 CROP: ENABLED: False TYPE: "absolute_range" SIZE: (600, 720) FORMAT: "RGB" TEST: EVAL_PERIOD: 0 DATALOADER: FILTER_EMPTY_ANNOTATIONS: False NUM_WORKERS: 4 VERSION: 2 ================================================ FILE: configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml ================================================ MODEL: BACKBONE: FREEZE_AT: 0 NAME: "build_resnet_backbone" WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] MASK_ON: True RESNETS: DEPTH: 50 STEM_TYPE: "basic" # not used STEM_OUT_CHANNELS: 64 STRIDE_IN_1X1: False OUT_FEATURES: ["res2", "res3", "res4", "res5"] # NORM: "SyncBN" RES5_MULTI_GRID: [1, 1, 1] # not used DATASETS: TRAIN: ("coco_2017_train_fake", "ytvis_2019_train",) TEST: ("ytvis_2019_val",) SOLVER: IMS_PER_BATCH: 8 BASE_LR: 0.00005 STEPS: (75000,) MAX_ITER: 140000 WARMUP_FACTOR: 1.0 WARMUP_ITERS: 10 WEIGHT_DECAY: 0.05 OPTIMIZER: "ADAMW" BACKBONE_MULTIPLIER: 0.1 CLIP_GRADIENTS: ENABLED: True CLIP_TYPE: "full_model" CLIP_VALUE: 0.01 NORM_TYPE: 2.0 AMP: ENABLED: True INPUT: MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" RANDOM_FLIP: "flip_by_clip" AUGMENTATIONS: [] MIN_SIZE_TRAIN: (360, 480) MIN_SIZE_TEST: 360 CROP: ENABLED: False TYPE: "absolute_range" SIZE: (600, 720) FORMAT: "RGB" TEST: EVAL_PERIOD: 0 DATALOADER: FILTER_EMPTY_ANNOTATIONS: False NUM_WORKERS: 4 VERSION: 2 ================================================ FILE: configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml ================================================ MODEL: BACKBONE: FREEZE_AT: 0 NAME: "build_resnet_backbone" WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] MASK_ON: True RESNETS: DEPTH: 50 STEM_TYPE: "basic" # not used STEM_OUT_CHANNELS: 64 STRIDE_IN_1X1: False OUT_FEATURES: ["res2", "res3", "res4", "res5"] # NORM: "SyncBN" RES5_MULTI_GRID: [1, 1, 1] # not used DATASETS: TRAIN: ("coco_2017_train_fake", "ytvis_2019_train",) TEST: ("ytvis_2019_val",) SOLVER: IMS_PER_BATCH: 16 BASE_LR: 0.0001 STEPS: (37500,) MAX_ITER: 70000 WARMUP_FACTOR: 1.0 WARMUP_ITERS: 10 WEIGHT_DECAY: 0.05 OPTIMIZER: "ADAMW" BACKBONE_MULTIPLIER: 0.1 CLIP_GRADIENTS: ENABLED: True CLIP_TYPE: "full_model" CLIP_VALUE: 0.01 
NORM_TYPE: 2.0 AMP: ENABLED: True INPUT: MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" RANDOM_FLIP: "flip_by_clip" AUGMENTATIONS: [] MIN_SIZE_TRAIN: (360, 480) MIN_SIZE_TEST: 360 CROP: ENABLED: False TYPE: "absolute_range" SIZE: (600, 720) FORMAT: "RGB" TEST: EVAL_PERIOD: 0 DATALOADER: FILTER_EMPTY_ANNOTATIONS: False NUM_WORKERS: 4 VERSION: 2 ================================================ FILE: configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml ================================================ _BASE_: ../video_maskformer2_R50_bs16_8ep_swin.yaml OUTPUT_DIR: 'swinl_joint_withcoco' MODEL: WEIGHTS: "./pretrained_model/model_final_e5f453.pkl" BACKBONE: NAME: "D2SwinTransformer" SWIN: EMBED_DIM: 192 DEPTHS: [2, 2, 18, 2] NUM_HEADS: [6, 12, 24, 48] WINDOW_SIZE: 12 APE: False DROP_PATH_RATE: 0.3 PATCH_NORM: True PRETRAIN_IMG_SIZE: 384 #WEIGHTS: "model_final_e5f453.pkl" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] MASK_FORMER: NUM_OBJECT_QUERIES: 200 INPUT: MIN_SIZE_TEST: 480 ================================================ FILE: configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml ================================================ _BASE_: video_maskformer2_R50_bs16_8ep.yaml OUTPUT_DIR: './r101_coco_joint/' MODEL: WEIGHTS: "pretrained_model/model_final_eba159.pkl" RESNETS: DEPTH: 101 STEM_TYPE: "basic" # not used STEM_OUT_CHANNELS: 64 STRIDE_IN_1X1: False OUT_FEATURES: ["res2", "res3", "res4", "res5"] # NORM: "SyncBN" RES5_MULTI_GRID: [1, 1, 1] # not used ================================================ FILE: configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml ================================================ _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml OUTPUT_DIR: './r50_coco_joint/' SEED: 29118357 MODEL: WEIGHTS: "./pretrained_model/model_final_3c8ec9.pkl" META_ARCHITECTURE: "VideoMaskFormer" SEM_SEG_HEAD: NAME: "MaskFormerHead" IGNORE_VALUE: 255 NUM_CLASSES: 40 LOSS_WEIGHT: 1.0 CONVS_DIM: 256 MASK_DIM: 256 NORM: "GN" # pixel decoder PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" IN_FEATURES: ["res2", "res3", "res4", "res5"] DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] COMMON_STRIDE: 4 TRANSFORMER_ENC_LAYERS: 6 MASK_FORMER: TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" DEEP_SUPERVISION: True NO_OBJECT_WEIGHT: 0.1 CLASS_WEIGHT: 2.0 MASK_WEIGHT: 5.0 DICE_WEIGHT: 5.0 HIDDEN_DIM: 256 NUM_OBJECT_QUERIES: 100 NHEADS: 8 DROPOUT: 0.0 DIM_FEEDFORWARD: 2048 ENC_LAYERS: 0 PRE_NORM: False ENFORCE_INPUT_PROJ: False SIZE_DIVISIBILITY: 32 DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query TRAIN_NUM_POINTS: 20000 #20000 #12544 OVERSAMPLE_RATIO: 3.0 IMPORTANCE_SAMPLE_RATIO: 0.75 TEST: SEMANTIC_ON: False INSTANCE_ON: True PANOPTIC_ON: False OVERLAP_THRESHOLD: 0.8 OBJECT_MASK_THRESHOLD: 0.8 INPUT: MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" PSEUDO: SAMPLING_FRAME_NUM: 4 SAMPLING_FRAME_RANGE: 20 AUGMENTATIONS: ['rotation'] MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512) MAX_SIZE_TRAIN: 768 CROP: ENABLED: True TYPE: "absolute_range" SIZE: (384, 600) LSJ_AUG: ENABLED: False IMAGE_SIZE: 768 MIN_SCALE: 0.1 MAX_SCALE: 2.0 DATALOADER: FILTER_EMPTY_ANNOTATIONS: True # NUM_WORKERS: 8 ================================================ FILE: configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep_swin.yaml ================================================ _BASE_: 
Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml OUTPUT_DIR: './swinl_joint_withcoco/' SEED: 29118357 MODEL: WEIGHTS: "./pretrained_model/model_final_3c8ec9.pkl" META_ARCHITECTURE: "VideoMaskFormer" SEM_SEG_HEAD: NAME: "MaskFormerHead" IGNORE_VALUE: 255 NUM_CLASSES: 40 LOSS_WEIGHT: 1.0 CONVS_DIM: 256 MASK_DIM: 256 NORM: "GN" # pixel decoder PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" IN_FEATURES: ["res2", "res3", "res4", "res5"] DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] COMMON_STRIDE: 4 TRANSFORMER_ENC_LAYERS: 6 MASK_FORMER: TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" DEEP_SUPERVISION: True NO_OBJECT_WEIGHT: 0.1 CLASS_WEIGHT: 2.0 MASK_WEIGHT: 5.0 DICE_WEIGHT: 5.0 HIDDEN_DIM: 256 NUM_OBJECT_QUERIES: 100 NHEADS: 8 DROPOUT: 0.0 DIM_FEEDFORWARD: 2048 ENC_LAYERS: 0 PRE_NORM: False ENFORCE_INPUT_PROJ: False SIZE_DIVISIBILITY: 32 DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query TRAIN_NUM_POINTS: 20000 #20000 #12544 OVERSAMPLE_RATIO: 3.0 IMPORTANCE_SAMPLE_RATIO: 0.75 TEST: SEMANTIC_ON: False INSTANCE_ON: True PANOPTIC_ON: False OVERLAP_THRESHOLD: 0.8 OBJECT_MASK_THRESHOLD: 0.8 INPUT: MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" PSEUDO: SAMPLING_FRAME_NUM: 4 SAMPLING_FRAME_RANGE: 20 AUGMENTATIONS: ['rotation'] MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512) MAX_SIZE_TRAIN: 768 CROP: ENABLED: True TYPE: "absolute_range" SIZE: (384, 600) LSJ_AUG: ENABLED: False IMAGE_SIZE: 768 MIN_SCALE: 0.1 MAX_SCALE: 2.0 DATALOADER: FILTER_EMPTY_ANNOTATIONS: True # NUM_WORKERS: 8 ================================================ FILE: demo/README.md ================================================ ## Mask2Former Demo We provide a command line tool to run a simple demo of builtin configs. The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 
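The demo README above refers to the command line tool in demo/demo.py. As a rough orientation, the following is a minimal Python sketch of what that script does internally; the config path, weights file and image path are placeholders, and `from predictor import ...` assumes the demo/ directory is on `sys.path`, as demo/demo.py arranges.

```python
# Minimal sketch of the image demo flow in demo/demo.py (paths are placeholders).
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.projects.deeplab import add_deeplab_config

from mask2former import add_maskformer2_config
from predictor import VisualizationDemo  # demo/predictor.py

cfg = get_cfg()
add_deeplab_config(cfg)
add_maskformer2_config(cfg)
cfg.merge_from_file("configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml")
cfg.merge_from_list(["MODEL.WEIGHTS", "mfvis_models/your_checkpoint.pkl"])  # placeholder weights
cfg.freeze()

demo = VisualizationDemo(cfg)
img = read_image("input.jpg", format="BGR")            # BGR, as in demo/demo.py
predictions, vis_output = demo.run_on_image(img, 0.5)  # 0.5 = confidence threshold
vis_output.save("output.jpg")
```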
================================================ FILE: demo/demo.py ================================================ # Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py import argparse import glob import multiprocessing as mp import os # fmt: off import sys sys.path.insert(1, os.path.join(sys.path[0], '..')) # fmt: on import tempfile import time import warnings import cv2 import numpy as np import tqdm from detectron2.config import get_cfg from detectron2.data.detection_utils import read_image from detectron2.projects.deeplab import add_deeplab_config from detectron2.utils.logger import setup_logger from mask2former import add_maskformer2_config from predictor import VisualizationDemo # constants WINDOW_NAME = "mask2former demo" def setup_cfg(args): # load config from file and command-line arguments cfg = get_cfg() add_deeplab_config(cfg) add_maskformer2_config(cfg) cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() return cfg def get_parser(): parser = argparse.ArgumentParser(description="maskformer2 demo for builtin configs") parser.add_argument( "--config-file", default="configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.") parser.add_argument("--video-input", help="Path to video file.") parser.add_argument( "--input", nargs="+", help="A list of space separated input images; " "or a single glob pattern such as 'directory/*.jpg'", ) parser.add_argument( "--output", help="A file or directory to save output visualizations. " "If not given, will show output in an OpenCV window.", ) parser.add_argument( "--confidence-threshold", type=float, default=0.5, help="Minimum score for instance predictions to be shown", ) parser.add_argument( "--opts", help="Modify config options using the command-line 'KEY VALUE' pairs", default=[], nargs=argparse.REMAINDER, ) return parser def test_opencv_video_format(codec, file_ext): with tempfile.TemporaryDirectory(prefix="video_format_test") as dir: filename = os.path.join(dir, "test_file" + file_ext) writer = cv2.VideoWriter( filename=filename, fourcc=cv2.VideoWriter_fourcc(*codec), fps=float(30), frameSize=(10, 10), isColor=True, ) [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)] writer.release() if os.path.isfile(filename): return True return False if __name__ == "__main__": mp.set_start_method("spawn", force=True) args = get_parser().parse_args() setup_logger(name="fvcore") logger = setup_logger() logger.info("Arguments: " + str(args)) cfg = setup_cfg(args) demo = VisualizationDemo(cfg) if args.input: if len(args.input) == 1: args.input = glob.glob(os.path.expanduser(args.input[0])) assert args.input, "The input path(s) was not found" for path in tqdm.tqdm(args.input, disable=not args.output): # use PIL, to be consistent with evaluation img = read_image(path, format="BGR") start_time = time.time() predictions, visualized_output = demo.run_on_image(img, args.confidence_threshold) logger.info( "{}: {} in {:.2f}s".format( path, "detected {} instances".format(len(predictions["instances"])) if "instances" in predictions else "finished", time.time() - start_time, ) ) if args.output: if os.path.isdir(args.output): assert os.path.isdir(args.output), args.output out_filename = os.path.join(args.output, os.path.basename(path)) else: #assert len(args.input) == 1, "Please specify a directory with args.output" 
os.makedirs(args.output) out_filename = os.path.join(args.output, os.path.basename(path)) #out_filename = args.output visualized_output.save(out_filename) else: cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) if cv2.waitKey(0) == 27: break # esc to quit elif args.webcam: assert args.input is None, "Cannot have both --input and --webcam!" assert args.output is None, "output not yet supported with --webcam!" cam = cv2.VideoCapture(0) for vis in tqdm.tqdm(demo.run_on_video(cam)): cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) cv2.imshow(WINDOW_NAME, vis) if cv2.waitKey(1) == 27: break # esc to quit cam.release() cv2.destroyAllWindows() elif args.video_input: video = cv2.VideoCapture(args.video_input) width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) frames_per_second = video.get(cv2.CAP_PROP_FPS) num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) basename = os.path.basename(args.video_input) codec, file_ext = ( ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4") ) if codec == ".mp4v": warnings.warn("x264 codec not available, switching to mp4v") if args.output: if os.path.isdir(args.output): output_fname = os.path.join(args.output, basename) output_fname = os.path.splitext(output_fname)[0] + file_ext else: output_fname = args.output assert not os.path.isfile(output_fname), output_fname output_file = cv2.VideoWriter( filename=output_fname, # some installation of opencv may not support x264 (due to its license), # you can try other format (e.g. MPEG) fourcc=cv2.VideoWriter_fourcc(*codec), fps=float(frames_per_second), frameSize=(width, height), isColor=True, ) assert os.path.isfile(args.video_input) for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames): if args.output: output_file.write(vis_frame) else: cv2.namedWindow(basename, cv2.WINDOW_NORMAL) cv2.imshow(basename, vis_frame) if cv2.waitKey(1) == 27: break # esc to quit video.release() if args.output: output_file.release() else: cv2.destroyAllWindows() ================================================ FILE: demo/predictor.py ================================================ # Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py import atexit import bisect import multiprocessing as mp from collections import deque import cv2 import torch import numpy as np from detectron2.data import MetadataCatalog from detectron2.engine.defaults import DefaultPredictor from detectron2.utils.video_visualizer import VideoVisualizer from detectron2.utils.visualizer import ColorMode, Visualizer import matplotlib.pyplot as plt class VisualizationDemo(object): def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): """ Args: cfg (CfgNode): instance_mode (ColorMode): parallel (bool): whether to run the model in different processes from visualization. Useful since the visualization logic can be slow. """ self.metadata = MetadataCatalog.get( cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" ) self.cpu_device = torch.device("cpu") self.instance_mode = instance_mode self.parallel = parallel self.cfg_vis = cfg if parallel: num_gpu = torch.cuda.device_count() self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) else: self.predictor = DefaultPredictor(cfg) def run_on_image(self, image, conf_thre): """ Args: image (np.ndarray): an image of shape (H, W, C) (in BGR order). This is the format used by OpenCV. 
Returns: predictions (dict): the output of the model. vis_output (VisImage): the visualized image output. """ vis_output = None predictions = self.predictor(image) # Convert image from OpenCV BGR format to Matplotlib RGB format. image = image[:, :, ::-1] visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode) if "panoptic_seg" in predictions: panoptic_seg, segments_info = predictions["panoptic_seg"] vis_output = visualizer.draw_panoptic_seg_predictions( panoptic_seg.to(self.cpu_device), segments_info ) else: if "sem_seg" in predictions: vis_output = visualizer.draw_sem_seg( predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) ) if "instances" in predictions: instances = predictions["instances"].to(self.cpu_device) instances = instances[instances.scores >= conf_thre] ''' mask = instances.pred_masks.squeeze(1).data.cpu().numpy() for i_m in range(len(mask)): print('mask shape:', mask.shape) print('mask max:', mask.max()) #heatmapshow = cv2.normalize(mask[i], heatmapshow, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U) heatmapshow = cv2.applyColorMap((mask[i_m] * 255).astype(np.uint8), cv2.COLORMAP_JET) cv2.imwrite(str(i_m)+"_heatmap_n.jpg", heatmapshow) ''' ''' print('instances scores:', instances.scores.shape) print('instances scores:', instances.scores) print('instances class:', instances.pred_classes.shape) print('instances boxes:', instances.pred_boxes) print('instances masks:', instances.pred_masks.shape) instances.pred_boxes = None ''' vis_output = visualizer.draw_instance_predictions(predictions=instances) return predictions, vis_output def _frame_from_video(self, video): while video.isOpened(): success, frame = video.read() if success: yield frame else: break def run_on_video(self, video): """ Visualizes predictions on frames of the input video. Args: video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be either a webcam or a video file. Yields: ndarray: BGR visualizations of each video frame. """ video_visualizer = VideoVisualizer(self.metadata, self.instance_mode) def process_predictions(frame, predictions): frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if "panoptic_seg" in predictions: panoptic_seg, segments_info = predictions["panoptic_seg"] vis_frame = video_visualizer.draw_panoptic_seg_predictions( frame, panoptic_seg.to(self.cpu_device), segments_info ) elif "instances" in predictions: predictions = predictions["instances"].to(self.cpu_device) vis_frame = video_visualizer.draw_instance_predictions(frame, predictions) elif "sem_seg" in predictions: vis_frame = video_visualizer.draw_sem_seg( frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) ) # Converts Matplotlib RGB format to OpenCV BGR format vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR) return vis_frame frame_gen = self._frame_from_video(video) if self.parallel: buffer_size = self.predictor.default_buffer_size frame_data = deque() for cnt, frame in enumerate(frame_gen): frame_data.append(frame) self.predictor.put(frame) if cnt >= buffer_size: frame = frame_data.popleft() predictions = self.predictor.get() yield process_predictions(frame, predictions) while len(frame_data): frame = frame_data.popleft() predictions = self.predictor.get() yield process_predictions(frame, predictions) else: for frame in frame_gen: yield process_predictions(frame, self.predictor(frame)) class AsyncPredictor: """ A predictor that runs the model asynchronously, possibly on >1 GPUs. 
Because rendering the visualization takes considerably amount of time, this helps improve throughput a little bit when rendering videos. """ class _StopToken: pass class _PredictWorker(mp.Process): def __init__(self, cfg, task_queue, result_queue): self.cfg = cfg self.task_queue = task_queue self.result_queue = result_queue super().__init__() def run(self): predictor = DefaultPredictor(self.cfg) while True: task = self.task_queue.get() if isinstance(task, AsyncPredictor._StopToken): break idx, data = task result = predictor(data) self.result_queue.put((idx, result)) def __init__(self, cfg, num_gpus: int = 1): """ Args: cfg (CfgNode): num_gpus (int): if 0, will run on CPU """ num_workers = max(num_gpus, 1) self.task_queue = mp.Queue(maxsize=num_workers * 3) self.result_queue = mp.Queue(maxsize=num_workers * 3) self.procs = [] for gpuid in range(max(num_gpus, 1)): cfg = cfg.clone() cfg.defrost() cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" self.procs.append( AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) ) self.put_idx = 0 self.get_idx = 0 self.result_rank = [] self.result_data = [] for p in self.procs: p.start() atexit.register(self.shutdown) def put(self, image): self.put_idx += 1 self.task_queue.put((self.put_idx, image)) def get(self): self.get_idx += 1 # the index needed for this request if len(self.result_rank) and self.result_rank[0] == self.get_idx: res = self.result_data[0] del self.result_data[0], self.result_rank[0] return res while True: # make sure the results are returned in the correct order idx, res = self.result_queue.get() if idx == self.get_idx: return res insert = bisect.bisect(self.result_rank, idx) self.result_rank.insert(insert, idx) self.result_data.insert(insert, res) def __len__(self): return self.put_idx - self.get_idx def __call__(self, image): self.put(image) return self.get() def shutdown(self): for _ in self.procs: self.task_queue.put(AsyncPredictor._StopToken()) @property def default_buffer_size(self): return len(self.procs) * 5 ================================================ FILE: demo_video/README.md ================================================ ## Video Mask2Former Demo We provide a command line tool to run a simple demo of builtin configs. The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 
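Analogously to the image demo, demo_video/demo.py wires the video pipeline together. A minimal Python sketch of that flow is shown below; the frame directory and weights path are placeholders, and `from predictor import ...` assumes the demo_video/ directory is on `sys.path`, as demo_video/demo.py arranges.

```python
# Minimal sketch of the video demo flow in demo_video/demo.py (paths are placeholders).
import os

from torch.cuda.amp import autocast
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.projects.deeplab import add_deeplab_config

from mask2former import add_maskformer2_config
from mask2former_video import add_maskformer2_video_config
from predictor import VisualizationDemo  # demo_video/predictor.py

cfg = get_cfg()
add_deeplab_config(cfg)
add_maskformer2_config(cfg)
add_maskformer2_video_config(cfg)
cfg.merge_from_file("configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml")
cfg.merge_from_list(["MODEL.WEIGHTS", "mfvis_models/your_checkpoint.pth"])  # placeholder weights
cfg.freeze()

demo = VisualizationDemo(cfg)
frame_dir = "path/to/frames"  # placeholder: the JPEG frames of one video clip
frames = [read_image(os.path.join(frame_dir, f), format="BGR")
          for f in sorted(os.listdir(frame_dir))]

with autocast():  # the demo runs inference under mixed precision
    predictions, vis_outputs = demo.run_on_video(frames, 0.5)  # 0.5 = confidence threshold

for idx, vis in enumerate(vis_outputs):
    vis.save(f"{idx}.jpg")
```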
================================================ FILE: demo_video/demo.py ================================================ # Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py import argparse import glob import multiprocessing as mp import os # fmt: off import sys sys.path.insert(1, os.path.join(sys.path[0], '..')) # fmt: on import tempfile import time import warnings import cv2 import numpy as np import tqdm from torch.cuda.amp import autocast from detectron2.config import get_cfg from detectron2.data.detection_utils import read_image from detectron2.projects.deeplab import add_deeplab_config from detectron2.utils.logger import setup_logger from mask2former import add_maskformer2_config from mask2former_video import add_maskformer2_video_config from predictor import VisualizationDemo import imageio # constants WINDOW_NAME = "mask2former video demo" def setup_cfg(args): # load config from file and command-line arguments cfg = get_cfg() add_deeplab_config(cfg) add_maskformer2_config(cfg) add_maskformer2_video_config(cfg) cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() return cfg def get_parser(): parser = argparse.ArgumentParser(description="maskformer2 demo for builtin configs") parser.add_argument( "--config-file", default="configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--video-input", help="Path to video file.") parser.add_argument( "--input", nargs="+", help="A list of space separated input images; " "or a single glob pattern such as 'directory/*.jpg'" "this will be treated as frames of a video", ) parser.add_argument( "--output", help="A file or directory to save output visualizations. 
" "If not given, will show output in an OpenCV window.", ) parser.add_argument( "--save-frames", default=False, help="Save frame level image outputs.", ) parser.add_argument( "--confidence-threshold", type=float, default=0.5, help="Minimum score for instance predictions to be shown", ) parser.add_argument( "--opts", help="Modify config options using the command-line 'KEY VALUE' pairs", default=[], nargs=argparse.REMAINDER, ) return parser def test_opencv_video_format(codec, file_ext): with tempfile.TemporaryDirectory(prefix="video_format_test") as dir: filename = os.path.join(dir, "test_file" + file_ext) writer = cv2.VideoWriter( filename=filename, fourcc=cv2.VideoWriter_fourcc(*codec), fps=float(30), frameSize=(10, 10), isColor=True, ) [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)] writer.release() if os.path.isfile(filename): return True return False if __name__ == "__main__": mp.set_start_method("spawn", force=True) args = get_parser().parse_args() setup_logger(name="fvcore") logger = setup_logger() logger.info("Arguments: " + str(args)) cfg = setup_cfg(args) demo = VisualizationDemo(cfg) if args.output: os.makedirs(args.output, exist_ok=True) if args.input: # if len(args.input) == 1: # args.input = glob.glob(os.path.expanduser(args.input[0])) # assert args.input, "The input path(s) was not found" print('args input:', args.input) args.input = args.input[0] for file_name in os.listdir(args.input): input_path_list = sorted([args.input + file_name + '/' + f for f in os.listdir(args.input + file_name)]) print('input path list:', input_path_list) if len(input_path_list) == 0: continue vid_frames = [] for path in input_path_list: img = read_image(path, format="BGR") vid_frames.append(img) start_time = time.time() with autocast(): predictions, visualized_output = demo.run_on_video(vid_frames, args.confidence_threshold) logger.info( "detected {} instances per frame in {:.2f}s".format( len(predictions["pred_scores"]), time.time() - start_time ) ) if args.output: if args.save_frames: if args.output: os.makedirs(args.output + file_name, exist_ok=True) print('save frames') for path, _vis_output in zip(input_path_list, visualized_output): out_filename = os.path.join(args.output, file_name, os.path.basename(path)) _vis_output.save(out_filename) H, W = visualized_output[0].height, visualized_output[0].width images = [] for _vis_output in visualized_output: frame = _vis_output.get_image()#[:, :, ::-1] images.append(frame) imageio.mimsave(args.output + file_name + ".gif", images, fps=5) ''' cap = cv2.VideoCapture(-1) fourcc = cv2.VideoWriter_fourcc(*"mp4v") out = cv2.VideoWriter(os.path.join(args.output, "visualization.mp4"), fourcc, 10.0, (W, H), True) for _vis_output in visualized_output: frame = _vis_output.get_image()[:, :, ::-1] out.write(frame) cap.release() out.release() ''' elif args.video_input: video = cv2.VideoCapture(args.video_input) vid_frames = [] while video.isOpened(): success, frame = video.read() if success: vid_frames.append(frame) else: break start_time = time.time() with autocast(): predictions, visualized_output = demo.run_on_video(vid_frames) logger.info( "detected {} instances per frame in {:.2f}s".format( len(predictions["pred_scores"]), time.time() - start_time ) ) if args.output: if args.save_frames: for idx, _vis_output in enumerate(visualized_output): out_filename = os.path.join(args.output, f"{idx}.jpg") _vis_output.save(out_filename) H, W = visualized_output[0].height, visualized_output[0].width cap = cv2.VideoCapture(-1) fourcc = 
cv2.VideoWriter_fourcc(*"mp4v") out = cv2.VideoWriter(os.path.join(args.output, "visualization.mp4"), fourcc, 10.0, (W, H), True) for _vis_output in visualized_output: frame = _vis_output.get_image()[:, :, ::-1] out.write(frame) cap.release() out.release() ================================================ FILE: demo_video/predictor.py ================================================ # reference: https://github.com/sukjunhwang/IFC/blob/master/projects/IFC/demo/predictor.py import atexit import bisect import multiprocessing as mp from collections import deque import cv2 import torch from visualizer import TrackVisualizer from detectron2.data import MetadataCatalog from detectron2.engine.defaults import DefaultPredictor from detectron2.structures import Instances from detectron2.utils.video_visualizer import VideoVisualizer from detectron2.utils.visualizer import ColorMode class VisualizationDemo(object): def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): """ Args: cfg (CfgNode): instance_mode (ColorMode): parallel (bool): whether to run the model in different processes from visualization. Useful since the visualization logic can be slow. """ self.metadata = MetadataCatalog.get( cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" ) self.cpu_device = torch.device("cpu") self.instance_mode = instance_mode self.parallel = parallel if parallel: num_gpu = torch.cuda.device_count() self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) else: self.predictor = VideoPredictor(cfg) def run_on_video(self, frames, conf_thre): """ Args: frames (List[np.ndarray]): a list of images of shape (H, W, C) (in BGR order). This is the format used by OpenCV. Returns: predictions (dict): the output of the model. vis_output (VisImage): the visualized image output. """ vis_output = None predictions = self.predictor(frames) image_size = predictions["image_size"] pred_scores = predictions["pred_scores"] pred_labels = predictions["pred_labels"] pred_masks = predictions["pred_masks"] remain_index = [ii for ii in range(len(pred_scores)) if pred_scores[ii] >= conf_thre ] pred_scores = [pred_scores[ind] for ind in remain_index] pred_labels = [pred_labels[ind] for ind in remain_index] pred_masks = [pred_masks[ind] for ind in remain_index] frame_masks = list(zip(*pred_masks)) total_vis_output = [] for frame_idx in range(len(frames)): frame = frames[frame_idx][:, :, ::-1] visualizer = TrackVisualizer(frame, self.metadata, instance_mode=self.instance_mode) ins = Instances(image_size) if len(pred_scores) > 0: print('pred scores:', pred_scores) ins.scores = pred_scores ins.pred_classes = pred_labels ins.pred_masks = torch.stack(frame_masks[frame_idx], dim=0) vis_output = visualizer.draw_instance_predictions(predictions=ins) total_vis_output.append(vis_output) return predictions, total_vis_output class VideoPredictor(DefaultPredictor): """ Create a simple end-to-end predictor with the given config that runs on single device for a single input image. Compared to using the model directly, this class does the following additions: 1. Load checkpoint from `cfg.MODEL.WEIGHTS`. 2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`. 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`. 4. Take one input image and produce a single output, instead of a batch. If you'd like to do anything more fancy, please refer to its source code as examples to build and use the model manually. 
Attributes: metadata (Metadata): the metadata of the underlying dataset, obtained from cfg.DATASETS.TEST. Examples: :: pred = DefaultPredictor(cfg) inputs = cv2.imread("input.jpg") outputs = pred(inputs) """ def __call__(self, frames): """ Args: original_image (np.ndarray): an image of shape (H, W, C) (in BGR order). Returns: predictions (dict): the output of the model for one image only. See :doc:`/tutorials/models` for details about the format. """ with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 input_frames = [] for original_image in frames: # Apply pre-processing to image. if self.input_format == "RGB": # whether the model expects BGR inputs or RGB original_image = original_image[:, :, ::-1] height, width = original_image.shape[:2] image = self.aug.get_transform(original_image).apply_image(original_image) image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) input_frames.append(image) inputs = {"image": input_frames, "height": height, "width": width} predictions = self.model([inputs]) return predictions class AsyncPredictor: """ A predictor that runs the model asynchronously, possibly on >1 GPUs. Because rendering the visualization takes considerably amount of time, this helps improve throughput when rendering videos. """ class _StopToken: pass class _PredictWorker(mp.Process): def __init__(self, cfg, task_queue, result_queue): self.cfg = cfg self.task_queue = task_queue self.result_queue = result_queue super().__init__() def run(self): predictor = VideoPredictor(self.cfg) while True: task = self.task_queue.get() if isinstance(task, AsyncPredictor._StopToken): break idx, data = task result = predictor(data) self.result_queue.put((idx, result)) def __init__(self, cfg, num_gpus: int = 1): """ Args: cfg (CfgNode): num_gpus (int): if 0, will run on CPU """ num_workers = max(num_gpus, 1) self.task_queue = mp.Queue(maxsize=num_workers * 3) self.result_queue = mp.Queue(maxsize=num_workers * 3) self.procs = [] for gpuid in range(max(num_gpus, 1)): cfg = cfg.clone() cfg.defrost() cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" self.procs.append( AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) ) self.put_idx = 0 self.get_idx = 0 self.result_rank = [] self.result_data = [] for p in self.procs: p.start() atexit.register(self.shutdown) def put(self, image): self.put_idx += 1 self.task_queue.put((self.put_idx, image)) def get(self): self.get_idx += 1 # the index needed for this request if len(self.result_rank) and self.result_rank[0] == self.get_idx: res = self.result_data[0] del self.result_data[0], self.result_rank[0] return res while True: # make sure the results are returned in the correct order idx, res = self.result_queue.get() if idx == self.get_idx: return res insert = bisect.bisect(self.result_rank, idx) self.result_rank.insert(insert, idx) self.result_data.insert(insert, res) def __len__(self): return self.put_idx - self.get_idx def __call__(self, image): self.put(image) return self.get() def shutdown(self): for _ in self.procs: self.task_queue.put(AsyncPredictor._StopToken()) @property def default_buffer_size(self): return len(self.procs) * 5 ================================================ FILE: demo_video/visualizer.py ================================================ # reference: https://github.com/sukjunhwang/IFC/blob/master/projects/IFC/demo/visualizer.py import torch import numpy as np import matplotlib.colors as mplc from detectron2.utils.visualizer import ColorMode, GenericMask, Visualizer, 
_create_text_labels _ID_JITTERS = [[0.9047944201469568, 0.3241718265806123, 0.33443746665210006], [0.4590171386127151, 0.9095038146383864, 0.3143840671974788], [0.4769356899795538, 0.5044406738441948, 0.5354530846360839], [0.00820945625670777, 0.24099210193126785, 0.15471834055332978], [0.6195684374237388, 0.4020380013509799, 0.26100266066404676], [0.08281237756545068, 0.05900744492710419, 0.06106221202154216], [0.2264886829978755, 0.04925271007292076, 0.10214429345996079], [0.1888247470009874, 0.11275000298612425, 0.46112894830685514], [0.37415767691880975, 0.844284596118331, 0.950471611180866], [0.3817344218157631, 0.3483259270707101, 0.6572989333690541], [0.2403115731054466, 0.03078280287279167, 0.5385975692534737], [0.7035076951650824, 0.12352084932325424, 0.12873080308790197], [0.12607434914489934, 0.111244793010015, 0.09333334699716023], [0.6551607300342269, 0.7003064103554443, 0.4131794512286162], [0.13592107365596595, 0.5390702818232149, 0.004540643174930525], [0.38286244894454347, 0.709142545393449, 0.529074791609835], [0.4279376583651734, 0.5634708596431771, 0.8505569717104301], [0.3460488523902999, 0.464769595519293, 0.6676839675477276], [0.8544063246675081, 0.5041190233407755, 0.9081217697141578], [0.9207009090747208, 0.2403865944739051, 0.05375410999863772], [0.6515786136947107, 0.6299918449948327, 0.45292029442034387], [0.986174217295693, 0.2424849846977214, 0.3981993323108266], [0.22101915872994693, 0.3408589198278038, 0.006381420347677524], [0.3159785813515982, 0.1145748921741011, 0.595754317197274], [0.10263421488052715, 0.5864139253490858, 0.23908000741142432], [0.8272999391532938, 0.6123527260897751, 0.3365197327803193], [0.5269583712937912, 0.25668929554516506, 0.7888411215078127], [0.2433880265410031, 0.7240751234287827, 0.8483215810528648], [0.7254601709704898, 0.8316525547295984, 0.9325253855921963], [0.5574483824856672, 0.2935331727879944, 0.6594839453793155], [0.6209642371433579, 0.054030693198821256, 0.5080873988178534], [0.9055507077365624, 0.12865888619203514, 0.9309191861440005], [0.9914469722960537, 0.3074114506206205, 0.8762107657323488], [0.4812682518247371, 0.15055826298548158, 0.9656340505308308], [0.6459219454316445, 0.9144794010251625, 0.751338812155106], [0.860840174209798, 0.8844626353077639, 0.3604624506769899], [0.8194991672032272, 0.926399617787601, 0.8059222327343247], [0.6540413175393658, 0.04579445254618297, 0.26891917826531275], [0.37778835833987046, 0.36247927666109536, 0.7989799305827889], [0.22738304978177726, 0.9038018263773739, 0.6970838854138303], [0.6362015495896184, 0.527680794236961, 0.5570915425178721], [0.6436401915860954, 0.6316925317144524, 0.9137151236993912], [0.04161828388587163, 0.3832413349082706, 0.6880829921949752], [0.7768167825719299, 0.8933821497682587, 0.7221278391266809], [0.8632760876301346, 0.3278628094906323, 0.8421587587114462], [0.8556499133262127, 0.6497385872901932, 0.5436895688477963], [0.9861940318610894, 0.03562313777386272, 0.9183454677106616], [0.8042586091176366, 0.6167222703170994, 0.24181981557207644], [0.9504247117633057, 0.3454233714011461, 0.6883727005547743], [0.9611909135491202, 0.46384154263898114, 0.32700443315058914], [0.523542176970206, 0.446222414615845, 0.9067402987747814], [0.7536954008682911, 0.6675512338797588, 0.22538238957839196], [0.1554052265688285, 0.05746097492966129, 0.8580358872587424], [0.8540838640971405, 0.9165504335482566, 0.6806982829158964], [0.7065090319405029, 0.8683059983962002, 0.05167128320624026], [0.39134812961899124, 0.8910075505622979, 0.7639815712623922], 
[0.1578117311479783, 0.20047326898284668, 0.9220177338840568], [0.2017488993096358, 0.6949259970936679, 0.8729196864798128], [0.5591089340651949, 0.15576770423813258, 0.1469857469387812], [0.14510398622626974, 0.24451497734532168, 0.46574271993578786], [0.13286397822351492, 0.4178244533944635, 0.03728728952131943], [0.556463206310225, 0.14027595183361663, 0.2731537988657907], [0.4093837966398032, 0.8015225687789814, 0.8033567296903834], [0.527442563956637, 0.902232617214431, 0.7066626674362227], [0.9058355503297827, 0.34983989180213004, 0.8353262183839384], [0.7108382186953104, 0.08591307895133471, 0.21434688012521974], [0.22757345065207668, 0.7943075496583976, 0.2992305547627421], [0.20454109788173636, 0.8251670332103687, 0.012981987094547232], [0.7672562637297392, 0.005429019973062554, 0.022163616037108702], [0.37487345910117564, 0.5086240194440863, 0.9061216063654387], [0.9878004014101087, 0.006345852772772331, 0.17499753379350858], [0.030061528704491303, 0.1409704315546606, 0.3337131835834506], [0.5022506782611504, 0.5448435505388706, 0.40584238936140726], [0.39560774627423445, 0.8905943695833262, 0.5850815030921116], [0.058615671926786406, 0.5365713844300387, 0.1620457551256279], [0.41843842882069693, 0.1536005983609976, 0.3127878501592438], [0.05947621790155899, 0.5412421167331932, 0.2611322146455659], [0.5196159938235607, 0.7066461551682705, 0.970261497412556], [0.30443031606149007, 0.45158581060034975, 0.4331841153149706], [0.8848298403933996, 0.7241791700943656, 0.8917110054596072], [0.5720260591898779, 0.3072801598203052, 0.8891066705989902], [0.13964015336177327, 0.2531778096760302, 0.5703756837403124], [0.2156307542329836, 0.4139947500641685, 0.87051676884144], [0.10800455881891169, 0.05554646035458266, 0.2947027428551443], [0.35198009410633857, 0.365849666213808, 0.06525787683513773], [0.5223264108118847, 0.9032195574351178, 0.28579084943315025], [0.7607724246546966, 0.3087194381828555, 0.6253235528354899], [0.5060485442077824, 0.19173600467625274, 0.9931175692203702], [0.5131805830323746, 0.07719515392040577, 0.923212006754969], [0.3629762141280106, 0.02429179642710888, 0.6963754952399983], [0.7542592485456767, 0.6478893299494212, 0.3424965345400731], [0.49944574453364454, 0.6775665366832825, 0.33758796076989583], [0.010621818120767679, 0.8221571611173205, 0.5186257457566332], [0.5857910304290109, 0.7178133992025467, 0.9729243483606071], [0.16987399482717613, 0.9942570210657463, 0.18120758122552927], [0.016362572521240848, 0.17582788603087263, 0.7255176922640298], [0.10981764283706419, 0.9078582203470377, 0.7638063718334003], [0.9252097840441119, 0.3330197086990039, 0.27888705301420136], [0.12769972651171546, 0.11121470804891687, 0.12710743734391716], [0.5753520518360334, 0.2763862879599456, 0.6115636613363361]] _OFF_WHITE = (1.0, 1.0, 240.0 / 255) class TrackVisualizer(Visualizer): def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE): super().__init__( img_rgb, metadata=metadata, scale=scale, instance_mode=instance_mode ) self.cpu_device = torch.device("cpu") def _jitter(self, color, id): """ Randomly modifies given color to produce a slightly different color than the color given. Args: color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color picked. The values in the list are in the [0.0, 1.0] range. Returns: jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color after being jittered. The values in the list are in the [0.0, 1.0] range. 
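Note: the jitter vector is looked up from the fixed ``_ID_JITTERS`` table by instance id, so a given id is always shifted by the same offset and keeps a consistent color from frame to frame.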
""" color = mplc.to_rgb(color) vec = _ID_JITTERS[id] # better to do it in another color space vec = vec / np.linalg.norm(vec) * 0.5 res = np.clip(vec + color, 0, 1) return tuple(res) def overlay_instances( self, *, boxes=None, labels=None, masks=None, keypoints=None, assigned_colors=None, alpha=0.5 ): """ Args: boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`, or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image, or a :class:`RotatedBoxes`, or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format for the N objects in a single image, labels (list[str]): the text to be displayed for each instance. masks (masks-like object): Supported types are: * :class:`detectron2.structures.PolygonMasks`, :class:`detectron2.structures.BitMasks`. * list[list[ndarray]]: contains the segmentation masks for all objects in one image. The first level of the list corresponds to individual instances. The second level to all the polygon that compose the instance, and the third level to the polygon coordinates. The third level should have the format of [x0, y0, x1, y1, ..., xn, yn] (n >= 3). * list[ndarray]: each ndarray is a binary mask of shape (H, W). * list[dict]: each dict is a COCO-style RLE. keypoints (Keypoint or array like): an array-like object of shape (N, K, 3), where the N is the number of instances and K is the number of keypoints. The last dimension corresponds to (x, y, visibility or score). assigned_colors (list[matplotlib.colors]): a list of colors, where each color corresponds to each mask or box in the image. Refer to 'matplotlib.colors' for full list of formats that the colors are accepted in. Returns: output (VisImage): image object with visualizations. """ num_instances = 0 if boxes is not None: boxes = self._convert_boxes(boxes) num_instances = len(boxes) if masks is not None: # print('masks:', masks) #masks = self._convert_masks(masks) if num_instances: assert len(masks) == num_instances else: num_instances = len(masks) if keypoints is not None: if num_instances: assert len(keypoints) == num_instances else: num_instances = len(keypoints) keypoints = self._convert_keypoints(keypoints) if labels is not None: assert len(labels) == num_instances if assigned_colors is None: assigned_colors = [random_color(ii, rgb=True, maximum=1) for ii in range(num_instances)] if num_instances == 0: return self.output if boxes is not None and boxes.shape[1] == 5: return self.overlay_rotated_instances( boxes=boxes, labels=labels, assigned_colors=assigned_colors ) # Display in largest to smallest order to reduce occlusion. areas = None if boxes is not None: areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1) elif masks is not None: areas = np.asarray([x.sum() for x in masks]) if areas is not None: sorted_idxs = np.argsort(-areas).tolist() # Re-order overlapped instances in descending order. 
boxes = boxes[sorted_idxs] if boxes is not None else None labels = [labels[k] for k in sorted_idxs] if labels is not None else None masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] keypoints = keypoints[sorted_idxs] if keypoints is not None else None for i in range(num_instances): color = assigned_colors[i] # if boxes is not None: # self.draw_box(boxes[i], edge_color=color) if masks is not None: #self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha) binary_mask = masks[i].astype(np.uint8) #alpha = 0.7 #print('binary mask:', binary_mask) self.draw_binary_mask( binary_mask, color=color, edge_color=None, # _OFF_WHITE alpha=alpha, ) if False: # if labels is not None: # first get a box if boxes is not None: x0, y0, x1, y1 = boxes[i] text_pos = (x0, y0) # if drawing boxes, put text on the box corner. horiz_align = "left" elif masks is not None: # skip small mask without polygon if len(masks[i].polygons) == 0: continue x0, y0, x1, y1 = masks[i].bbox() # draw text in the center (defined by median) when box is not drawn # median is less sensitive to outliers. text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1] horiz_align = "center" else: continue # drawing the box confidence for keypoints isn't very useful. # for small objects, draw text at the side to avoid occlusion instance_area = (y1 - y0) * (x1 - x0) if ( instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale or y1 - y0 < 40 * self.output.scale ): if y1 >= self.output.height - 5: text_pos = (x1, y0) else: text_pos = (x0, y1) height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width) lighter_color = self._change_color_brightness(color, brightness_factor=0.7) font_size = ( np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size ) # self.draw_text( # labels[i], # text_pos, # color=lighter_color, # horizontal_alignment=horiz_align, # font_size=font_size, # ) # draw keypoints if keypoints is not None: for keypoints_per_instance in keypoints: self.draw_and_connect_keypoints(keypoints_per_instance) return self.output def draw_instance_predictions(self, predictions): """ Draw instance-level prediction results on an image. Args: predictions (Instances): the output of an instance detection/segmentation model. Following fields will be used to draw: "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). Returns: output (VisImage): image object with visualizations. 
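Note: each label below is prefixed with its instance index (e.g. ``[0] person 97%``), and colors are derived per instance id via ``_jitter``, so the same tracked object keeps the same index and color in every frame of the clip.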
""" preds = predictions.to(self.cpu_device) boxes = preds.pred_boxes if preds.has("pred_boxes") else None scores = preds.scores if preds.has("scores") else None classes = preds.pred_classes if preds.has("pred_classes") else None labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None)) if labels is not None: labels = ["[{}] ".format(_id) + l for _id, l in enumerate(labels)] if preds.has("pred_masks"): masks = np.asarray(preds.pred_masks) print('enter here==========') # masks = [GenericMask(x, self.output.height, self.output.width) for x in masks] else: masks = None if classes is None: return self.output colors = [ self._jitter([x / 255 for x in self.metadata.thing_colors[c]], id) for id, c in enumerate(classes) ] alpha = 0.5 if self._instance_mode == ColorMode.IMAGE_BW: self.output.img = self._create_grayscale_image( (preds.pred_masks.any(dim=0) > 0).numpy() if preds.has("pred_masks") else None ) alpha = 0.3 self.overlay_instances( masks=masks, boxes=boxes, labels=labels, assigned_colors=colors, alpha=alpha, ) return self.output ================================================ FILE: mask2former/__init__.py ================================================ from . import data # register all new datasets from . import modeling # config from .config import add_maskformer2_config # dataset loading from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( MaskFormerInstanceDatasetMapper, ) from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( MaskFormerPanopticDatasetMapper, ) from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( MaskFormerSemanticDatasetMapper, ) # models from .maskformer_model import MaskFormer from .test_time_augmentation import SemanticSegmentorWithTTA # evaluation from .evaluation.instance_evaluation import InstanceSegEvaluator ================================================ FILE: mask2former/config.py ================================================ # -*- coding: utf-8 -*- from detectron2.config import CfgNode as CN def add_maskformer2_config(cfg): """ Add config for MASK_FORMER. """ # NOTE: configs from original maskformer # data config # select the dataset mapper cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" # Color augmentation cfg.INPUT.COLOR_AUG_SSD = False # We retry random cropping until no single category in semantic segmentation GT occupies more # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 # Pad image and segmentation GT in dataset mapper. 
cfg.INPUT.SIZE_DIVISIBILITY = -1 # solver config # weight decay on embedding cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 # optimizer cfg.SOLVER.OPTIMIZER = "ADAMW" cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 # mask_former model config cfg.MODEL.MASK_FORMER = CN() # loss cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 # transformer config cfg.MODEL.MASK_FORMER.NHEADS = 8 cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 cfg.MODEL.MASK_FORMER.PRE_NORM = False cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False # mask_former inference config cfg.MODEL.MASK_FORMER.TEST = CN() cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) # you can use this config to override cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 # pixel decoder config cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 # adding transformer in pixel decoder cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 # pixel decoder cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" # swin transformer backbone cfg.MODEL.SWIN = CN() cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 cfg.MODEL.SWIN.PATCH_SIZE = 4 cfg.MODEL.SWIN.EMBED_DIM = 96 cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] cfg.MODEL.SWIN.WINDOW_SIZE = 7 cfg.MODEL.SWIN.MLP_RATIO = 4.0 cfg.MODEL.SWIN.QKV_BIAS = True cfg.MODEL.SWIN.QK_SCALE = None cfg.MODEL.SWIN.DROP_RATE = 0.0 cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 cfg.MODEL.SWIN.APE = False cfg.MODEL.SWIN.PATCH_NORM = True cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] cfg.MODEL.SWIN.USE_CHECKPOINT = False # NOTE: maskformer2 extra configs # transformer module cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" # LSJ aug cfg.INPUT.IMAGE_SIZE = 1024 cfg.INPUT.MIN_SCALE = 0.1 cfg.INPUT.MAX_SCALE = 2.0 # MSDeformAttn encoder configs cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 # point loss configs # Number of points sampled during training for a mask point head. cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the # original paper. cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in # the original paper. cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 ================================================ FILE: mask2former/data/__init__.py ================================================ from . 
import datasets ================================================ FILE: mask2former/data/dataset_mappers/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: mask2former/data/dataset_mappers/__init__.py.new ================================================ ================================================ FILE: mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py import copy import logging import numpy as np import torch from detectron2.config import configurable from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from detectron2.data.transforms import TransformGen from detectron2.structures import BitMasks, Instances from pycocotools import mask as coco_mask __all__ = ["COCOInstanceNewBaselineDatasetMapper"] def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: """ Compute the bounding boxes around the provided masks. Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. Args: masks (Tensor[N, H, W]): masks to transform where N is the number of masks and (H, W) are the spatial dimensions. Returns: Tensor[N, 4]: bounding boxes """ if masks.numel() == 0: return masks n = masks.shape[0] for index, mask in enumerate(masks): y, x = torch.where(mask != 0) if len(x) * len(y) == 0: continue h = torch.max(y) - torch.min(y) w = torch.max(x) - torch.min(x) masks[index, torch.min(y):torch.max(y), torch.min(x):torch.max(x)] = 1.0 return masks def convert_coco_poly_to_mask(segmentations, height, width): masks = [] for polygons in segmentations: rles = coco_mask.frPyObjects(polygons, height, width) mask = coco_mask.decode(rles) if len(mask.shape) < 3: mask = mask[..., None] mask = torch.as_tensor(mask, dtype=torch.uint8) mask = mask.any(dim=2) masks.append(mask) if masks: masks = torch.stack(masks, dim=0) masks = masks_to_boxes(masks) else: masks = torch.zeros((0, height, width), dtype=torch.uint8) return masks def build_transform_gen(cfg, is_train): """ Create a list of default :class:`Augmentation` from config. Now it includes resizing and flipping. Returns: list[Augmentation] """ assert is_train, "Only support training augmentation" image_size = cfg.INPUT.IMAGE_SIZE min_scale = cfg.INPUT.MIN_SCALE max_scale = cfg.INPUT.MAX_SCALE augmentation = [] if cfg.INPUT.RANDOM_FLIP != "none": augmentation.append( T.RandomFlip( horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", vertical=cfg.INPUT.RANDOM_FLIP == "vertical", ) ) augmentation.extend([ T.ResizeScale( min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size ), T.FixedSizeCrop(crop_size=(image_size, image_size)), ]) return augmentation # This is specifically designed for the COCO dataset. class COCOInstanceNewBaselineDatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by MaskFormer. This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. 
Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, tfm_gens, image_format, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply tfm_gens: data augmentation image_format: an image format supported by :func:`detection_utils.read_image`. """ self.tfm_gens = tfm_gens logging.getLogger(__name__).info( "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens)) ) self.img_format = image_format self.is_train = is_train @classmethod def from_config(cls, cfg, is_train=True): # Build augmentation tfm_gens = build_transform_gen(cfg, is_train) ret = { "is_train": is_train, "tfm_gens": tfm_gens, "image_format": cfg.INPUT.FORMAT, } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) # TODO: get padding mask # by feeding a "segmentation mask" to the same transforms padding_mask = np.ones(image.shape[:2]) image, transforms = T.apply_transform_gens(self.tfm_gens, image) # the crop transformation has default padding value 0 for segmentation padding_mask = transforms.apply_segmentation(padding_mask) padding_mask = ~ padding_mask.astype(bool) image_shape = image.shape[:2] # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask)) if not self.is_train: # USER: Modify this if you want to keep them for some reason. dataset_dict.pop("annotations", None) return dataset_dict if "annotations" in dataset_dict: # USER: Modify this if you want to keep them for some reason. for anno in dataset_dict["annotations"]: # Let's always keep mask # if not self.mask_on: # anno.pop("segmentation", None) anno.pop("keypoints", None) # USER: Implement additional transformations if you have other types of data annos = [ utils.transform_instance_annotations(obj, transforms, image_shape) for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0 ] # NOTE: does not support BitMask due to augmentation # Current BitMask cannot handle empty objects instances = utils.annotations_to_instances(annos, image_shape) # After transforms such as cropping are applied, the bounding box may no longer # tightly bound the object. As an example, imagine a triangle object # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to # the intersection of original bounding box and the cropping box. 
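# For that reason the boxes are regenerated from the transformed masks below,
# which yields tight boxes after cropping/resizing instead of transforming the
# original annotation boxes.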
instances.gt_boxes = instances.gt_masks.get_bounding_boxes() # Need to filter empty instances first (due to augmentation) instances = utils.filter_empty_instances(instances) # Generate masks from polygon h, w = instances.image_size # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float) if hasattr(instances, 'gt_masks'): gt_masks = instances.gt_masks gt_masks_box = convert_coco_poly_to_mask(gt_masks.polygons, h, w) instances.gt_masks = gt_masks_box dataset_dict["instances"] = instances return dataset_dict ================================================ FILE: mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py import copy import logging import numpy as np import torch from detectron2.config import configurable from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from detectron2.data.transforms import TransformGen from detectron2.structures import BitMasks, Boxes, Instances __all__ = ["COCOPanopticNewBaselineDatasetMapper"] def build_transform_gen(cfg, is_train): """ Create a list of default :class:`Augmentation` from config. Now it includes resizing and flipping. Returns: list[Augmentation] """ assert is_train, "Only support training augmentation" image_size = cfg.INPUT.IMAGE_SIZE min_scale = cfg.INPUT.MIN_SCALE max_scale = cfg.INPUT.MAX_SCALE augmentation = [] if cfg.INPUT.RANDOM_FLIP != "none": augmentation.append( T.RandomFlip( horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", vertical=cfg.INPUT.RANDOM_FLIP == "vertical", ) ) augmentation.extend([ T.ResizeScale( min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size ), T.FixedSizeCrop(crop_size=(image_size, image_size)), ]) return augmentation # This is specifically designed for the COCO dataset. class COCOPanopticNewBaselineDatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by MaskFormer. This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, tfm_gens, image_format, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply crop_gen: crop augmentation tfm_gens: data augmentation image_format: an image format supported by :func:`detection_utils.read_image`. """ self.tfm_gens = tfm_gens logging.getLogger(__name__).info( "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( str(self.tfm_gens) ) ) self.img_format = image_format self.is_train = is_train @classmethod def from_config(cls, cfg, is_train=True): # Build augmentation tfm_gens = build_transform_gen(cfg, is_train) ret = { "is_train": is_train, "tfm_gens": tfm_gens, "image_format": cfg.INPUT.FORMAT, } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
Returns: dict: a format that builtin models in detectron2 accept """ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) image, transforms = T.apply_transform_gens(self.tfm_gens, image) image_shape = image.shape[:2] # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if not self.is_train: # USER: Modify this if you want to keep them for some reason. dataset_dict.pop("annotations", None) return dataset_dict if "pan_seg_file_name" in dataset_dict: pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") segments_info = dataset_dict["segments_info"] # apply the same transformation to panoptic segmentation pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) from panopticapi.utils import rgb2id pan_seg_gt = rgb2id(pan_seg_gt) instances = Instances(image_shape) classes = [] masks = [] for segment_info in segments_info: class_id = segment_info["category_id"] if not segment_info["iscrowd"]: classes.append(class_id) masks.append(pan_seg_gt == segment_info["id"]) classes = np.array(classes) instances.gt_classes = torch.tensor(classes, dtype=torch.int64) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) instances.gt_boxes = Boxes(torch.zeros((0, 4))) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor instances.gt_boxes = masks.get_bounding_boxes() dataset_dict["instances"] = instances return dataset_dict ================================================ FILE: mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py ================================================ import copy import logging import numpy as np import pycocotools.mask as mask_util import torch from torch.nn import functional as F from detectron2.config import configurable from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from detectron2.projects.point_rend import ColorAugSSDTransform from detectron2.structures import BitMasks, Instances, polygons_to_bitmask __all__ = ["MaskFormerInstanceDatasetMapper"] class MaskFormerInstanceDatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by MaskFormer for instance segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, augmentations, image_format, size_divisibility, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. 
size_divisibility: pad image size to be divisible by this value """ self.is_train = is_train self.tfm_gens = augmentations self.img_format = image_format self.size_divisibility = size_divisibility logger = logging.getLogger(__name__) mode = "training" if is_train else "inference" logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") @classmethod def from_config(cls, cfg, is_train=True): # Build augmentation augs = [ T.ResizeShortestEdge( cfg.INPUT.MIN_SIZE_TRAIN, cfg.INPUT.MAX_SIZE_TRAIN, cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, ) ] if cfg.INPUT.CROP.ENABLED: augs.append( T.RandomCrop( cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE, ) ) if cfg.INPUT.COLOR_AUG_SSD: augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) augs.append(T.RandomFlip()) ret = { "is_train": is_train, "augmentations": augs, "image_format": cfg.INPUT.FORMAT, "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) aug_input = T.AugInput(image) aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) image = aug_input.image # transform instnace masks assert "annotations" in dataset_dict for anno in dataset_dict["annotations"]: anno.pop("keypoints", None) annos = [ utils.transform_instance_annotations(obj, transforms, image.shape[:2]) for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0 ] if len(annos): assert "segmentation" in annos[0] segms = [obj["segmentation"] for obj in annos] masks = [] for segm in segms: if isinstance(segm, list): # polygon masks.append(polygons_to_bitmask(segm, *image.shape[:2])) elif isinstance(segm, dict): # COCO RLE masks.append(mask_util.decode(segm)) elif isinstance(segm, np.ndarray): assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( segm.ndim ) # mask array masks.append(segm) else: raise ValueError( "Cannot convert segmentation of type '{}' to BitMasks!" "Supported types are: polygons as list[list[float] or ndarray]," " COCO-style RLE as a dict, or a binary segmentation mask " " in a 2D numpy array of shape HxW.".format(type(segm)) ) # Pad image and segmentation label here! image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks] classes = [int(obj["category_id"]) for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) if self.size_divisibility > 0: image_size = (image.shape[-2], image.shape[-1]) padding_size = [ 0, self.size_divisibility - image_size[1], 0, self.size_divisibility - image_size[0], ] # pad image image = F.pad(image, padding_size, value=128).contiguous() # pad mask masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks] image_shape = (image.shape[-2], image.shape[-1]) # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. 
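# At this point `image` is a (C, H, W) tensor and `masks` holds one (H, W)
# binary tensor per kept instance; they are stacked into BitMasks below when
# any masks exist, while pixel normalization is typically left to the model's
# own preprocessing step.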
dataset_dict["image"] = image # Prepare per-category binary masks instances = Instances(image_shape) instances.gt_classes = classes if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1])) else: masks = BitMasks(torch.stack(masks)) instances.gt_masks = masks.tensor dataset_dict["instances"] = instances return dataset_dict ================================================ FILE: mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py ================================================ import copy import logging import numpy as np import torch from torch.nn import functional as F from detectron2.config import configurable from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from detectron2.structures import BitMasks, Instances from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper __all__ = ["MaskFormerPanopticDatasetMapper"] class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by MaskFormer for panoptic segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, augmentations, image_format, ignore_label, size_divisibility, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. ignore_label: the label that is ignored to evaluation size_divisibility: pad image size to be divisible by this value """ super().__init__( is_train, augmentations=augmentations, image_format=image_format, ignore_label=ignore_label, size_divisibility=size_divisibility, ) def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) # semantic segmentation if "sem_seg_file_name" in dataset_dict: # PyTorch transformation not implemented for uint16, so converting it to double first sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") else: sem_seg_gt = None # panoptic segmentation if "pan_seg_file_name" in dataset_dict: pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") segments_info = dataset_dict["segments_info"] else: pan_seg_gt = None segments_info = None if pan_seg_gt is None: raise ValueError( "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( dataset_dict["file_name"] ) ) aug_input = T.AugInput(image, sem_seg=sem_seg_gt) aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) image = aug_input.image if sem_seg_gt is not None: sem_seg_gt = aug_input.sem_seg # apply the same transformation to panoptic segmentation pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) from panopticapi.utils import rgb2id pan_seg_gt = rgb2id(pan_seg_gt) # Pad image and segmentation label here! image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if sem_seg_gt is not None: sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) if self.size_divisibility > 0: image_size = (image.shape[-2], image.shape[-1]) padding_size = [ 0, self.size_divisibility - image_size[1], 0, self.size_divisibility - image_size[0], ] image = F.pad(image, padding_size, value=128).contiguous() if sem_seg_gt is not None: sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() pan_seg_gt = F.pad( pan_seg_gt, padding_size, value=0 ).contiguous() # 0 is the VOID panoptic label image_shape = (image.shape[-2], image.shape[-1]) # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. 
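# Below, the segments_info loop skips "iscrowd" segments and builds one
# boolean mask per remaining segment by comparing the decoded panoptic id map
# against the segment id (pan_seg_gt == segment_info["id"]).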
dataset_dict["image"] = image if sem_seg_gt is not None: dataset_dict["sem_seg"] = sem_seg_gt.long() if "annotations" in dataset_dict: raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") # Prepare per-category binary masks pan_seg_gt = pan_seg_gt.numpy() instances = Instances(image_shape) classes = [] masks = [] for segment_info in segments_info: class_id = segment_info["category_id"] if not segment_info["iscrowd"]: classes.append(class_id) masks.append(pan_seg_gt == segment_info["id"]) classes = np.array(classes) instances.gt_classes = torch.tensor(classes, dtype=torch.int64) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor dataset_dict["instances"] = instances return dataset_dict ================================================ FILE: mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py ================================================ import copy import logging import numpy as np import torch from torch.nn import functional as F from detectron2.config import configurable from detectron2.data import MetadataCatalog from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from detectron2.projects.point_rend import ColorAugSSDTransform from detectron2.structures import BitMasks, Instances __all__ = ["MaskFormerSemanticDatasetMapper"] class MaskFormerSemanticDatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by MaskFormer for semantic segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, augmentations, image_format, ignore_label, size_divisibility, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. ignore_label: the label that is ignored to evaluation size_divisibility: pad image size to be divisible by this value """ self.is_train = is_train self.tfm_gens = augmentations self.img_format = image_format self.ignore_label = ignore_label self.size_divisibility = size_divisibility logger = logging.getLogger(__name__) mode = "training" if is_train else "inference" logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") @classmethod def from_config(cls, cfg, is_train=True): # Build augmentation augs = [ T.ResizeShortestEdge( cfg.INPUT.MIN_SIZE_TRAIN, cfg.INPUT.MAX_SIZE_TRAIN, cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, ) ] if cfg.INPUT.CROP.ENABLED: augs.append( T.RandomCrop_CategoryAreaConstraint( cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE, cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, ) ) if cfg.INPUT.COLOR_AUG_SSD: augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) augs.append(T.RandomFlip()) # Assume always applies to the training set. 
dataset_names = cfg.DATASETS.TRAIN meta = MetadataCatalog.get(dataset_names[0]) ignore_label = meta.ignore_label ret = { "is_train": is_train, "augmentations": augs, "image_format": cfg.INPUT.FORMAT, "ignore_label": ignore_label, "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) if "sem_seg_file_name" in dataset_dict: # PyTorch transformation not implemented for uint16, so converting it to double first sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") else: sem_seg_gt = None if sem_seg_gt is None: raise ValueError( "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( dataset_dict["file_name"] ) ) aug_input = T.AugInput(image, sem_seg=sem_seg_gt) aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) image = aug_input.image sem_seg_gt = aug_input.sem_seg # Pad image and segmentation label here! image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if sem_seg_gt is not None: sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) if self.size_divisibility > 0: image_size = (image.shape[-2], image.shape[-1]) padding_size = [ 0, self.size_divisibility - image_size[1], 0, self.size_divisibility - image_size[0], ] image = F.pad(image, padding_size, value=128).contiguous() if sem_seg_gt is not None: sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() image_shape = (image.shape[-2], image.shape[-1]) # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"] = image if sem_seg_gt is not None: dataset_dict["sem_seg"] = sem_seg_gt.long() if "annotations" in dataset_dict: raise ValueError("Semantic segmentation dataset should not have 'annotations'.") # Prepare per-category binary masks if sem_seg_gt is not None: sem_seg_gt = sem_seg_gt.numpy() instances = Instances(image_shape) classes = np.unique(sem_seg_gt) # remove ignored region classes = classes[classes != self.ignore_label] instances.gt_classes = torch.tensor(classes, dtype=torch.int64) masks = [] for class_id in classes: masks.append(sem_seg_gt == class_id) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor dataset_dict["instances"] = instances return dataset_dict ================================================ FILE: mask2former/data/datasets/__init__.py ================================================ from . 
import ( register_ade20k_full, register_ade20k_panoptic, register_coco_stuff_10k, register_mapillary_vistas, register_coco_panoptic_annos_semseg, register_ade20k_instance, register_mapillary_vistas_panoptic, ) ================================================ FILE: mask2former/data/datasets/register_ade20k_full.py ================================================ import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data.datasets import load_sem_seg ADE20K_SEM_SEG_FULL_CATEGORIES = [ {"name": "wall", "id": 2978, "trainId": 0}, {"name": "building, edifice", "id": 312, "trainId": 1}, {"name": "sky", "id": 2420, "trainId": 2}, {"name": "tree", "id": 2855, "trainId": 3}, {"name": "road, route", "id": 2131, "trainId": 4}, {"name": "floor, flooring", "id": 976, "trainId": 5}, {"name": "ceiling", "id": 447, "trainId": 6}, {"name": "bed", "id": 165, "trainId": 7}, {"name": "sidewalk, pavement", "id": 2377, "trainId": 8}, {"name": "earth, ground", "id": 838, "trainId": 9}, {"name": "cabinet", "id": 350, "trainId": 10}, {"name": "person, individual, someone, somebody, mortal, soul", "id": 1831, "trainId": 11}, {"name": "grass", "id": 1125, "trainId": 12}, {"name": "windowpane, window", "id": 3055, "trainId": 13}, {"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14}, {"name": "mountain, mount", "id": 1610, "trainId": 15}, {"name": "plant, flora, plant life", "id": 1910, "trainId": 16}, {"name": "table", "id": 2684, "trainId": 17}, {"name": "chair", "id": 471, "trainId": 18}, {"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19}, {"name": "door", "id": 774, "trainId": 20}, {"name": "sofa, couch, lounge", "id": 2473, "trainId": 21}, {"name": "sea", "id": 2264, "trainId": 22}, {"name": "painting, picture", "id": 1735, "trainId": 23}, {"name": "water", "id": 2994, "trainId": 24}, {"name": "mirror", "id": 1564, "trainId": 25}, {"name": "house", "id": 1276, "trainId": 26}, {"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27}, {"name": "shelf", "id": 2329, "trainId": 28}, {"name": "armchair", "id": 57, "trainId": 29}, {"name": "fence, fencing", "id": 907, "trainId": 30}, {"name": "field", "id": 913, "trainId": 31}, {"name": "lamp", "id": 1395, "trainId": 32}, {"name": "rock, stone", "id": 2138, "trainId": 33}, {"name": "seat", "id": 2272, "trainId": 34}, {"name": "river", "id": 2128, "trainId": 35}, {"name": "desk", "id": 724, "trainId": 36}, {"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37}, {"name": "railing, rail", "id": 2053, "trainId": 38}, {"name": "signboard, sign", "id": 2380, "trainId": 39}, {"name": "cushion", "id": 689, "trainId": 40}, {"name": "path", "id": 1788, "trainId": 41}, {"name": "work surface", "id": 3087, "trainId": 42}, {"name": "stairs, steps", "id": 2530, "trainId": 43}, {"name": "column, pillar", "id": 581, "trainId": 44}, {"name": "sink", "id": 2388, "trainId": 45}, {"name": "wardrobe, closet, press", "id": 2985, "trainId": 46}, {"name": "snow", "id": 2454, "trainId": 47}, {"name": "refrigerator, icebox", "id": 2096, "trainId": 48}, {"name": "base, pedestal, stand", "id": 137, "trainId": 49}, {"name": "bridge, span", "id": 294, "trainId": 50}, {"name": "blind, screen", "id": 212, "trainId": 51}, {"name": "runway", "id": 2185, "trainId": 52}, {"name": "cliff, drop, drop-off", "id": 524, "trainId": 53}, {"name": "sand", "id": 2212, "trainId": 54}, {"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55}, {"name": "pillow", "id": 1869, "trainId": 56}, {"name": 
"screen door, screen", "id": 2251, "trainId": 57}, {"name": "toilet, can, commode, crapper, pot, potty, stool, throne", "id": 2793, "trainId": 58}, {"name": "skyscraper", "id": 2423, "trainId": 59}, {"name": "grandstand, covered stand", "id": 1121, "trainId": 60}, {"name": "box", "id": 266, "trainId": 61}, {"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62}, {"name": "palm, palm tree", "id": 1744, "trainId": 63}, {"name": "double door", "id": 783, "trainId": 64}, {"name": "coffee table, cocktail table", "id": 571, "trainId": 65}, {"name": "counter", "id": 627, "trainId": 66}, {"name": "countertop", "id": 629, "trainId": 67}, {"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68}, {"name": "kitchen island", "id": 1374, "trainId": 69}, {"name": "boat", "id": 223, "trainId": 70}, {"name": "waterfall, falls", "id": 3016, "trainId": 71}, { "name": "stove, kitchen stove, range, kitchen range, cooking stove", "id": 2598, "trainId": 72, }, {"name": "flower", "id": 978, "trainId": 73}, {"name": "bookcase", "id": 239, "trainId": 74}, {"name": "controls", "id": 608, "trainId": 75}, {"name": "book", "id": 236, "trainId": 76}, {"name": "stairway, staircase", "id": 2531, "trainId": 77}, {"name": "streetlight, street lamp", "id": 2616, "trainId": 78}, { "name": "computer, computing machine, computing device, data processor, electronic computer, information processing system", "id": 591, "trainId": 79, }, { "name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle", "id": 327, "trainId": 80, }, {"name": "swivel chair", "id": 2679, "trainId": 81}, {"name": "light, light source", "id": 1451, "trainId": 82}, {"name": "bench", "id": 181, "trainId": 83}, {"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84}, {"name": "towel", "id": 2821, "trainId": 85}, {"name": "fountain", "id": 1023, "trainId": 86}, {"name": "embankment", "id": 855, "trainId": 87}, { "name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box", "id": 2733, "trainId": 88, }, {"name": "van", "id": 2928, "trainId": 89}, {"name": "hill", "id": 1240, "trainId": 90}, {"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91}, {"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92}, {"name": "truck, motortruck", "id": 2880, "trainId": 93}, {"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94}, {"name": "pole", "id": 1936, "trainId": 95}, {"name": "tower", "id": 2828, "trainId": 96}, {"name": "court", "id": 631, "trainId": 97}, {"name": "ball", "id": 103, "trainId": 98}, { "name": "aircraft carrier, carrier, flattop, attack aircraft carrier", "id": 3144, "trainId": 99, }, {"name": "buffet, counter, sideboard", "id": 308, "trainId": 100}, {"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101}, {"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102}, {"name": "minibike, motorbike", "id": 1563, "trainId": 103}, {"name": "animal, animate being, beast, brute, creature, fauna", "id": 29, "trainId": 104}, {"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105}, {"name": "step, stair", "id": 2569, "trainId": 106}, {"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107}, {"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108}, {"name": "doorframe, doorcase", "id": 778, "trainId": 109}, {"name": "sconce", "id": 2243, "trainId": 110}, {"name": "pond", "id": 1941, 
"trainId": 111}, {"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112}, {"name": "bannister, banister, balustrade, balusters, handrail", "id": 120, "trainId": 113}, {"name": "bag", "id": 95, "trainId": 114}, {"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115}, {"name": "gazebo", "id": 1087, "trainId": 116}, {"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117}, {"name": "land, ground, soil", "id": 1401, "trainId": 118}, {"name": "board, plank", "id": 220, "trainId": 119}, {"name": "arcade machine", "id": 47, "trainId": 120}, {"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121}, {"name": "bar", "id": 123, "trainId": 122}, {"name": "stall, stand, sales booth", "id": 2537, "trainId": 123}, {"name": "playground", "id": 1927, "trainId": 124}, {"name": "ship", "id": 2337, "trainId": 125}, {"name": "ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126}, { "name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", "id": 64, "trainId": 127, }, {"name": "bottle", "id": 249, "trainId": 128}, {"name": "cradle", "id": 642, "trainId": 129}, {"name": "pot, flowerpot", "id": 1981, "trainId": 130}, { "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", "id": 609, "trainId": 131, }, {"name": "train, railroad train", "id": 2840, "trainId": 132}, {"name": "stool", "id": 2586, "trainId": 133}, {"name": "lake", "id": 1393, "trainId": 134}, {"name": "tank, storage tank", "id": 2704, "trainId": 135}, {"name": "ice, water ice", "id": 1304, "trainId": 136}, {"name": "basket, handbasket", "id": 146, "trainId": 137}, {"name": "manhole", "id": 1494, "trainId": 138}, {"name": "tent, collapsible shelter", "id": 2739, "trainId": 139}, {"name": "canopy", "id": 389, "trainId": 140}, {"name": "microwave, microwave oven", "id": 1551, "trainId": 141}, {"name": "barrel, cask", "id": 131, "trainId": 142}, {"name": "dirt track", "id": 738, "trainId": 143}, {"name": "beam", "id": 161, "trainId": 144}, {"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145}, {"name": "plate", "id": 1919, "trainId": 146}, {"name": "screen, crt screen", "id": 3109, "trainId": 147}, {"name": "ruins", "id": 2179, "trainId": 148}, {"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149}, {"name": "blanket, cover", "id": 206, "trainId": 150}, {"name": "plaything, toy", "id": 1930, "trainId": 151}, {"name": "food, solid food", "id": 1002, "trainId": 152}, {"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153}, {"name": "oven", "id": 1708, "trainId": 154}, {"name": "stage", "id": 2526, "trainId": 155}, {"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156}, {"name": "umbrella", "id": 2901, "trainId": 157}, {"name": "sculpture", "id": 2262, "trainId": 158}, {"name": "aqueduct", "id": 44, "trainId": 159}, {"name": "container", "id": 597, "trainId": 160}, {"name": "scaffolding, staging", "id": 2235, "trainId": 161}, {"name": "hood, exhaust hood", "id": 1260, "trainId": 162}, {"name": "curb, curbing, kerb", "id": 682, "trainId": 163}, {"name": "roller coaster", "id": 2151, "trainId": 164}, {"name": "horse, equus caballus", "id": 3107, "trainId": 165}, {"name": "catwalk", "id": 432, "trainId": 166}, {"name": "glass, drinking glass", "id": 1098, "trainId": 167}, {"name": "vase", "id": 2932, "trainId": 168}, {"name": "central reservation", "id": 461, "trainId": 169}, 
{"name": "carousel", "id": 410, "trainId": 170}, {"name": "radiator", "id": 2046, "trainId": 171}, {"name": "closet", "id": 533, "trainId": 172}, {"name": "machine", "id": 1481, "trainId": 173}, {"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174}, {"name": "fan", "id": 894, "trainId": 175}, {"name": "inflatable bounce game", "id": 1322, "trainId": 176}, {"name": "pitch", "id": 1891, "trainId": 177}, {"name": "paper", "id": 1756, "trainId": 178}, {"name": "arcade, colonnade", "id": 49, "trainId": 179}, {"name": "hot tub", "id": 1272, "trainId": 180}, {"name": "helicopter", "id": 1229, "trainId": 181}, {"name": "tray", "id": 2850, "trainId": 182}, {"name": "partition, divider", "id": 1784, "trainId": 183}, {"name": "vineyard", "id": 2962, "trainId": 184}, {"name": "bowl", "id": 259, "trainId": 185}, {"name": "bullring", "id": 319, "trainId": 186}, {"name": "flag", "id": 954, "trainId": 187}, {"name": "pot", "id": 1974, "trainId": 188}, {"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189}, {"name": "shower", "id": 2356, "trainId": 190}, {"name": "bag, traveling bag, travelling bag, grip, suitcase", "id": 97, "trainId": 191}, {"name": "bulletin board, notice board", "id": 318, "trainId": 192}, {"name": "confessional booth", "id": 592, "trainId": 193}, {"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194}, {"name": "forest", "id": 1017, "trainId": 195}, {"name": "elevator door", "id": 851, "trainId": 196}, {"name": "laptop, laptop computer", "id": 1407, "trainId": 197}, {"name": "instrument panel", "id": 1332, "trainId": 198}, {"name": "bucket, pail", "id": 303, "trainId": 199}, {"name": "tapestry, tapis", "id": 2714, "trainId": 200}, {"name": "platform", "id": 1924, "trainId": 201}, {"name": "jacket", "id": 1346, "trainId": 202}, {"name": "gate", "id": 1081, "trainId": 203}, {"name": "monitor, monitoring device", "id": 1583, "trainId": 204}, { "name": "telephone booth, phone booth, call box, telephone box, telephone kiosk", "id": 2727, "trainId": 205, }, {"name": "spotlight, spot", "id": 2509, "trainId": 206}, {"name": "ring", "id": 2123, "trainId": 207}, {"name": "control panel", "id": 602, "trainId": 208}, {"name": "blackboard, chalkboard", "id": 202, "trainId": 209}, {"name": "air conditioner, air conditioning", "id": 10, "trainId": 210}, {"name": "chest", "id": 490, "trainId": 211}, {"name": "clock", "id": 530, "trainId": 212}, {"name": "sand dune", "id": 2213, "trainId": 213}, {"name": "pipe, pipage, piping", "id": 1884, "trainId": 214}, {"name": "vault", "id": 2934, "trainId": 215}, {"name": "table football", "id": 2687, "trainId": 216}, {"name": "cannon", "id": 387, "trainId": 217}, {"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218}, {"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219}, {"name": "statue", "id": 2547, "trainId": 220}, { "name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", "id": 1474, "trainId": 221, }, {"name": "exhibitor", "id": 877, "trainId": 222}, {"name": "ladder", "id": 1391, "trainId": 223}, {"name": "carport", "id": 414, "trainId": 224}, {"name": "dam", "id": 698, "trainId": 225}, {"name": "pulpit", "id": 2019, "trainId": 226}, {"name": "skylight, fanlight", "id": 2422, "trainId": 227}, {"name": "water tower", "id": 3010, "trainId": 228}, {"name": "grill, grille, grillwork", "id": 1139, "trainId": 229}, {"name": "display board", "id": 753, "trainId": 230}, {"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231}, 
{"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232}, {"name": "ice rink", "id": 1301, "trainId": 233}, {"name": "fruit", "id": 1033, "trainId": 234}, {"name": "patio", "id": 1789, "trainId": 235}, {"name": "vending machine", "id": 2939, "trainId": 236}, {"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237}, {"name": "net", "id": 1652, "trainId": 238}, { "name": "backpack, back pack, knapsack, packsack, rucksack, haversack", "id": 90, "trainId": 239, }, {"name": "jar", "id": 1349, "trainId": 240}, {"name": "track", "id": 2830, "trainId": 241}, {"name": "magazine", "id": 1485, "trainId": 242}, {"name": "shutter", "id": 2370, "trainId": 243}, {"name": "roof", "id": 2155, "trainId": 244}, {"name": "banner, streamer", "id": 118, "trainId": 245}, {"name": "landfill", "id": 1402, "trainId": 246}, {"name": "post", "id": 1957, "trainId": 247}, {"name": "altarpiece, reredos", "id": 3130, "trainId": 248}, {"name": "hat, chapeau, lid", "id": 1197, "trainId": 249}, {"name": "arch, archway", "id": 52, "trainId": 250}, {"name": "table game", "id": 2688, "trainId": 251}, {"name": "bag, handbag, pocketbook, purse", "id": 96, "trainId": 252}, {"name": "document, written document, papers", "id": 762, "trainId": 253}, {"name": "dome", "id": 772, "trainId": 254}, {"name": "pier", "id": 1857, "trainId": 255}, {"name": "shanties", "id": 2315, "trainId": 256}, {"name": "forecourt", "id": 1016, "trainId": 257}, {"name": "crane", "id": 643, "trainId": 258}, {"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259}, {"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260}, {"name": "drawing", "id": 791, "trainId": 261}, {"name": "cabin", "id": 349, "trainId": 262}, { "name": "ad, advertisement, advertizement, advertising, advertizing, advert", "id": 6, "trainId": 263, }, {"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264}, {"name": "monument", "id": 1587, "trainId": 265}, {"name": "henhouse", "id": 1233, "trainId": 266}, {"name": "cockpit", "id": 559, "trainId": 267}, {"name": "heater, warmer", "id": 1223, "trainId": 268}, {"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269}, {"name": "pool", "id": 1943, "trainId": 270}, {"name": "elevator, lift", "id": 853, "trainId": 271}, {"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272}, {"name": "labyrinth", "id": 1390, "trainId": 273}, {"name": "text, textual matter", "id": 2748, "trainId": 274}, {"name": "printer", "id": 2007, "trainId": 275}, {"name": "mezzanine, first balcony", "id": 1546, "trainId": 276}, {"name": "mattress", "id": 1513, "trainId": 277}, {"name": "straw", "id": 2600, "trainId": 278}, {"name": "stalls", "id": 2538, "trainId": 279}, {"name": "patio, terrace", "id": 1790, "trainId": 280}, {"name": "billboard, hoarding", "id": 194, "trainId": 281}, {"name": "bus stop", "id": 326, "trainId": 282}, {"name": "trouser, pant", "id": 2877, "trainId": 283}, {"name": "console table, console", "id": 594, "trainId": 284}, {"name": "rack", "id": 2036, "trainId": 285}, {"name": "notebook", "id": 1662, "trainId": 286}, {"name": "shrine", "id": 2366, "trainId": 287}, {"name": "pantry", "id": 1754, "trainId": 288}, {"name": "cart", "id": 418, "trainId": 289}, {"name": "steam shovel", "id": 2553, "trainId": 290}, {"name": "porch", "id": 1951, "trainId": 291}, {"name": "postbox, mailbox, letter box", "id": 1963, "trainId": 292}, {"name": "figurine, statuette", "id": 918, "trainId": 293}, {"name": "recycling bin", "id": 2086, "trainId": 294}, 
{"name": "folding screen", "id": 997, "trainId": 295}, {"name": "telescope", "id": 2731, "trainId": 296}, {"name": "deck chair, beach chair", "id": 704, "trainId": 297}, {"name": "kennel", "id": 1365, "trainId": 298}, {"name": "coffee maker", "id": 569, "trainId": 299}, {"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300}, {"name": "fish", "id": 948, "trainId": 301}, {"name": "easel", "id": 839, "trainId": 302}, {"name": "artificial golf green", "id": 63, "trainId": 303}, {"name": "iceberg", "id": 1305, "trainId": 304}, {"name": "candlestick, candle holder", "id": 378, "trainId": 305}, {"name": "shower stall, shower bath", "id": 2362, "trainId": 306}, {"name": "television stand", "id": 2734, "trainId": 307}, { "name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle", "id": 2982, "trainId": 308, }, {"name": "skeleton", "id": 2398, "trainId": 309}, {"name": "grand piano, grand", "id": 1119, "trainId": 310}, {"name": "candy, confect", "id": 382, "trainId": 311}, {"name": "grille door", "id": 1141, "trainId": 312}, {"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313}, {"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314}, {"name": "shoe", "id": 2341, "trainId": 315}, {"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316}, {"name": "shanty", "id": 2316, "trainId": 317}, {"name": "structure", "id": 2626, "trainId": 318}, {"name": "rocking chair, rocker", "id": 3104, "trainId": 319}, {"name": "bird", "id": 198, "trainId": 320}, {"name": "place mat", "id": 1896, "trainId": 321}, {"name": "tomb", "id": 2800, "trainId": 322}, {"name": "big top", "id": 190, "trainId": 323}, {"name": "gas pump, gasoline pump, petrol pump, island dispenser", "id": 3131, "trainId": 324}, {"name": "lockers", "id": 1463, "trainId": 325}, {"name": "cage", "id": 357, "trainId": 326}, {"name": "finger", "id": 929, "trainId": 327}, {"name": "bleachers", "id": 209, "trainId": 328}, {"name": "ferris wheel", "id": 912, "trainId": 329}, {"name": "hairdresser chair", "id": 1164, "trainId": 330}, {"name": "mat", "id": 1509, "trainId": 331}, {"name": "stands", "id": 2539, "trainId": 332}, {"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333}, {"name": "streetcar, tram, tramcar, trolley, trolley car", "id": 2615, "trainId": 334}, {"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335}, {"name": "dummy", "id": 818, "trainId": 336}, {"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337}, {"name": "sand trap", "id": 2217, "trainId": 338}, {"name": "shop, store", "id": 2347, "trainId": 339}, {"name": "table cloth", "id": 2686, "trainId": 340}, {"name": "service station", "id": 2300, "trainId": 341}, {"name": "coffin", "id": 572, "trainId": 342}, {"name": "drawer", "id": 789, "trainId": 343}, {"name": "cages", "id": 358, "trainId": 344}, {"name": "slot machine, coin machine", "id": 2443, "trainId": 345}, {"name": "balcony", "id": 101, "trainId": 346}, {"name": "volleyball court", "id": 2969, "trainId": 347}, {"name": "table tennis", "id": 2692, "trainId": 348}, {"name": "control table", "id": 606, "trainId": 349}, {"name": "shirt", "id": 2339, "trainId": 350}, {"name": "merchandise, ware, product", "id": 1533, "trainId": 351}, {"name": "railway", "id": 2060, "trainId": 352}, {"name": "parterre", "id": 1782, "trainId": 353}, {"name": "chimney", "id": 495, "trainId": 354}, {"name": "can, tin, tin can", "id": 371, "trainId": 355}, {"name": "tanks", "id": 2707, 
"trainId": 356}, {"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357}, {"name": "alga, algae", "id": 3156, "trainId": 358}, {"name": "system", "id": 2683, "trainId": 359}, {"name": "map", "id": 1499, "trainId": 360}, {"name": "greenhouse", "id": 1135, "trainId": 361}, {"name": "mug", "id": 1619, "trainId": 362}, {"name": "barbecue", "id": 125, "trainId": 363}, {"name": "trailer", "id": 2838, "trainId": 364}, {"name": "toilet tissue, toilet paper, bathroom tissue", "id": 2792, "trainId": 365}, {"name": "organ", "id": 1695, "trainId": 366}, {"name": "dishrag, dishcloth", "id": 746, "trainId": 367}, {"name": "island", "id": 1343, "trainId": 368}, {"name": "keyboard", "id": 1370, "trainId": 369}, {"name": "trench", "id": 2858, "trainId": 370}, {"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371}, {"name": "steering wheel, wheel", "id": 2565, "trainId": 372}, {"name": "pitcher, ewer", "id": 1892, "trainId": 373}, {"name": "goal", "id": 1103, "trainId": 374}, {"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375}, {"name": "beds", "id": 170, "trainId": 376}, {"name": "wood", "id": 3073, "trainId": 377}, {"name": "file cabinet", "id": 922, "trainId": 378}, {"name": "newspaper, paper", "id": 1655, "trainId": 379}, {"name": "motorboat", "id": 1602, "trainId": 380}, {"name": "rope", "id": 2160, "trainId": 381}, {"name": "guitar", "id": 1151, "trainId": 382}, {"name": "rubble", "id": 2176, "trainId": 383}, {"name": "scarf", "id": 2239, "trainId": 384}, {"name": "barrels", "id": 132, "trainId": 385}, {"name": "cap", "id": 394, "trainId": 386}, {"name": "leaves", "id": 1424, "trainId": 387}, {"name": "control tower", "id": 607, "trainId": 388}, {"name": "dashboard", "id": 700, "trainId": 389}, {"name": "bandstand", "id": 116, "trainId": 390}, {"name": "lectern", "id": 1425, "trainId": 391}, {"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392}, {"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393}, {"name": "shower room", "id": 2360, "trainId": 394}, {"name": "smoke", "id": 2449, "trainId": 395}, {"name": "faucet, spigot", "id": 897, "trainId": 396}, {"name": "bulldozer", "id": 317, "trainId": 397}, {"name": "saucepan", "id": 2228, "trainId": 398}, {"name": "shops", "id": 2351, "trainId": 399}, {"name": "meter", "id": 1543, "trainId": 400}, {"name": "crevasse", "id": 656, "trainId": 401}, {"name": "gear", "id": 1088, "trainId": 402}, {"name": "candelabrum, candelabra", "id": 373, "trainId": 403}, {"name": "sofa bed", "id": 2472, "trainId": 404}, {"name": "tunnel", "id": 2892, "trainId": 405}, {"name": "pallet", "id": 1740, "trainId": 406}, {"name": "wire, conducting wire", "id": 3067, "trainId": 407}, {"name": "kettle, boiler", "id": 1367, "trainId": 408}, {"name": "bidet", "id": 188, "trainId": 409}, { "name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher", "id": 79, "trainId": 410, }, {"name": "music stand", "id": 1633, "trainId": 411}, {"name": "pipe, tube", "id": 1885, "trainId": 412}, {"name": "cup", "id": 677, "trainId": 413}, {"name": "parking meter", "id": 1779, "trainId": 414}, {"name": "ice hockey rink", "id": 1297, "trainId": 415}, {"name": "shelter", "id": 2334, "trainId": 416}, {"name": "weeds", "id": 3027, "trainId": 417}, {"name": "temple", "id": 2735, "trainId": 418}, {"name": "patty, cake", "id": 1791, "trainId": 419}, {"name": "ski slope", "id": 2405, "trainId": 420}, {"name": "panel", "id": 1748, "trainId": 421}, {"name": "wallet", 
"id": 2983, "trainId": 422}, {"name": "wheel", "id": 3035, "trainId": 423}, {"name": "towel rack, towel horse", "id": 2824, "trainId": 424}, {"name": "roundabout", "id": 2168, "trainId": 425}, {"name": "canister, cannister, tin", "id": 385, "trainId": 426}, {"name": "rod", "id": 2148, "trainId": 427}, {"name": "soap dispenser", "id": 2465, "trainId": 428}, {"name": "bell", "id": 175, "trainId": 429}, {"name": "canvas", "id": 390, "trainId": 430}, {"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431}, {"name": "teacup", "id": 2722, "trainId": 432}, {"name": "trellis", "id": 2857, "trainId": 433}, {"name": "workbench", "id": 3088, "trainId": 434}, {"name": "valley, vale", "id": 2926, "trainId": 435}, {"name": "toaster", "id": 2782, "trainId": 436}, {"name": "knife", "id": 1378, "trainId": 437}, {"name": "podium", "id": 1934, "trainId": 438}, {"name": "ramp", "id": 2072, "trainId": 439}, {"name": "tumble dryer", "id": 2889, "trainId": 440}, {"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441}, {"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442}, {"name": "lab bench", "id": 1383, "trainId": 443}, {"name": "equipment", "id": 867, "trainId": 444}, {"name": "rocky formation", "id": 2145, "trainId": 445}, {"name": "plastic", "id": 1915, "trainId": 446}, {"name": "calendar", "id": 361, "trainId": 447}, {"name": "caravan", "id": 402, "trainId": 448}, {"name": "check-in-desk", "id": 482, "trainId": 449}, {"name": "ticket counter", "id": 2761, "trainId": 450}, {"name": "brush", "id": 300, "trainId": 451}, {"name": "mill", "id": 1554, "trainId": 452}, {"name": "covered bridge", "id": 636, "trainId": 453}, {"name": "bowling alley", "id": 260, "trainId": 454}, {"name": "hanger", "id": 1186, "trainId": 455}, {"name": "excavator", "id": 871, "trainId": 456}, {"name": "trestle", "id": 2859, "trainId": 457}, {"name": "revolving door", "id": 2103, "trainId": 458}, {"name": "blast furnace", "id": 208, "trainId": 459}, {"name": "scale, weighing machine", "id": 2236, "trainId": 460}, {"name": "projector", "id": 2012, "trainId": 461}, {"name": "soap", "id": 2462, "trainId": 462}, {"name": "locker", "id": 1462, "trainId": 463}, {"name": "tractor", "id": 2832, "trainId": 464}, {"name": "stretcher", "id": 2617, "trainId": 465}, {"name": "frame", "id": 1024, "trainId": 466}, {"name": "grating", "id": 1129, "trainId": 467}, {"name": "alembic", "id": 18, "trainId": 468}, {"name": "candle, taper, wax light", "id": 376, "trainId": 469}, {"name": "barrier", "id": 134, "trainId": 470}, {"name": "cardboard", "id": 407, "trainId": 471}, {"name": "cave", "id": 434, "trainId": 472}, {"name": "puddle", "id": 2017, "trainId": 473}, {"name": "tarp", "id": 2717, "trainId": 474}, {"name": "price tag", "id": 2005, "trainId": 475}, {"name": "watchtower", "id": 2993, "trainId": 476}, {"name": "meters", "id": 1545, "trainId": 477}, { "name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb", "id": 1445, "trainId": 478, }, {"name": "tracks", "id": 2831, "trainId": 479}, {"name": "hair dryer", "id": 1161, "trainId": 480}, {"name": "skirt", "id": 2411, "trainId": 481}, {"name": "viaduct", "id": 2949, "trainId": 482}, {"name": "paper towel", "id": 1769, "trainId": 483}, {"name": "coat", "id": 552, "trainId": 484}, {"name": "sheet", "id": 2327, "trainId": 485}, {"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486}, {"name": "water wheel", "id": 3013, "trainId": 487}, {"name": "pottery, clayware", "id": 1986, 
"trainId": 488}, {"name": "magazine rack", "id": 1486, "trainId": 489}, {"name": "teapot", "id": 2723, "trainId": 490}, {"name": "microphone, mike", "id": 1549, "trainId": 491}, {"name": "support", "id": 2649, "trainId": 492}, {"name": "forklift", "id": 1020, "trainId": 493}, {"name": "canyon", "id": 392, "trainId": 494}, {"name": "cash register, register", "id": 422, "trainId": 495}, {"name": "leaf, leafage, foliage", "id": 1419, "trainId": 496}, {"name": "remote control, remote", "id": 2099, "trainId": 497}, {"name": "soap dish", "id": 2464, "trainId": 498}, {"name": "windshield, windscreen", "id": 3058, "trainId": 499}, {"name": "cat", "id": 430, "trainId": 500}, {"name": "cue, cue stick, pool cue, pool stick", "id": 675, "trainId": 501}, {"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502}, {"name": "videos", "id": 2955, "trainId": 503}, {"name": "shovel", "id": 2355, "trainId": 504}, {"name": "eaves", "id": 840, "trainId": 505}, {"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506}, {"name": "shipyard", "id": 2338, "trainId": 507}, {"name": "hen, biddy", "id": 1232, "trainId": 508}, {"name": "traffic cone", "id": 2834, "trainId": 509}, {"name": "washing machines", "id": 2991, "trainId": 510}, {"name": "truck crane", "id": 2879, "trainId": 511}, {"name": "cds", "id": 444, "trainId": 512}, {"name": "niche", "id": 1657, "trainId": 513}, {"name": "scoreboard", "id": 2246, "trainId": 514}, {"name": "briefcase", "id": 296, "trainId": 515}, {"name": "boot", "id": 245, "trainId": 516}, {"name": "sweater, jumper", "id": 2661, "trainId": 517}, {"name": "hay", "id": 1202, "trainId": 518}, {"name": "pack", "id": 1714, "trainId": 519}, {"name": "bottle rack", "id": 251, "trainId": 520}, {"name": "glacier", "id": 1095, "trainId": 521}, {"name": "pergola", "id": 1828, "trainId": 522}, {"name": "building materials", "id": 311, "trainId": 523}, {"name": "television camera", "id": 2732, "trainId": 524}, {"name": "first floor", "id": 947, "trainId": 525}, {"name": "rifle", "id": 2115, "trainId": 526}, {"name": "tennis table", "id": 2738, "trainId": 527}, {"name": "stadium", "id": 2525, "trainId": 528}, {"name": "safety belt", "id": 2194, "trainId": 529}, {"name": "cover", "id": 634, "trainId": 530}, {"name": "dish rack", "id": 740, "trainId": 531}, {"name": "synthesizer", "id": 2682, "trainId": 532}, {"name": "pumpkin", "id": 2020, "trainId": 533}, {"name": "gutter", "id": 1156, "trainId": 534}, {"name": "fruit stand", "id": 1036, "trainId": 535}, {"name": "ice floe, floe", "id": 1295, "trainId": 536}, {"name": "handle, grip, handgrip, hold", "id": 1181, "trainId": 537}, {"name": "wheelchair", "id": 3037, "trainId": 538}, {"name": "mousepad, mouse mat", "id": 1614, "trainId": 539}, {"name": "diploma", "id": 736, "trainId": 540}, {"name": "fairground ride", "id": 893, "trainId": 541}, {"name": "radio", "id": 2047, "trainId": 542}, {"name": "hotplate", "id": 1274, "trainId": 543}, {"name": "junk", "id": 1361, "trainId": 544}, {"name": "wheelbarrow", "id": 3036, "trainId": 545}, {"name": "stream", "id": 2606, "trainId": 546}, {"name": "toll plaza", "id": 2797, "trainId": 547}, {"name": "punching bag", "id": 2022, "trainId": 548}, {"name": "trough", "id": 2876, "trainId": 549}, {"name": "throne", "id": 2758, "trainId": 550}, {"name": "chair desk", "id": 472, "trainId": 551}, {"name": "weighbridge", "id": 3028, "trainId": 552}, {"name": "extractor fan", "id": 882, "trainId": 553}, {"name": "hanging clothes", "id": 1189, "trainId": 554}, {"name": "dish, dish aerial, 
dish antenna, saucer", "id": 743, "trainId": 555}, {"name": "alarm clock, alarm", "id": 3122, "trainId": 556}, {"name": "ski lift", "id": 2401, "trainId": 557}, {"name": "chain", "id": 468, "trainId": 558}, {"name": "garage", "id": 1061, "trainId": 559}, {"name": "mechanical shovel", "id": 1523, "trainId": 560}, {"name": "wine rack", "id": 3059, "trainId": 561}, {"name": "tramway", "id": 2843, "trainId": 562}, {"name": "treadmill", "id": 2853, "trainId": 563}, {"name": "menu", "id": 1529, "trainId": 564}, {"name": "block", "id": 214, "trainId": 565}, {"name": "well", "id": 3032, "trainId": 566}, {"name": "witness stand", "id": 3071, "trainId": 567}, {"name": "branch", "id": 277, "trainId": 568}, {"name": "duck", "id": 813, "trainId": 569}, {"name": "casserole", "id": 426, "trainId": 570}, {"name": "frying pan", "id": 1039, "trainId": 571}, {"name": "desk organizer", "id": 727, "trainId": 572}, {"name": "mast", "id": 1508, "trainId": 573}, {"name": "spectacles, specs, eyeglasses, glasses", "id": 2490, "trainId": 574}, {"name": "service elevator", "id": 2299, "trainId": 575}, {"name": "dollhouse", "id": 768, "trainId": 576}, {"name": "hammock", "id": 1172, "trainId": 577}, {"name": "clothes hanging", "id": 537, "trainId": 578}, {"name": "photocopier", "id": 1847, "trainId": 579}, {"name": "notepad", "id": 1664, "trainId": 580}, {"name": "golf cart", "id": 1110, "trainId": 581}, {"name": "footpath", "id": 1014, "trainId": 582}, {"name": "cross", "id": 662, "trainId": 583}, {"name": "baptismal font", "id": 121, "trainId": 584}, {"name": "boiler", "id": 227, "trainId": 585}, {"name": "skip", "id": 2410, "trainId": 586}, {"name": "rotisserie", "id": 2165, "trainId": 587}, {"name": "tables", "id": 2696, "trainId": 588}, {"name": "water mill", "id": 3005, "trainId": 589}, {"name": "helmet", "id": 1231, "trainId": 590}, {"name": "cover curtain", "id": 635, "trainId": 591}, {"name": "brick", "id": 292, "trainId": 592}, {"name": "table runner", "id": 2690, "trainId": 593}, {"name": "ashtray", "id": 65, "trainId": 594}, {"name": "street box", "id": 2607, "trainId": 595}, {"name": "stick", "id": 2574, "trainId": 596}, {"name": "hangers", "id": 1188, "trainId": 597}, {"name": "cells", "id": 456, "trainId": 598}, {"name": "urinal", "id": 2913, "trainId": 599}, {"name": "centerpiece", "id": 459, "trainId": 600}, {"name": "portable fridge", "id": 1955, "trainId": 601}, {"name": "dvds", "id": 827, "trainId": 602}, {"name": "golf club", "id": 1111, "trainId": 603}, {"name": "skirting board", "id": 2412, "trainId": 604}, {"name": "water cooler", "id": 2997, "trainId": 605}, {"name": "clipboard", "id": 528, "trainId": 606}, {"name": "camera, photographic camera", "id": 366, "trainId": 607}, {"name": "pigeonhole", "id": 1863, "trainId": 608}, {"name": "chips", "id": 500, "trainId": 609}, {"name": "food processor", "id": 1001, "trainId": 610}, {"name": "post box", "id": 1958, "trainId": 611}, {"name": "lid", "id": 1441, "trainId": 612}, {"name": "drum", "id": 809, "trainId": 613}, {"name": "blender", "id": 210, "trainId": 614}, {"name": "cave entrance", "id": 435, "trainId": 615}, {"name": "dental chair", "id": 718, "trainId": 616}, {"name": "obelisk", "id": 1674, "trainId": 617}, {"name": "canoe", "id": 388, "trainId": 618}, {"name": "mobile", "id": 1572, "trainId": 619}, {"name": "monitors", "id": 1584, "trainId": 620}, {"name": "pool ball", "id": 1944, "trainId": 621}, {"name": "cue rack", "id": 674, "trainId": 622}, {"name": "baggage carts", "id": 99, "trainId": 623}, {"name": "shore", "id": 2352, "trainId": 
624}, {"name": "fork", "id": 1019, "trainId": 625}, {"name": "paper filer", "id": 1763, "trainId": 626}, {"name": "bicycle rack", "id": 185, "trainId": 627}, {"name": "coat rack", "id": 554, "trainId": 628}, {"name": "garland", "id": 1066, "trainId": 629}, {"name": "sports bag", "id": 2508, "trainId": 630}, {"name": "fish tank", "id": 951, "trainId": 631}, {"name": "towel dispenser", "id": 2822, "trainId": 632}, {"name": "carriage", "id": 415, "trainId": 633}, {"name": "brochure", "id": 297, "trainId": 634}, {"name": "plaque", "id": 1914, "trainId": 635}, {"name": "stringer", "id": 2619, "trainId": 636}, {"name": "iron", "id": 1338, "trainId": 637}, {"name": "spoon", "id": 2505, "trainId": 638}, {"name": "flag pole", "id": 955, "trainId": 639}, {"name": "toilet brush", "id": 2786, "trainId": 640}, {"name": "book stand", "id": 238, "trainId": 641}, {"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642}, {"name": "ticket office", "id": 2763, "trainId": 643}, {"name": "broom", "id": 299, "trainId": 644}, {"name": "dvd", "id": 822, "trainId": 645}, {"name": "ice bucket", "id": 1288, "trainId": 646}, {"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647}, {"name": "tureen", "id": 2894, "trainId": 648}, {"name": "folders", "id": 992, "trainId": 649}, {"name": "chess", "id": 489, "trainId": 650}, {"name": "root", "id": 2157, "trainId": 651}, {"name": "sewing machine", "id": 2309, "trainId": 652}, {"name": "model", "id": 1576, "trainId": 653}, {"name": "pen", "id": 1810, "trainId": 654}, {"name": "violin", "id": 2964, "trainId": 655}, {"name": "sweatshirt", "id": 2662, "trainId": 656}, {"name": "recycling materials", "id": 2087, "trainId": 657}, {"name": "mitten", "id": 1569, "trainId": 658}, {"name": "chopping board, cutting board", "id": 503, "trainId": 659}, {"name": "mask", "id": 1505, "trainId": 660}, {"name": "log", "id": 1468, "trainId": 661}, {"name": "mouse, computer mouse", "id": 1613, "trainId": 662}, {"name": "grill", "id": 1138, "trainId": 663}, {"name": "hole", "id": 1256, "trainId": 664}, {"name": "target", "id": 2715, "trainId": 665}, {"name": "trash bag", "id": 2846, "trainId": 666}, {"name": "chalk", "id": 477, "trainId": 667}, {"name": "sticks", "id": 2576, "trainId": 668}, {"name": "balloon", "id": 108, "trainId": 669}, {"name": "score", "id": 2245, "trainId": 670}, {"name": "hair spray", "id": 1162, "trainId": 671}, {"name": "roll", "id": 2149, "trainId": 672}, {"name": "runner", "id": 2183, "trainId": 673}, {"name": "engine", "id": 858, "trainId": 674}, {"name": "inflatable glove", "id": 1324, "trainId": 675}, {"name": "games", "id": 1055, "trainId": 676}, {"name": "pallets", "id": 1741, "trainId": 677}, {"name": "baskets", "id": 149, "trainId": 678}, {"name": "coop", "id": 615, "trainId": 679}, {"name": "dvd player", "id": 825, "trainId": 680}, {"name": "rocking horse", "id": 2143, "trainId": 681}, {"name": "buckets", "id": 304, "trainId": 682}, {"name": "bread rolls", "id": 283, "trainId": 683}, {"name": "shawl", "id": 2322, "trainId": 684}, {"name": "watering can", "id": 3017, "trainId": 685}, {"name": "spotlights", "id": 2510, "trainId": 686}, {"name": "post-it", "id": 1960, "trainId": 687}, {"name": "bowls", "id": 265, "trainId": 688}, {"name": "security camera", "id": 2282, "trainId": 689}, {"name": "runner cloth", "id": 2184, "trainId": 690}, {"name": "lock", "id": 1461, "trainId": 691}, {"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692}, {"name": "side", "id": 2372, "trainId": 693}, {"name": "roulette", 
"id": 2166, "trainId": 694}, {"name": "bone", "id": 232, "trainId": 695}, {"name": "cutlery", "id": 693, "trainId": 696}, {"name": "pool balls", "id": 1945, "trainId": 697}, {"name": "wheels", "id": 3039, "trainId": 698}, {"name": "spice rack", "id": 2494, "trainId": 699}, {"name": "plant pots", "id": 1908, "trainId": 700}, {"name": "towel ring", "id": 2827, "trainId": 701}, {"name": "bread box", "id": 280, "trainId": 702}, {"name": "video", "id": 2950, "trainId": 703}, {"name": "funfair", "id": 1044, "trainId": 704}, {"name": "breads", "id": 288, "trainId": 705}, {"name": "tripod", "id": 2863, "trainId": 706}, {"name": "ironing board", "id": 1342, "trainId": 707}, {"name": "skimmer", "id": 2409, "trainId": 708}, {"name": "hollow", "id": 1258, "trainId": 709}, {"name": "scratching post", "id": 2249, "trainId": 710}, {"name": "tricycle", "id": 2862, "trainId": 711}, {"name": "file box", "id": 920, "trainId": 712}, {"name": "mountain pass", "id": 1607, "trainId": 713}, {"name": "tombstones", "id": 2802, "trainId": 714}, {"name": "cooker", "id": 610, "trainId": 715}, {"name": "card game, cards", "id": 3129, "trainId": 716}, {"name": "golf bag", "id": 1108, "trainId": 717}, {"name": "towel paper", "id": 2823, "trainId": 718}, {"name": "chaise lounge", "id": 476, "trainId": 719}, {"name": "sun", "id": 2641, "trainId": 720}, {"name": "toilet paper holder", "id": 2788, "trainId": 721}, {"name": "rake", "id": 2070, "trainId": 722}, {"name": "key", "id": 1368, "trainId": 723}, {"name": "umbrella stand", "id": 2903, "trainId": 724}, {"name": "dartboard", "id": 699, "trainId": 725}, {"name": "transformer", "id": 2844, "trainId": 726}, {"name": "fireplace utensils", "id": 942, "trainId": 727}, {"name": "sweatshirts", "id": 2663, "trainId": 728}, { "name": "cellular telephone, cellular phone, cellphone, cell, mobile phone", "id": 457, "trainId": 729, }, {"name": "tallboy", "id": 2701, "trainId": 730}, {"name": "stapler", "id": 2540, "trainId": 731}, {"name": "sauna", "id": 2231, "trainId": 732}, {"name": "test tube", "id": 2746, "trainId": 733}, {"name": "palette", "id": 1738, "trainId": 734}, {"name": "shopping carts", "id": 2350, "trainId": 735}, {"name": "tools", "id": 2808, "trainId": 736}, {"name": "push button, push, button", "id": 2025, "trainId": 737}, {"name": "star", "id": 2541, "trainId": 738}, {"name": "roof rack", "id": 2156, "trainId": 739}, {"name": "barbed wire", "id": 126, "trainId": 740}, {"name": "spray", "id": 2512, "trainId": 741}, {"name": "ear", "id": 831, "trainId": 742}, {"name": "sponge", "id": 2503, "trainId": 743}, {"name": "racket", "id": 2039, "trainId": 744}, {"name": "tins", "id": 2774, "trainId": 745}, {"name": "eyeglasses", "id": 886, "trainId": 746}, {"name": "file", "id": 919, "trainId": 747}, {"name": "scarfs", "id": 2240, "trainId": 748}, {"name": "sugar bowl", "id": 2636, "trainId": 749}, {"name": "flip flop", "id": 963, "trainId": 750}, {"name": "headstones", "id": 1218, "trainId": 751}, {"name": "laptop bag", "id": 1406, "trainId": 752}, {"name": "leash", "id": 1420, "trainId": 753}, {"name": "climbing frame", "id": 526, "trainId": 754}, {"name": "suit hanger", "id": 2639, "trainId": 755}, {"name": "floor spotlight", "id": 975, "trainId": 756}, {"name": "plate rack", "id": 1921, "trainId": 757}, {"name": "sewer", "id": 2305, "trainId": 758}, {"name": "hard drive", "id": 1193, "trainId": 759}, {"name": "sprinkler", "id": 2517, "trainId": 760}, {"name": "tools box", "id": 2809, "trainId": 761}, {"name": "necklace", "id": 1647, "trainId": 762}, {"name": "bulbs", 
"id": 314, "trainId": 763}, {"name": "steel industry", "id": 2560, "trainId": 764}, {"name": "club", "id": 545, "trainId": 765}, {"name": "jack", "id": 1345, "trainId": 766}, {"name": "door bars", "id": 775, "trainId": 767}, { "name": "control panel, instrument panel, control board, board, panel", "id": 603, "trainId": 768, }, {"name": "hairbrush", "id": 1163, "trainId": 769}, {"name": "napkin holder", "id": 1641, "trainId": 770}, {"name": "office", "id": 1678, "trainId": 771}, {"name": "smoke detector", "id": 2450, "trainId": 772}, {"name": "utensils", "id": 2915, "trainId": 773}, {"name": "apron", "id": 42, "trainId": 774}, {"name": "scissors", "id": 2242, "trainId": 775}, {"name": "terminal", "id": 2741, "trainId": 776}, {"name": "grinder", "id": 1143, "trainId": 777}, {"name": "entry phone", "id": 862, "trainId": 778}, {"name": "newspaper stand", "id": 1654, "trainId": 779}, {"name": "pepper shaker", "id": 1826, "trainId": 780}, {"name": "onions", "id": 1689, "trainId": 781}, { "name": "central processing unit, cpu, c p u , central processor, processor, mainframe", "id": 3124, "trainId": 782, }, {"name": "tape", "id": 2710, "trainId": 783}, {"name": "bat", "id": 152, "trainId": 784}, {"name": "coaster", "id": 549, "trainId": 785}, {"name": "calculator", "id": 360, "trainId": 786}, {"name": "potatoes", "id": 1982, "trainId": 787}, {"name": "luggage rack", "id": 1478, "trainId": 788}, {"name": "salt", "id": 2203, "trainId": 789}, {"name": "street number", "id": 2612, "trainId": 790}, {"name": "viewpoint", "id": 2956, "trainId": 791}, {"name": "sword", "id": 2681, "trainId": 792}, {"name": "cd", "id": 437, "trainId": 793}, {"name": "rowing machine", "id": 2171, "trainId": 794}, {"name": "plug", "id": 1933, "trainId": 795}, {"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796}, {"name": "pepper", "id": 1824, "trainId": 797}, {"name": "tongs", "id": 2803, "trainId": 798}, {"name": "bonfire", "id": 234, "trainId": 799}, {"name": "dog dish", "id": 764, "trainId": 800}, {"name": "belt", "id": 177, "trainId": 801}, {"name": "dumbbells", "id": 817, "trainId": 802}, {"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803}, {"name": "hook", "id": 1262, "trainId": 804}, {"name": "envelopes", "id": 864, "trainId": 805}, {"name": "shower faucet", "id": 2359, "trainId": 806}, {"name": "watch", "id": 2992, "trainId": 807}, {"name": "padlock", "id": 1725, "trainId": 808}, {"name": "swimming pool ladder", "id": 2667, "trainId": 809}, {"name": "spanners", "id": 2484, "trainId": 810}, {"name": "gravy boat", "id": 1133, "trainId": 811}, {"name": "notice board", "id": 1667, "trainId": 812}, {"name": "trash bags", "id": 2847, "trainId": 813}, {"name": "fire alarm", "id": 932, "trainId": 814}, {"name": "ladle", "id": 1392, "trainId": 815}, {"name": "stethoscope", "id": 2573, "trainId": 816}, {"name": "rocket", "id": 2140, "trainId": 817}, {"name": "funnel", "id": 1046, "trainId": 818}, {"name": "bowling pins", "id": 264, "trainId": 819}, {"name": "valve", "id": 2927, "trainId": 820}, {"name": "thermometer", "id": 2752, "trainId": 821}, {"name": "cups", "id": 679, "trainId": 822}, {"name": "spice jar", "id": 2493, "trainId": 823}, {"name": "night light", "id": 1658, "trainId": 824}, {"name": "soaps", "id": 2466, "trainId": 825}, {"name": "games table", "id": 1057, "trainId": 826}, {"name": "slotted spoon", "id": 2444, "trainId": 827}, {"name": "reel", "id": 2093, "trainId": 828}, {"name": "scourer", "id": 2248, "trainId": 829}, {"name": "sleeping robe", "id": 2432, "trainId": 830}, 
{"name": "desk mat", "id": 726, "trainId": 831}, {"name": "dumbbell", "id": 816, "trainId": 832}, {"name": "hammer", "id": 1171, "trainId": 833}, {"name": "tie", "id": 2766, "trainId": 834}, {"name": "typewriter", "id": 2900, "trainId": 835}, {"name": "shaker", "id": 2313, "trainId": 836}, {"name": "cheese dish", "id": 488, "trainId": 837}, {"name": "sea star", "id": 2265, "trainId": 838}, {"name": "racquet", "id": 2043, "trainId": 839}, {"name": "butane gas cylinder", "id": 332, "trainId": 840}, {"name": "paper weight", "id": 1771, "trainId": 841}, {"name": "shaving brush", "id": 2320, "trainId": 842}, {"name": "sunglasses", "id": 2646, "trainId": 843}, {"name": "gear shift", "id": 1089, "trainId": 844}, {"name": "towel rail", "id": 2826, "trainId": 845}, {"name": "adding machine, totalizer, totaliser", "id": 3148, "trainId": 846}, ] def _get_ade20k_full_meta(): # Id 0 is reserved for ignore_label, we change ignore_label for 0 # to 255 in our pre-processing, so all ids are shifted by 1. stuff_ids = [k["id"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES] assert len(stuff_ids) == 847, len(stuff_ids) # For semantic segmentation, this mapping maps from contiguous stuff id # (in [0, 91], used in models) to ids in the dataset (used for processing results) stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} stuff_classes = [k["name"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES] ret = { "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, "stuff_classes": stuff_classes, } return ret def register_all_ade20k_full(root): root = os.path.join(root, "ADE20K_2021_17_01") meta = _get_ade20k_full_meta() for name, dirname in [("train", "training"), ("val", "validation")]: image_dir = os.path.join(root, "images_detectron2", dirname) gt_dir = os.path.join(root, "annotations_detectron2", dirname) name = f"ade20k_full_sem_seg_{name}" DatasetCatalog.register( name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="tif", image_ext="jpg") ) MetadataCatalog.get(name).set( stuff_classes=meta["stuff_classes"][:], image_root=image_dir, sem_seg_root=gt_dir, evaluator_type="sem_seg", ignore_label=65535, # NOTE: gt is saved in 16-bit TIFF images ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_ade20k_full(_root) ================================================ FILE: mask2former/data/datasets/register_ade20k_instance.py ================================================ import json import logging import numpy as np import os from PIL import Image from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data.datasets.coco import load_coco_json, register_coco_instances from detectron2.utils.file_io import PathManager ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 
'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] _PREDEFINED_SPLITS = { # point annotations without masks "ade20k_instance_train": ( "ADEChallengeData2016/images/training", "ADEChallengeData2016/ade20k_instance_train.json", ), "ade20k_instance_val": ( "ADEChallengeData2016/images/validation", "ADEChallengeData2016/ade20k_instance_val.json", ), } def _get_ade_instances_meta(): thing_ids = [k["id"] for k in ADE_CATEGORIES] assert len(thing_ids) == 100, len(thing_ids) # Mapping from the incontiguous ADE category id to an id in [0, 99] thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} thing_classes = [k["name"] for k in ADE_CATEGORIES] ret = { "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes, } return ret def register_all_ade20k_instance(root): for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): # Assume pre-defined datasets live in `./datasets`. 
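# The call that follows uses detectron2's stock helper `register_coco_instances(name, metadata, json_file, image_root)`:
# for each split key (e.g. "ade20k_instance_train") it registers a DatasetCatalog loader that parses the COCO-style
# instance json via `load_coco_json`, and it records `json_file`, `image_root`, `evaluator_type="coco"` together with
# the metadata from `_get_ade_instances_meta()` (the 100 ADE "thing" class names and the dataset-id -> contiguous-id
# mapping) in MetadataCatalog under the same name. The json path is joined with `root` only when it is a local path;
# paths that already contain "://" are treated as remote URIs and passed through unchanged.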
register_coco_instances( key, _get_ade_instances_meta(), os.path.join(root, json_file) if "://" not in json_file else json_file, os.path.join(root, image_root), ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_ade20k_instance(_root) ================================================ FILE: mask2former/data/datasets/register_ade20k_panoptic.py ================================================ import json import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.utils.file_io import PathManager ADE20K_150_CATEGORIES = [ {"color": [120, 120, 120], "id": 0, "isthing": 0, "name": "wall"}, {"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"}, {"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"}, {"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"}, {"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"}, {"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"}, {"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"}, {"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"}, {"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "}, {"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"}, {"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"}, {"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"}, {"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"}, {"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"}, {"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"}, {"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"}, {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"}, {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"}, {"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"}, {"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"}, {"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"}, {"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"}, {"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"}, {"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"}, {"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"}, {"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"}, {"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"}, {"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"}, {"color": [255, 9, 92], "id": 28, "isthing": 0, "name": "rug"}, {"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"}, {"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"}, {"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"}, {"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"}, {"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"}, {"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"}, {"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"}, {"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"}, {"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"}, {"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"}, {"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"}, {"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"}, {"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"}, {"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"}, {"color": [255, 5, 153], 
"id": 43, "isthing": 1, "name": "signboard, sign"}, { "color": [6, 51, 255], "id": 44, "isthing": 1, "name": "chest of drawers, chest, bureau, dresser", }, {"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"}, {"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"}, {"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"}, {"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"}, {"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"}, {"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"}, {"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"}, {"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"}, {"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"}, {"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"}, {"color": [0, 0, 255], "id": 55, "isthing": 1, "name": "case, display case, showcase, vitrine"}, { "color": [255, 71, 0], "id": 56, "isthing": 1, "name": "pool table, billiard table, snooker table", }, {"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"}, {"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"}, {"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"}, {"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"}, {"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"}, {"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"}, {"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"}, {"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"}, { "color": [0, 255, 133], "id": 65, "isthing": 1, "name": "toilet, can, commode, crapper, pot, potty, stool, throne", }, {"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"}, {"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"}, {"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"}, {"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"}, {"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"}, {"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"}, {"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"}, {"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"}, {"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"}, {"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"}, {"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"}, {"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"}, {"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"}, {"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"}, {"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"}, {"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"}, {"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"}, {"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"}, {"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"}, {"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"}, {"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"}, {"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"}, {"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"}, {"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"}, {"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"}, {"color": [0, 
10, 255], "id": 91, "isthing": 0, "name": "dirt track"}, {"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"}, {"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"}, {"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"}, { "color": [0, 122, 255], "id": 95, "isthing": 1, "name": "bannister, banister, balustrade, balusters, handrail", }, { "color": [0, 255, 163], "id": 96, "isthing": 0, "name": "escalator, moving staircase, moving stairway", }, { "color": [255, 153, 0], "id": 97, "isthing": 1, "name": "ottoman, pouf, pouffe, puff, hassock", }, {"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"}, {"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"}, { "color": [143, 255, 0], "id": 100, "isthing": 0, "name": "poster, posting, placard, notice, bill, card", }, {"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"}, {"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"}, {"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"}, {"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"}, { "color": [133, 0, 255], "id": 105, "isthing": 0, "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", }, {"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"}, { "color": [184, 0, 255], "id": 107, "isthing": 1, "name": "washer, automatic washer, washing machine", }, {"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"}, {"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"}, {"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"}, {"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"}, {"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"}, {"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"}, {"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"}, {"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"}, {"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"}, {"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"}, {"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"}, {"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"}, {"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"}, {"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"}, {"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"}, {"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"}, {"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"}, {"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"}, {"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"}, {"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"}, {"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"}, {"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"}, {"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"}, {"color": [20, 0, 255], "id": 131, "isthing": 0, "name": "blanket, cover"}, {"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"}, {"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"}, {"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"}, {"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"}, {"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"}, {"color": [41, 255, 0], "id": 
137, "isthing": 1, "name": "tray"}, {"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"}, {"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"}, {"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"}, {"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"}, {"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"}, {"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"}, {"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"}, {"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"}, {"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"}, {"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"}, {"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"}, {"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"}, ] ADE20k_COLORS = [k["color"] for k in ADE20K_150_CATEGORIES] MetadataCatalog.get("ade20k_sem_seg_train").set( stuff_colors=ADE20k_COLORS[:], ) MetadataCatalog.get("ade20k_sem_seg_val").set( stuff_colors=ADE20k_COLORS[:], ) def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): """ Args: image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". Returns: list[dict]: a list of dicts in Detectron2 standard format. (See `Using Custom Datasets `_ ) """ def _convert_category_id(segment_info, meta): if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = True else: segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = False return segment_info with PathManager.open(json_file) as f: json_info = json.load(f) ret = [] for ann in json_info["annotations"]: image_id = ann["image_id"] # TODO: currently we assume image and label has the same filename but # different extension, and images have extension ".jpg" for COCO. Need # to make image extension a user-provided argument if we extend this # function to support other COCO-like datasets. image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") label_file = os.path.join(gt_dir, ann["file_name"]) sem_label_file = os.path.join(semseg_dir, ann["file_name"]) segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] ret.append( { "file_name": image_file, "image_id": image_id, "pan_seg_file_name": label_file, "sem_seg_file_name": sem_label_file, "segments_info": segments_info, } ) assert len(ret), f"No images found in {image_dir}!" assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] return ret def register_ade20k_panoptic( name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None ): """ Register a "standard" version of ADE20k panoptic segmentation dataset named `name`. The dictionaries in this registered dataset follows detectron2's standard format. Hence it's called "standard". Args: name (str): the name that identifies a dataset, e.g. 
"ade20k_panoptic_train" metadata (dict): extra metadata associated with this dataset. image_root (str): directory which contains all the images panoptic_root (str): directory which contains panoptic annotation images in COCO format panoptic_json (str): path to the json panoptic annotation file in COCO format sem_seg_root (none): not used, to be consistent with `register_coco_panoptic_separated`. instances_json (str): path to the json instance annotation file """ panoptic_name = name DatasetCatalog.register( panoptic_name, lambda: load_ade20k_panoptic_json( panoptic_json, image_root, panoptic_root, semantic_root, metadata ), ) MetadataCatalog.get(panoptic_name).set( panoptic_root=panoptic_root, image_root=image_root, panoptic_json=panoptic_json, json_file=instances_json, evaluator_type="ade20k_panoptic_seg", ignore_label=255, label_divisor=1000, **metadata, ) _PREDEFINED_SPLITS_ADE20K_PANOPTIC = { "ade20k_panoptic_train": ( "ADEChallengeData2016/images/training", "ADEChallengeData2016/ade20k_panoptic_train", "ADEChallengeData2016/ade20k_panoptic_train.json", "ADEChallengeData2016/annotations_detectron2/training", "ADEChallengeData2016/ade20k_instance_train.json", ), "ade20k_panoptic_val": ( "ADEChallengeData2016/images/validation", "ADEChallengeData2016/ade20k_panoptic_val", "ADEChallengeData2016/ade20k_panoptic_val.json", "ADEChallengeData2016/annotations_detectron2/validation", "ADEChallengeData2016/ade20k_instance_val.json", ), } def get_metadata(): meta = {} # The following metadata maps contiguous id from [0, #thing categories + # #stuff categories) to their names and colors. We have to replica of the # same name and color under "thing_*" and "stuff_*" because the current # visualization function in D2 handles thing and class classes differently # due to some heuristic used in Panoptic FPN. We keep the same naming to # enable reusing existing visualization functions. thing_classes = [k["name"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] thing_colors = [k["color"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] stuff_classes = [k["name"] for k in ADE20K_150_CATEGORIES] stuff_colors = [k["color"] for k in ADE20K_150_CATEGORIES] meta["thing_classes"] = thing_classes meta["thing_colors"] = thing_colors meta["stuff_classes"] = stuff_classes meta["stuff_colors"] = stuff_colors # Convert category id for training: # category id: like semantic segmentation, it is the class id for each # pixel. Since there are some classes not used in evaluation, the category # id is not always contiguous and thus we have two set of category ids: # - original category id: category id in the original dataset, mainly # used for evaluation. # - contiguous category id: [0, #classes), in order to train the linear # softmax classifier. thing_dataset_id_to_contiguous_id = {} stuff_dataset_id_to_contiguous_id = {} for i, cat in enumerate(ADE20K_150_CATEGORIES): if cat["isthing"]: thing_dataset_id_to_contiguous_id[cat["id"]] = i # else: # stuff_dataset_id_to_contiguous_id[cat["id"]] = i # in order to use sem_seg evaluator stuff_dataset_id_to_contiguous_id[cat["id"]] = i meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id return meta def register_all_ade20k_panoptic(root): metadata = get_metadata() for ( prefix, (image_root, panoptic_root, panoptic_json, semantic_root, instance_json), ) in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items(): # The "standard" version of COCO panoptic segmentation dataset, # e.g. 
used by Panoptic-DeepLab register_ade20k_panoptic( prefix, metadata, os.path.join(root, image_root), os.path.join(root, panoptic_root), os.path.join(root, semantic_root), os.path.join(root, panoptic_json), os.path.join(root, instance_json), ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_ade20k_panoptic(_root) ================================================ FILE: mask2former/data/datasets/register_coco_panoptic_annos_semseg.py ================================================ import json import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data.datasets import load_sem_seg from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES from detectron2.utils.file_io import PathManager _PREDEFINED_SPLITS_COCO_PANOPTIC = { "coco_2017_train_panoptic": ( # This is the original panoptic annotation directory "coco/panoptic_train2017", "coco/annotations/panoptic_train2017.json", # This directory contains semantic annotations that are # converted from panoptic annotations. # It is used by PanopticFPN. # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py # to create these directories. "coco/panoptic_semseg_train2017", ), "coco_2017_val_panoptic": ( "coco/panoptic_val2017", "coco/annotations/panoptic_val2017.json", "coco/panoptic_semseg_val2017", ), } def get_metadata(): meta = {} # The following metadata maps contiguous id from [0, #thing categories + # #stuff categories) to their names and colors. We have to replica of the # same name and color under "thing_*" and "stuff_*" because the current # visualization function in D2 handles thing and class classes differently # due to some heuristic used in Panoptic FPN. We keep the same naming to # enable reusing existing visualization functions. thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] stuff_classes = [k["name"] for k in COCO_CATEGORIES] stuff_colors = [k["color"] for k in COCO_CATEGORIES] meta["thing_classes"] = thing_classes meta["thing_colors"] = thing_colors meta["stuff_classes"] = stuff_classes meta["stuff_colors"] = stuff_colors # Convert category id for training: # category id: like semantic segmentation, it is the class id for each # pixel. Since there are some classes not used in evaluation, the category # id is not always contiguous and thus we have two set of category ids: # - original category id: category id in the original dataset, mainly # used for evaluation. # - contiguous category id: [0, #classes), in order to train the linear # softmax classifier. thing_dataset_id_to_contiguous_id = {} stuff_dataset_id_to_contiguous_id = {} for i, cat in enumerate(COCO_CATEGORIES): if cat["isthing"]: thing_dataset_id_to_contiguous_id[cat["id"]] = i # else: # stuff_dataset_id_to_contiguous_id[cat["id"]] = i # in order to use sem_seg evaluator stuff_dataset_id_to_contiguous_id[cat["id"]] = i meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id return meta def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): """ Args: image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". Returns: list[dict]: a list of dicts in Detectron2 standard format. 
(See `Using Custom Datasets `_ ) """ def _convert_category_id(segment_info, meta): if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = True else: segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = False return segment_info with PathManager.open(json_file) as f: json_info = json.load(f) ret = [] for ann in json_info["annotations"]: image_id = int(ann["image_id"]) # TODO: currently we assume image and label has the same filename but # different extension, and images have extension ".jpg" for COCO. Need # to make image extension a user-provided argument if we extend this # function to support other COCO-like datasets. image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") label_file = os.path.join(gt_dir, ann["file_name"]) sem_label_file = os.path.join(semseg_dir, ann["file_name"]) segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] ret.append( { "file_name": image_file, "image_id": image_id, "pan_seg_file_name": label_file, "sem_seg_file_name": sem_label_file, "segments_info": segments_info, } ) assert len(ret), f"No images found in {image_dir}!" assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] return ret def register_coco_panoptic_annos_sem_seg( name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json ): panoptic_name = name delattr(MetadataCatalog.get(panoptic_name), "thing_classes") delattr(MetadataCatalog.get(panoptic_name), "thing_colors") MetadataCatalog.get(panoptic_name).set( thing_classes=metadata["thing_classes"], thing_colors=metadata["thing_colors"], # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"], ) # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg" semantic_name = name + "_with_sem_seg" DatasetCatalog.register( semantic_name, lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata), ) MetadataCatalog.get(semantic_name).set( sem_seg_root=sem_seg_root, panoptic_root=panoptic_root, image_root=image_root, panoptic_json=panoptic_json, json_file=instances_json, evaluator_type="coco_panoptic_seg", ignore_label=255, label_divisor=1000, **metadata, ) def register_all_coco_panoptic_annos_sem_seg(root): for ( prefix, (panoptic_root, panoptic_json, semantic_root), ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): prefix_instances = prefix[: -len("_panoptic")] instances_meta = MetadataCatalog.get(prefix_instances) image_root, instances_json = instances_meta.image_root, instances_meta.json_file register_coco_panoptic_annos_sem_seg( prefix, get_metadata(), image_root, os.path.join(root, panoptic_root), os.path.join(root, panoptic_json), os.path.join(root, semantic_root), instances_json, ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_coco_panoptic_annos_sem_seg(_root) ================================================ FILE: mask2former/data/datasets/register_coco_stuff_10k.py ================================================ import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data.datasets import load_sem_seg COCO_CATEGORIES = [ 
{"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, {"color": [179, 0, 194], "isthing": 1, "id": 
57, "name": "carrot"}, {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, {"id": 92, "name": "banner", "supercategory": "textile"}, {"id": 93, "name": "blanket", "supercategory": "textile"}, {"id": 94, "name": "branch", "supercategory": "plant"}, {"id": 95, "name": "bridge", "supercategory": "building"}, {"id": 96, "name": "building-other", "supercategory": "building"}, {"id": 97, "name": "bush", "supercategory": "plant"}, {"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"}, {"id": 99, "name": "cage", "supercategory": "structural"}, {"id": 100, "name": "cardboard", "supercategory": "raw-material"}, {"id": 101, "name": "carpet", "supercategory": "floor"}, {"id": 102, "name": "ceiling-other", "supercategory": "ceiling"}, {"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"}, {"id": 104, "name": "cloth", "supercategory": "textile"}, {"id": 105, "name": "clothes", "supercategory": "textile"}, {"id": 106, "name": "clouds", "supercategory": "sky"}, {"id": 107, "name": "counter", "supercategory": "furniture-stuff"}, {"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"}, {"id": 109, "name": "curtain", "supercategory": "textile"}, {"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"}, {"id": 111, "name": "dirt", "supercategory": "ground"}, {"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"}, {"id": 113, "name": "fence", "supercategory": "structural"}, {"id": 114, "name": "floor-marble", "supercategory": "floor"}, {"id": 115, "name": "floor-other", "supercategory": "floor"}, {"id": 116, "name": "floor-stone", "supercategory": "floor"}, {"id": 117, "name": "floor-tile", 
"supercategory": "floor"}, {"id": 118, "name": "floor-wood", "supercategory": "floor"}, {"id": 119, "name": "flower", "supercategory": "plant"}, {"id": 120, "name": "fog", "supercategory": "water"}, {"id": 121, "name": "food-other", "supercategory": "food-stuff"}, {"id": 122, "name": "fruit", "supercategory": "food-stuff"}, {"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"}, {"id": 124, "name": "grass", "supercategory": "plant"}, {"id": 125, "name": "gravel", "supercategory": "ground"}, {"id": 126, "name": "ground-other", "supercategory": "ground"}, {"id": 127, "name": "hill", "supercategory": "solid"}, {"id": 128, "name": "house", "supercategory": "building"}, {"id": 129, "name": "leaves", "supercategory": "plant"}, {"id": 130, "name": "light", "supercategory": "furniture-stuff"}, {"id": 131, "name": "mat", "supercategory": "textile"}, {"id": 132, "name": "metal", "supercategory": "raw-material"}, {"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"}, {"id": 134, "name": "moss", "supercategory": "plant"}, {"id": 135, "name": "mountain", "supercategory": "solid"}, {"id": 136, "name": "mud", "supercategory": "ground"}, {"id": 137, "name": "napkin", "supercategory": "textile"}, {"id": 138, "name": "net", "supercategory": "structural"}, {"id": 139, "name": "paper", "supercategory": "raw-material"}, {"id": 140, "name": "pavement", "supercategory": "ground"}, {"id": 141, "name": "pillow", "supercategory": "textile"}, {"id": 142, "name": "plant-other", "supercategory": "plant"}, {"id": 143, "name": "plastic", "supercategory": "raw-material"}, {"id": 144, "name": "platform", "supercategory": "ground"}, {"id": 145, "name": "playingfield", "supercategory": "ground"}, {"id": 146, "name": "railing", "supercategory": "structural"}, {"id": 147, "name": "railroad", "supercategory": "ground"}, {"id": 148, "name": "river", "supercategory": "water"}, {"id": 149, "name": "road", "supercategory": "ground"}, {"id": 150, "name": "rock", "supercategory": "solid"}, {"id": 151, "name": "roof", "supercategory": "building"}, {"id": 152, "name": "rug", "supercategory": "textile"}, {"id": 153, "name": "salad", "supercategory": "food-stuff"}, {"id": 154, "name": "sand", "supercategory": "ground"}, {"id": 155, "name": "sea", "supercategory": "water"}, {"id": 156, "name": "shelf", "supercategory": "furniture-stuff"}, {"id": 157, "name": "sky-other", "supercategory": "sky"}, {"id": 158, "name": "skyscraper", "supercategory": "building"}, {"id": 159, "name": "snow", "supercategory": "ground"}, {"id": 160, "name": "solid-other", "supercategory": "solid"}, {"id": 161, "name": "stairs", "supercategory": "furniture-stuff"}, {"id": 162, "name": "stone", "supercategory": "solid"}, {"id": 163, "name": "straw", "supercategory": "plant"}, {"id": 164, "name": "structural-other", "supercategory": "structural"}, {"id": 165, "name": "table", "supercategory": "furniture-stuff"}, {"id": 166, "name": "tent", "supercategory": "building"}, {"id": 167, "name": "textile-other", "supercategory": "textile"}, {"id": 168, "name": "towel", "supercategory": "textile"}, {"id": 169, "name": "tree", "supercategory": "plant"}, {"id": 170, "name": "vegetable", "supercategory": "food-stuff"}, {"id": 171, "name": "wall-brick", "supercategory": "wall"}, {"id": 172, "name": "wall-concrete", "supercategory": "wall"}, {"id": 173, "name": "wall-other", "supercategory": "wall"}, {"id": 174, "name": "wall-panel", "supercategory": "wall"}, {"id": 175, "name": "wall-stone", "supercategory": "wall"}, {"id": 176, "name": 
"wall-tile", "supercategory": "wall"}, {"id": 177, "name": "wall-wood", "supercategory": "wall"}, {"id": 178, "name": "water-other", "supercategory": "water"}, {"id": 179, "name": "waterdrops", "supercategory": "water"}, {"id": 180, "name": "window-blind", "supercategory": "window"}, {"id": 181, "name": "window-other", "supercategory": "window"}, {"id": 182, "name": "wood", "supercategory": "solid"}, ] def _get_coco_stuff_meta(): # Id 0 is reserved for ignore_label, we change ignore_label for 0 # to 255 in our pre-processing. stuff_ids = [k["id"] for k in COCO_CATEGORIES] assert len(stuff_ids) == 171, len(stuff_ids) # For semantic segmentation, this mapping maps from contiguous stuff id # (in [0, 91], used in models) to ids in the dataset (used for processing results) stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} stuff_classes = [k["name"] for k in COCO_CATEGORIES] ret = { "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, "stuff_classes": stuff_classes, } return ret def register_all_coco_stuff_10k(root): root = os.path.join(root, "coco", "coco_stuff_10k") meta = _get_coco_stuff_meta() for name, image_dirname, sem_seg_dirname in [ ("train", "images_detectron2/train", "annotations_detectron2/train"), ("test", "images_detectron2/test", "annotations_detectron2/test"), ]: image_dir = os.path.join(root, image_dirname) gt_dir = os.path.join(root, sem_seg_dirname) name = f"coco_2017_{name}_stuff_10k_sem_seg" DatasetCatalog.register( name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") ) MetadataCatalog.get(name).set( image_root=image_dir, sem_seg_root=gt_dir, evaluator_type="sem_seg", ignore_label=255, **meta, ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_coco_stuff_10k(_root) ================================================ FILE: mask2former/data/datasets/register_mapillary_vistas.py ================================================ import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data.datasets import load_sem_seg MAPILLARY_VISTAS_SEM_SEG_CATEGORIES = [ { "color": [165, 42, 42], "instances": True, "readable": "Bird", "name": "animal--bird", "evaluate": True, }, { "color": [0, 192, 0], "instances": True, "readable": "Ground Animal", "name": "animal--ground-animal", "evaluate": True, }, { "color": [196, 196, 196], "instances": False, "readable": "Curb", "name": "construction--barrier--curb", "evaluate": True, }, { "color": [190, 153, 153], "instances": False, "readable": "Fence", "name": "construction--barrier--fence", "evaluate": True, }, { "color": [180, 165, 180], "instances": False, "readable": "Guard Rail", "name": "construction--barrier--guard-rail", "evaluate": True, }, { "color": [90, 120, 150], "instances": False, "readable": "Barrier", "name": "construction--barrier--other-barrier", "evaluate": True, }, { "color": [102, 102, 156], "instances": False, "readable": "Wall", "name": "construction--barrier--wall", "evaluate": True, }, { "color": [128, 64, 255], "instances": False, "readable": "Bike Lane", "name": "construction--flat--bike-lane", "evaluate": True, }, { "color": [140, 140, 200], "instances": True, "readable": "Crosswalk - Plain", "name": "construction--flat--crosswalk-plain", "evaluate": True, }, { "color": [170, 170, 170], "instances": False, "readable": "Curb Cut", "name": "construction--flat--curb-cut", "evaluate": True, }, { "color": [250, 170, 160], "instances": False, "readable": "Parking", "name": "construction--flat--parking", 
"evaluate": True, }, { "color": [96, 96, 96], "instances": False, "readable": "Pedestrian Area", "name": "construction--flat--pedestrian-area", "evaluate": True, }, { "color": [230, 150, 140], "instances": False, "readable": "Rail Track", "name": "construction--flat--rail-track", "evaluate": True, }, { "color": [128, 64, 128], "instances": False, "readable": "Road", "name": "construction--flat--road", "evaluate": True, }, { "color": [110, 110, 110], "instances": False, "readable": "Service Lane", "name": "construction--flat--service-lane", "evaluate": True, }, { "color": [244, 35, 232], "instances": False, "readable": "Sidewalk", "name": "construction--flat--sidewalk", "evaluate": True, }, { "color": [150, 100, 100], "instances": False, "readable": "Bridge", "name": "construction--structure--bridge", "evaluate": True, }, { "color": [70, 70, 70], "instances": False, "readable": "Building", "name": "construction--structure--building", "evaluate": True, }, { "color": [150, 120, 90], "instances": False, "readable": "Tunnel", "name": "construction--structure--tunnel", "evaluate": True, }, { "color": [220, 20, 60], "instances": True, "readable": "Person", "name": "human--person", "evaluate": True, }, { "color": [255, 0, 0], "instances": True, "readable": "Bicyclist", "name": "human--rider--bicyclist", "evaluate": True, }, { "color": [255, 0, 100], "instances": True, "readable": "Motorcyclist", "name": "human--rider--motorcyclist", "evaluate": True, }, { "color": [255, 0, 200], "instances": True, "readable": "Other Rider", "name": "human--rider--other-rider", "evaluate": True, }, { "color": [200, 128, 128], "instances": True, "readable": "Lane Marking - Crosswalk", "name": "marking--crosswalk-zebra", "evaluate": True, }, { "color": [255, 255, 255], "instances": False, "readable": "Lane Marking - General", "name": "marking--general", "evaluate": True, }, { "color": [64, 170, 64], "instances": False, "readable": "Mountain", "name": "nature--mountain", "evaluate": True, }, { "color": [230, 160, 50], "instances": False, "readable": "Sand", "name": "nature--sand", "evaluate": True, }, { "color": [70, 130, 180], "instances": False, "readable": "Sky", "name": "nature--sky", "evaluate": True, }, { "color": [190, 255, 255], "instances": False, "readable": "Snow", "name": "nature--snow", "evaluate": True, }, { "color": [152, 251, 152], "instances": False, "readable": "Terrain", "name": "nature--terrain", "evaluate": True, }, { "color": [107, 142, 35], "instances": False, "readable": "Vegetation", "name": "nature--vegetation", "evaluate": True, }, { "color": [0, 170, 30], "instances": False, "readable": "Water", "name": "nature--water", "evaluate": True, }, { "color": [255, 255, 128], "instances": True, "readable": "Banner", "name": "object--banner", "evaluate": True, }, { "color": [250, 0, 30], "instances": True, "readable": "Bench", "name": "object--bench", "evaluate": True, }, { "color": [100, 140, 180], "instances": True, "readable": "Bike Rack", "name": "object--bike-rack", "evaluate": True, }, { "color": [220, 220, 220], "instances": True, "readable": "Billboard", "name": "object--billboard", "evaluate": True, }, { "color": [220, 128, 128], "instances": True, "readable": "Catch Basin", "name": "object--catch-basin", "evaluate": True, }, { "color": [222, 40, 40], "instances": True, "readable": "CCTV Camera", "name": "object--cctv-camera", "evaluate": True, }, { "color": [100, 170, 30], "instances": True, "readable": "Fire Hydrant", "name": "object--fire-hydrant", "evaluate": True, }, { "color": [40, 
40, 40], "instances": True, "readable": "Junction Box", "name": "object--junction-box", "evaluate": True, }, { "color": [33, 33, 33], "instances": True, "readable": "Mailbox", "name": "object--mailbox", "evaluate": True, }, { "color": [100, 128, 160], "instances": True, "readable": "Manhole", "name": "object--manhole", "evaluate": True, }, { "color": [142, 0, 0], "instances": True, "readable": "Phone Booth", "name": "object--phone-booth", "evaluate": True, }, { "color": [70, 100, 150], "instances": False, "readable": "Pothole", "name": "object--pothole", "evaluate": True, }, { "color": [210, 170, 100], "instances": True, "readable": "Street Light", "name": "object--street-light", "evaluate": True, }, { "color": [153, 153, 153], "instances": True, "readable": "Pole", "name": "object--support--pole", "evaluate": True, }, { "color": [128, 128, 128], "instances": True, "readable": "Traffic Sign Frame", "name": "object--support--traffic-sign-frame", "evaluate": True, }, { "color": [0, 0, 80], "instances": True, "readable": "Utility Pole", "name": "object--support--utility-pole", "evaluate": True, }, { "color": [250, 170, 30], "instances": True, "readable": "Traffic Light", "name": "object--traffic-light", "evaluate": True, }, { "color": [192, 192, 192], "instances": True, "readable": "Traffic Sign (Back)", "name": "object--traffic-sign--back", "evaluate": True, }, { "color": [220, 220, 0], "instances": True, "readable": "Traffic Sign (Front)", "name": "object--traffic-sign--front", "evaluate": True, }, { "color": [140, 140, 20], "instances": True, "readable": "Trash Can", "name": "object--trash-can", "evaluate": True, }, { "color": [119, 11, 32], "instances": True, "readable": "Bicycle", "name": "object--vehicle--bicycle", "evaluate": True, }, { "color": [150, 0, 255], "instances": True, "readable": "Boat", "name": "object--vehicle--boat", "evaluate": True, }, { "color": [0, 60, 100], "instances": True, "readable": "Bus", "name": "object--vehicle--bus", "evaluate": True, }, { "color": [0, 0, 142], "instances": True, "readable": "Car", "name": "object--vehicle--car", "evaluate": True, }, { "color": [0, 0, 90], "instances": True, "readable": "Caravan", "name": "object--vehicle--caravan", "evaluate": True, }, { "color": [0, 0, 230], "instances": True, "readable": "Motorcycle", "name": "object--vehicle--motorcycle", "evaluate": True, }, { "color": [0, 80, 100], "instances": False, "readable": "On Rails", "name": "object--vehicle--on-rails", "evaluate": True, }, { "color": [128, 64, 64], "instances": True, "readable": "Other Vehicle", "name": "object--vehicle--other-vehicle", "evaluate": True, }, { "color": [0, 0, 110], "instances": True, "readable": "Trailer", "name": "object--vehicle--trailer", "evaluate": True, }, { "color": [0, 0, 70], "instances": True, "readable": "Truck", "name": "object--vehicle--truck", "evaluate": True, }, { "color": [0, 0, 192], "instances": True, "readable": "Wheeled Slow", "name": "object--vehicle--wheeled-slow", "evaluate": True, }, { "color": [32, 32, 32], "instances": False, "readable": "Car Mount", "name": "void--car-mount", "evaluate": True, }, { "color": [120, 10, 10], "instances": False, "readable": "Ego Vehicle", "name": "void--ego-vehicle", "evaluate": True, }, { "color": [0, 0, 0], "instances": False, "readable": "Unlabeled", "name": "void--unlabeled", "evaluate": False, }, ] def _get_mapillary_vistas_meta(): stuff_classes = [k["readable"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES if k["evaluate"]] assert len(stuff_classes) == 65 stuff_colors = [k["color"] 
for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES if k["evaluate"]] assert len(stuff_colors) == 65 ret = { "stuff_classes": stuff_classes, "stuff_colors": stuff_colors, } return ret def register_all_mapillary_vistas(root): root = os.path.join(root, "mapillary_vistas") meta = _get_mapillary_vistas_meta() for name, dirname in [("train", "training"), ("val", "validation")]: image_dir = os.path.join(root, dirname, "images") gt_dir = os.path.join(root, dirname, "labels") name = f"mapillary_vistas_sem_seg_{name}" DatasetCatalog.register( name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") ) MetadataCatalog.get(name).set( image_root=image_dir, sem_seg_root=gt_dir, evaluator_type="sem_seg", ignore_label=65, # different from other datasets, Mapillary Vistas sets ignore_label to 65 **meta, ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_mapillary_vistas(_root) ================================================ FILE: mask2former/data/datasets/register_mapillary_vistas_panoptic.py ================================================ import json import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.utils.file_io import PathManager MAPILLARY_VISTAS_SEM_SEG_CATEGORIES = [ {'color': [165, 42, 42], 'id': 1, 'isthing': 1, 'name': 'Bird', 'supercategory': 'animal--bird'}, {'color': [0, 192, 0], 'id': 2, 'isthing': 1, 'name': 'Ground Animal', 'supercategory': 'animal--ground-animal'}, {'color': [196, 196, 196], 'id': 3, 'isthing': 0, 'name': 'Curb', 'supercategory': 'construction--barrier--curb'}, {'color': [190, 153, 153], 'id': 4, 'isthing': 0, 'name': 'Fence', 'supercategory': 'construction--barrier--fence'}, {'color': [180, 165, 180], 'id': 5, 'isthing': 0, 'name': 'Guard Rail', 'supercategory': 'construction--barrier--guard-rail'}, {'color': [90, 120, 150], 'id': 6, 'isthing': 0, 'name': 'Barrier', 'supercategory': 'construction--barrier--other-barrier'}, {'color': [102, 102, 156], 'id': 7, 'isthing': 0, 'name': 'Wall', 'supercategory': 'construction--barrier--wall'}, {'color': [128, 64, 255], 'id': 8, 'isthing': 0, 'name': 'Bike Lane', 'supercategory': 'construction--flat--bike-lane'}, {'color': [140, 140, 200], 'id': 9, 'isthing': 1, 'name': 'Crosswalk - Plain', 'supercategory': 'construction--flat--crosswalk-plain'}, {'color': [170, 170, 170], 'id': 10, 'isthing': 0, 'name': 'Curb Cut', 'supercategory': 'construction--flat--curb-cut'}, {'color': [250, 170, 160], 'id': 11, 'isthing': 0, 'name': 'Parking', 'supercategory': 'construction--flat--parking'}, {'color': [96, 96, 96], 'id': 12, 'isthing': 0, 'name': 'Pedestrian Area', 'supercategory': 'construction--flat--pedestrian-area'}, {'color': [230, 150, 140], 'id': 13, 'isthing': 0, 'name': 'Rail Track', 'supercategory': 'construction--flat--rail-track'}, {'color': [128, 64, 128], 'id': 14, 'isthing': 0, 'name': 'Road', 'supercategory': 'construction--flat--road'}, {'color': [110, 110, 110], 'id': 15, 'isthing': 0, 'name': 'Service Lane', 'supercategory': 'construction--flat--service-lane'}, {'color': [244, 35, 232], 'id': 16, 'isthing': 0, 'name': 'Sidewalk', 'supercategory': 'construction--flat--sidewalk'}, {'color': [150, 100, 100], 'id': 17, 'isthing': 0, 'name': 'Bridge', 'supercategory': 'construction--structure--bridge'}, {'color': [70, 70, 70], 'id': 18, 'isthing': 0, 'name': 'Building', 'supercategory': 'construction--structure--building'}, {'color': [150, 120, 90], 'id': 19, 'isthing': 0, 'name': 'Tunnel', 'supercategory': 'construction--structure--tunnel'}, {'color': 
[220, 20, 60], 'id': 20, 'isthing': 1, 'name': 'Person', 'supercategory': 'human--person'}, {'color': [255, 0, 0], 'id': 21, 'isthing': 1, 'name': 'Bicyclist', 'supercategory': 'human--rider--bicyclist'}, {'color': [255, 0, 100], 'id': 22, 'isthing': 1, 'name': 'Motorcyclist', 'supercategory': 'human--rider--motorcyclist'}, {'color': [255, 0, 200], 'id': 23, 'isthing': 1, 'name': 'Other Rider', 'supercategory': 'human--rider--other-rider'}, {'color': [200, 128, 128], 'id': 24, 'isthing': 1, 'name': 'Lane Marking - Crosswalk', 'supercategory': 'marking--crosswalk-zebra'}, {'color': [255, 255, 255], 'id': 25, 'isthing': 0, 'name': 'Lane Marking - General', 'supercategory': 'marking--general'}, {'color': [64, 170, 64], 'id': 26, 'isthing': 0, 'name': 'Mountain', 'supercategory': 'nature--mountain'}, {'color': [230, 160, 50], 'id': 27, 'isthing': 0, 'name': 'Sand', 'supercategory': 'nature--sand'}, {'color': [70, 130, 180], 'id': 28, 'isthing': 0, 'name': 'Sky', 'supercategory': 'nature--sky'}, {'color': [190, 255, 255], 'id': 29, 'isthing': 0, 'name': 'Snow', 'supercategory': 'nature--snow'}, {'color': [152, 251, 152], 'id': 30, 'isthing': 0, 'name': 'Terrain', 'supercategory': 'nature--terrain'}, {'color': [107, 142, 35], 'id': 31, 'isthing': 0, 'name': 'Vegetation', 'supercategory': 'nature--vegetation'}, {'color': [0, 170, 30], 'id': 32, 'isthing': 0, 'name': 'Water', 'supercategory': 'nature--water'}, {'color': [255, 255, 128], 'id': 33, 'isthing': 1, 'name': 'Banner', 'supercategory': 'object--banner'}, {'color': [250, 0, 30], 'id': 34, 'isthing': 1, 'name': 'Bench', 'supercategory': 'object--bench'}, {'color': [100, 140, 180], 'id': 35, 'isthing': 1, 'name': 'Bike Rack', 'supercategory': 'object--bike-rack'}, {'color': [220, 220, 220], 'id': 36, 'isthing': 1, 'name': 'Billboard', 'supercategory': 'object--billboard'}, {'color': [220, 128, 128], 'id': 37, 'isthing': 1, 'name': 'Catch Basin', 'supercategory': 'object--catch-basin'}, {'color': [222, 40, 40], 'id': 38, 'isthing': 1, 'name': 'CCTV Camera', 'supercategory': 'object--cctv-camera'}, {'color': [100, 170, 30], 'id': 39, 'isthing': 1, 'name': 'Fire Hydrant', 'supercategory': 'object--fire-hydrant'}, {'color': [40, 40, 40], 'id': 40, 'isthing': 1, 'name': 'Junction Box', 'supercategory': 'object--junction-box'}, {'color': [33, 33, 33], 'id': 41, 'isthing': 1, 'name': 'Mailbox', 'supercategory': 'object--mailbox'}, {'color': [100, 128, 160], 'id': 42, 'isthing': 1, 'name': 'Manhole', 'supercategory': 'object--manhole'}, {'color': [142, 0, 0], 'id': 43, 'isthing': 1, 'name': 'Phone Booth', 'supercategory': 'object--phone-booth'}, {'color': [70, 100, 150], 'id': 44, 'isthing': 0, 'name': 'Pothole', 'supercategory': 'object--pothole'}, {'color': [210, 170, 100], 'id': 45, 'isthing': 1, 'name': 'Street Light', 'supercategory': 'object--street-light'}, {'color': [153, 153, 153], 'id': 46, 'isthing': 1, 'name': 'Pole', 'supercategory': 'object--support--pole'}, {'color': [128, 128, 128], 'id': 47, 'isthing': 1, 'name': 'Traffic Sign Frame', 'supercategory': 'object--support--traffic-sign-frame'}, {'color': [0, 0, 80], 'id': 48, 'isthing': 1, 'name': 'Utility Pole', 'supercategory': 'object--support--utility-pole'}, {'color': [250, 170, 30], 'id': 49, 'isthing': 1, 'name': 'Traffic Light', 'supercategory': 'object--traffic-light'}, {'color': [192, 192, 192], 'id': 50, 'isthing': 1, 'name': 'Traffic Sign (Back)', 'supercategory': 'object--traffic-sign--back'}, {'color': [220, 220, 0], 'id': 51, 'isthing': 1, 'name': 'Traffic Sign (Front)', 
'supercategory': 'object--traffic-sign--front'}, {'color': [140, 140, 20], 'id': 52, 'isthing': 1, 'name': 'Trash Can', 'supercategory': 'object--trash-can'}, {'color': [119, 11, 32], 'id': 53, 'isthing': 1, 'name': 'Bicycle', 'supercategory': 'object--vehicle--bicycle'}, {'color': [150, 0, 255], 'id': 54, 'isthing': 1, 'name': 'Boat', 'supercategory': 'object--vehicle--boat'}, {'color': [0, 60, 100], 'id': 55, 'isthing': 1, 'name': 'Bus', 'supercategory': 'object--vehicle--bus'}, {'color': [0, 0, 142], 'id': 56, 'isthing': 1, 'name': 'Car', 'supercategory': 'object--vehicle--car'}, {'color': [0, 0, 90], 'id': 57, 'isthing': 1, 'name': 'Caravan', 'supercategory': 'object--vehicle--caravan'}, {'color': [0, 0, 230], 'id': 58, 'isthing': 1, 'name': 'Motorcycle', 'supercategory': 'object--vehicle--motorcycle'}, {'color': [0, 80, 100], 'id': 59, 'isthing': 0, 'name': 'On Rails', 'supercategory': 'object--vehicle--on-rails'}, {'color': [128, 64, 64], 'id': 60, 'isthing': 1, 'name': 'Other Vehicle', 'supercategory': 'object--vehicle--other-vehicle'}, {'color': [0, 0, 110], 'id': 61, 'isthing': 1, 'name': 'Trailer', 'supercategory': 'object--vehicle--trailer'}, {'color': [0, 0, 70], 'id': 62, 'isthing': 1, 'name': 'Truck', 'supercategory': 'object--vehicle--truck'}, {'color': [0, 0, 192], 'id': 63, 'isthing': 1, 'name': 'Wheeled Slow', 'supercategory': 'object--vehicle--wheeled-slow'}, {'color': [32, 32, 32], 'id': 64, 'isthing': 0, 'name': 'Car Mount', 'supercategory': 'void--car-mount'}, {'color': [120, 10, 10], 'id': 65, 'isthing': 0, 'name': 'Ego Vehicle', 'supercategory': 'void--ego-vehicle'} ] def load_mapillary_vistas_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): """ Args: image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". Returns: list[dict]: a list of dicts in Detectron2 standard format. (See `Using Custom Datasets `_ ) """ def _convert_category_id(segment_info, meta): if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = True else: segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = False return segment_info with PathManager.open(json_file) as f: json_info = json.load(f) ret = [] for ann in json_info["annotations"]: image_id = ann["image_id"] # TODO: currently we assume image and label has the same filename but # different extension, and images have extension ".jpg" for COCO. Need # to make image extension a user-provided argument if we extend this # function to support other COCO-like datasets. image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") label_file = os.path.join(gt_dir, ann["file_name"]) sem_label_file = os.path.join(semseg_dir, ann["file_name"]) segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] ret.append( { "file_name": image_file, "image_id": image_id, "pan_seg_file_name": label_file, "sem_seg_file_name": sem_label_file, "segments_info": segments_info, } ) assert len(ret), f"No images found in {image_dir}!" 
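# A sketch of one record in `ret`, for orientation only (paths and values are
# placeholders, not taken from a real annotation file):
# {
#     "file_name": "<image_dir>/<image>.jpg",
#     "image_id": <id from the panoptic json>,
#     "pan_seg_file_name": "<gt_dir>/<image>.png",
#     "sem_seg_file_name": "<semseg_dir>/<image>.png",
#     "segments_info": [{"id": ..., "category_id": <contiguous id>, "isthing": <bool>}, ...],
# }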
assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] return ret def register_mapillary_vistas_panoptic( name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None ): """ Register a "standard" version of the Mapillary Vistas panoptic segmentation dataset named `name`. The dictionaries in this registered dataset follow detectron2's standard format. Hence it's called "standard". Args: name (str): the name that identifies a dataset, e.g. "mapillary_vistas_panoptic_train" metadata (dict): extra metadata associated with this dataset. image_root (str): directory which contains all the images panoptic_root (str): directory which contains panoptic annotation images in COCO format panoptic_json (str): path to the json panoptic annotation file in COCO format sem_seg_root (none): not used, to be consistent with `register_coco_panoptic_separated`. instances_json (str): path to the json instance annotation file """ panoptic_name = name DatasetCatalog.register( panoptic_name, lambda: load_mapillary_vistas_panoptic_json( panoptic_json, image_root, panoptic_root, semantic_root, metadata ), ) MetadataCatalog.get(panoptic_name).set( panoptic_root=panoptic_root, image_root=image_root, panoptic_json=panoptic_json, json_file=instances_json, evaluator_type="mapillary_vistas_panoptic_seg", ignore_label=65, # different from other datasets, Mapillary Vistas sets ignore_label to 65 label_divisor=1000, **metadata, ) _PREDEFINED_SPLITS_ADE20K_PANOPTIC = { "mapillary_vistas_panoptic_train": ( "mapillary_vistas/training/images", "mapillary_vistas/training/panoptic", "mapillary_vistas/training/panoptic/panoptic_2018.json", "mapillary_vistas/training/labels", ), "mapillary_vistas_panoptic_val": ( "mapillary_vistas/validation/images", "mapillary_vistas/validation/panoptic", "mapillary_vistas/validation/panoptic/panoptic_2018.json", "mapillary_vistas/validation/labels", ), } def get_metadata(): meta = {} # The following metadata maps contiguous id from [0, #thing categories + # #stuff categories) to their names and colors. We have to replicate the # same name and color under "thing_*" and "stuff_*" because the current # visualization function in D2 handles thing and stuff classes differently # due to some heuristic used in Panoptic FPN. We keep the same naming to # enable reusing existing visualization functions. thing_classes = [k["name"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] thing_colors = [k["color"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] stuff_classes = [k["name"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] stuff_colors = [k["color"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] meta["thing_classes"] = thing_classes meta["thing_colors"] = thing_colors meta["stuff_classes"] = stuff_classes meta["stuff_colors"] = stuff_colors # Convert category id for training: # category id: like semantic segmentation, it is the class id for each # pixel. Since there are some classes not used in evaluation, the category # id is not always contiguous and thus we have two sets of category ids: # - original category id: category id in the original dataset, mainly # used for evaluation. # - contiguous category id: [0, #classes), in order to train the linear # softmax classifier.
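# A concrete illustration of the mapping built below, derived from the
# MAPILLARY_VISTAS_SEM_SEG_CATEGORIES list above (enumeration order gives the
# contiguous id): "Bird" (dataset id 1) maps to contiguous id 0 and "Person"
# (dataset id 20) maps to contiguous id 19. Every category id, thing or stuff,
# is also mirrored into stuff_dataset_id_to_contiguous_id so the sem_seg
# evaluator can be reused.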
thing_dataset_id_to_contiguous_id = {} stuff_dataset_id_to_contiguous_id = {} for i, cat in enumerate(MAPILLARY_VISTAS_SEM_SEG_CATEGORIES): if cat["isthing"]: thing_dataset_id_to_contiguous_id[cat["id"]] = i # else: # stuff_dataset_id_to_contiguous_id[cat["id"]] = i # in order to use sem_seg evaluator stuff_dataset_id_to_contiguous_id[cat["id"]] = i meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id return meta def register_all_mapillary_vistas_panoptic(root): metadata = get_metadata() for ( prefix, (image_root, panoptic_root, panoptic_json, semantic_root), ) in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items(): # The "standard" version of COCO panoptic segmentation dataset, # e.g. used by Panoptic-DeepLab register_mapillary_vistas_panoptic( prefix, metadata, os.path.join(root, image_root), os.path.join(root, panoptic_root), os.path.join(root, semantic_root), os.path.join(root, panoptic_json), ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_mapillary_vistas_panoptic(_root) ================================================ FILE: mask2former/evaluation/__init__.py ================================================ ================================================ FILE: mask2former/evaluation/__init__.py.new ================================================ ================================================ FILE: mask2former/evaluation/instance_evaluation.py ================================================ import contextlib import copy import io import itertools import json import logging import numpy as np import os import pickle from collections import OrderedDict import pycocotools.mask as mask_util import torch from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval from tabulate import tabulate import detectron2.utils.comm as comm from detectron2.config import CfgNode from detectron2.data import MetadataCatalog from detectron2.data.datasets.coco import convert_to_coco_json from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco from detectron2.evaluation.fast_eval_api import COCOeval_opt from detectron2.structures import Boxes, BoxMode, pairwise_iou from detectron2.utils.file_io import PathManager from detectron2.utils.logger import create_small_table # modified from COCOEvaluator for instance segmetnat class InstanceSegEvaluator(COCOEvaluator): """ Evaluate AR for object proposals, AP for instance detection/segmentation, AP for keypoint detection outputs using COCO's metrics. See http://cocodataset.org/#detection-eval and http://cocodataset.org/#keypoints-eval to understand its metrics. The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means the metric cannot be computed (e.g. due to no predictions made). In addition to COCO, this evaluator is able to support any bounding box detection, instance segmentation, or keypoint detection dataset. """ def _eval_predictions(self, predictions, img_ids=None): """ Evaluate predictions. Fill self._results with the metrics of the tasks. 
""" self._logger.info("Preparing results for COCO format ...") coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) tasks = self._tasks or self._tasks_from_predictions(coco_results) # unmap the category ids for COCO if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) # num_classes = len(all_contiguous_ids) # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} for result in coco_results: category_id = result["category_id"] # assert category_id < num_classes, ( # f"A prediction has class={category_id}, " # f"but the dataset only has {num_classes} classes and " # f"predicted class id should be in [0, {num_classes - 1}]." # ) assert category_id in reverse_id_mapping, ( f"A prediction has class={category_id}, " f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." ) result["category_id"] = reverse_id_mapping[category_id] if self._output_dir: file_path = os.path.join(self._output_dir, "coco_instances_results.json") self._logger.info("Saving results to {}".format(file_path)) with PathManager.open(file_path, "w") as f: f.write(json.dumps(coco_results)) f.flush() if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return self._logger.info( "Evaluating predictions with {} COCO API...".format( "unofficial" if self._use_fast_impl else "official" ) ) for task in sorted(tasks): assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" coco_eval = ( _evaluate_predictions_on_coco( self._coco_api, coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas, use_fast_impl=self._use_fast_impl, img_ids=img_ids, max_dets_per_image=self._max_dets_per_image, ) if len(coco_results) > 0 else None # cocoapi does not handle empty results very well ) res = self._derive_coco_results( coco_eval, task, class_names=self._metadata.get("thing_classes") ) self._results[task] = res ================================================ FILE: mask2former/maskformer_model.py ================================================ from typing import Tuple import torch from torch import nn from torch.nn import functional as F from detectron2.config import configurable from detectron2.data import MetadataCatalog from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head from detectron2.modeling.backbone import Backbone from detectron2.modeling.postprocessing import sem_seg_postprocess from detectron2.structures import Boxes, ImageList, Instances, BitMasks from detectron2.utils.memory import retry_if_cuda_oom from .modeling.criterion import SetCriterion from .modeling.matcher import HungarianMatcher from skimage import color import cv2 import numpy as np def unfold_wo_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) # remove the center pixels size = kernel_size ** 2 unfolded_x = torch.cat(( unfolded_x[:, :, :size // 2], unfolded_x[:, :, size // 2 + 1:] ), dim=2) return unfolded_x def get_images_color_similarity(images, kernel_size, dilation): assert images.dim() == 4 assert 
images.size(0) == 1 unfolded_images = unfold_wo_center( images, kernel_size=kernel_size, dilation=dilation ) diff = images[:, :, None] - unfolded_images similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5) return similarity @META_ARCH_REGISTRY.register() class MaskFormer(nn.Module): """ Main class for mask classification semantic segmentation architectures. """ @configurable def __init__( self, *, backbone: Backbone, sem_seg_head: nn.Module, criterion: nn.Module, num_queries: int, object_mask_threshold: float, overlap_threshold: float, metadata, size_divisibility: int, sem_seg_postprocess_before_inference: bool, pixel_mean: Tuple[float], pixel_std: Tuple[float], # inference semantic_on: bool, panoptic_on: bool, instance_on: bool, test_topk_per_image: int, ): """ Args: backbone: a backbone module, must follow detectron2's backbone interface sem_seg_head: a module that predicts semantic segmentation from backbone features criterion: a module that defines the loss num_queries: int, number of queries object_mask_threshold: float, threshold to filter query based on classification score for panoptic segmentation inference overlap_threshold: overlap threshold used in general inference for panoptic segmentation metadata: dataset meta, get `thing` and `stuff` category names for panoptic segmentation inference size_divisibility: Some backbones require the input height and width to be divisible by a specific integer. We can use this to override such requirement. sem_seg_postprocess_before_inference: whether to resize the prediction back to original input size before semantic segmentation inference or after. For high-resolution dataset like Mapillary, resizing predictions before inference will cause OOM error. pixel_mean, pixel_std: list or tuple with #channels element, representing the per-channel mean and std to be used to normalize the input image semantic_on: bool, whether to output semantic segmentation prediction instance_on: bool, whether to output instance segmentation prediction panoptic_on: bool, whether to output panoptic segmentation prediction test_topk_per_image: int, instance segmentation parameter, keep topk instances per image """ super().__init__() self.backbone = backbone self.sem_seg_head = sem_seg_head self.criterion = criterion self.num_queries = num_queries self.overlap_threshold = overlap_threshold self.object_mask_threshold = object_mask_threshold self.metadata = metadata if size_divisibility < 0: # use backbone size_divisibility if not set size_divisibility = self.backbone.size_divisibility self.size_divisibility = size_divisibility self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) # additional args self.semantic_on = semantic_on self.instance_on = instance_on self.panoptic_on = panoptic_on self.test_topk_per_image = test_topk_per_image if not self.semantic_on: assert self.sem_seg_postprocess_before_inference @classmethod def from_config(cls, cfg): backbone = build_backbone(cfg) sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape()) # Loss parameters: deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT # loss weights class_weight = cfg.MODEL.MASK_FORMER.CLASS_WEIGHT dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT # building criterion matcher = HungarianMatcher( 
cost_class=class_weight, cost_mask=mask_weight, cost_dice=dice_weight, num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS, ) weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, "loss_dice": dice_weight, "loss_bound": mask_weight} if deep_supervision: dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS aux_weight_dict = {} for i in range(dec_layers - 1): aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) losses = ["labels", "masks"] criterion = SetCriterion( sem_seg_head.num_classes, matcher=matcher, weight_dict=weight_dict, eos_coef=no_object_weight, losses=losses, num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS, oversample_ratio=cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO, importance_sample_ratio=cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO, ) return { "backbone": backbone, "sem_seg_head": sem_seg_head, "criterion": criterion, "num_queries": cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES, "object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD, "overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD, "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY, "sem_seg_postprocess_before_inference": ( cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE or cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON or cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON ), "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, # inference "semantic_on": cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON, "instance_on": cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON, "panoptic_on": cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON, "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE, } @property def device(self): return self.pixel_mean.device def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper`. Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * "image": Tensor, image in (C, H, W) format. * "instances": per-region ground truth * Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model (may be different from input resolution), used in inference. Returns: list[dict]: each dict has the results for one image. The dict contains the following keys: * "sem_seg": A Tensor that represents the per-pixel segmentation prediced by the head. The prediction has shape KxHxW that represents the logits of each class for each pixel. * "panoptic_seg": A tuple that represent panoptic output panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. segments_info (list[dict]): Describe each segment in `panoptic_seg`. Each dict contains keys "id", "category_id", "isthing". 
""" images = [x["image"].to(self.device) for x in batched_inputs] # if self.training: # downsampled_images = [F.avg_pool2d(img.float(), kernel_size=4, stride=4, padding=0)[[2, 1, 0]] for img in images] # images_lab = [torch.as_tensor(color.rgb2lab(ds_image.byte().permute(1, 2, 0).cpu().numpy()), device=ds_image.device, dtype=torch.float32).permute(2, 0, 1) for ds_image in downsampled_images] # images_lab_sim = [get_images_color_similarity(img_lab.unsqueeze(0), 3, 2) for img_lab in images_lab] # ori is 0.3, 0.5, 0.7 # # for i_m, im_sim in enumerate(images_lab_sim): # # heatmapshow = cv2.applyColorMap((im_sim[0, 0] * 255).cpu().numpy().astype(np.uint8), cv2.COLORMAP_JET) # # cv2.imwrite('./vis_debug3/'+str(batched_inputs[i_m]['image_id'])+"_heatmap_n_bina_new1.jpg", heatmapshow) # # cv2.imwrite('./vis_debug3/'+str(batched_inputs[i_m]['image_id'])+"_img.jpg", downsampled_images[i_m].byte().permute(1, 2, 0).cpu().numpy()) # # print('images_lab_sim shape:', [im_sim.shape1 for im_sim in images_lab_sim]) # print('mask in image_masks:', [m.shape for m in image_masks]) # print('mask in image_masks max:', [m.max() for m in image_masks]) # print('mask in image_masks min:', [m.min() for m in image_masks]) # print('mask in image_masks percent:', [m.sum() / (m.shape[0] * m.shape[1]) for m in image_masks]) if self.training: rs_images = ImageList.from_tensors(images, self.size_divisibility) image_masks = [~ x["padding_mask"].to(self.device) for x in batched_inputs] image_masks_back = [x["padding_mask"].to(self.device) for x in batched_inputs] # for ii, i_mask in enumerate(image_masks): # print('index:', ii, 'i_mask:', i_mask.shape) # print('index:', ii, 'i_mask:', i_mask.max()) # cv2.imwrite('vis_mask_check/'+str(batched_inputs[ii]['image_id'])+str(ii)+'_mask.jpg', i_mask.float().cpu().numpy() * 255) # print('mask in image_masks:', [m.shape for m in image_masks]) # print('mask in image_masks max:', [m.max() for m in image_masks]) # print('mask in image_masks min:', [m.min() for m in image_masks]) image_masks_bool = [((m.sum() / (m.shape[0] * m.shape[1])) > 0.25).float()*((m_b.sum() / (m.shape[0] * m.shape[1])) > 0.25).float() for m, m_b in zip(image_masks, image_masks_back)] #0.25, 0.64 #image_masks_bool = [((m.sum() / (m.shape[0] * m.shape[1])) > 1.0).float() for m in image_masks] #0.25, 0.64 # print('len image_masks_bool:', image_masks_bool) downsampled_images = F.avg_pool2d(rs_images.tensor.float(), kernel_size=4, stride=4, padding=0) #for img in images] # print('len downsampled_images:', len(downsampled_images)) images_lab = [torch.as_tensor(color.rgb2lab(ds_image[[2, 1, 0]].byte().permute(1, 2, 0).cpu().numpy()), device=ds_image.device, dtype=torch.float32).permute(2, 0, 1) for ds_image in downsampled_images] images_lab_sim = [get_images_color_similarity(img_lab.unsqueeze(0), 3, 2) * float(img_m_bool) for img_lab, img_m_bool in zip(images_lab, image_masks_bool)] # ori is 0.3, 0.5, 0.7 # for i_m, im_sim in enumerate(images_lab_sim): # heatmapshow = cv2.applyColorMap((im_sim[0, 0] * 255).cpu().numpy().astype(np.uint8), cv2.COLORMAP_JET) # cv2.imwrite('./vis_debug3/'+str(batched_inputs[i_m]['image_id'])+"_heatmap_n_bina_new1.jpg", heatmapshow) # cv2.imwrite('./vis_debug3/'+str(batched_inputs[i_m]['image_id'])+"_img.jpg", downsampled_images[i_m].byte().permute(1, 2, 0).cpu().numpy()) # print('images_lab_sim shape:', [im_sim.shape1 for im_sim in images_lab_sim]) # ori_images = ImageList.from_tensors(images, self.size_divisibility) # ori_images_tensor = ori_images.tensor[:, :, ::4, ::4] # print('ori 
images:', ori_images_tensor.shape) images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.size_divisibility) features = self.backbone(images.tensor) outputs = self.sem_seg_head(features) if self.training: # mask classification target if "instances" in batched_inputs[0]: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] targets = self.prepare_targets(gt_instances, images) else: targets = None # bipartite matching-based loss losses = self.criterion(outputs, targets, images_lab_sim) for k in list(losses.keys()): if k in self.criterion.weight_dict: losses[k] *= self.criterion.weight_dict[k] else: # remove this loss if not specified in `weight_dict` losses.pop(k) return losses else: mask_cls_results = outputs["pred_logits"] mask_pred_results = outputs["pred_masks"] # upsample masks mask_pred_results = F.interpolate( mask_pred_results, size=(images.tensor.shape[-2], images.tensor.shape[-1]), mode="bilinear", align_corners=False, ) del outputs processed_results = [] for mask_cls_result, mask_pred_result, input_per_image, image_size in zip( mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes ): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) processed_results.append({}) if self.sem_seg_postprocess_before_inference: mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)( mask_pred_result, image_size, height, width ) mask_cls_result = mask_cls_result.to(mask_pred_result) # semantic segmentation inference if self.semantic_on: r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result) if not self.sem_seg_postprocess_before_inference: r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width) processed_results[-1]["sem_seg"] = r # panoptic segmentation inference if self.panoptic_on: panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result) processed_results[-1]["panoptic_seg"] = panoptic_r # instance segmentation inference if self.instance_on: instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result) processed_results[-1]["instances"] = instance_r return processed_results def prepare_targets(self, targets, images): h_pad, w_pad = images.tensor.shape[-2:] new_targets = [] for targets_per_image in targets: # pad gt gt_masks = targets_per_image.gt_masks padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device) padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks new_targets.append( { "labels": targets_per_image.gt_classes, "masks": padded_masks, } ) return new_targets def semantic_inference(self, mask_cls, mask_pred): mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1] mask_pred = mask_pred.sigmoid() semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred) return semseg def panoptic_inference(self, mask_cls, mask_pred): scores, labels = F.softmax(mask_cls, dim=-1).max(-1) mask_pred = mask_pred.sigmoid() keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold) cur_scores = scores[keep] cur_classes = labels[keep] cur_masks = mask_pred[keep] cur_mask_cls = mask_cls[keep] cur_mask_cls = cur_mask_cls[:, :-1] cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks h, w = cur_masks.shape[-2:] panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device) segments_info = [] current_segment_id = 0 if cur_masks.shape[0] == 0: # We didn't detect any 
mask :( return panoptic_seg, segments_info else: # take argmax cur_mask_ids = cur_prob_masks.argmax(0) stuff_memory_list = {} for k in range(cur_classes.shape[0]): pred_class = cur_classes[k].item() isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values() mask_area = (cur_mask_ids == k).sum().item() original_area = (cur_masks[k] >= 0.5).sum().item() mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5) if mask_area > 0 and original_area > 0 and mask.sum().item() > 0: if mask_area / original_area < self.overlap_threshold: continue # merge stuff regions if not isthing: if int(pred_class) in stuff_memory_list.keys(): panoptic_seg[mask] = stuff_memory_list[int(pred_class)] continue else: stuff_memory_list[int(pred_class)] = current_segment_id + 1 current_segment_id += 1 panoptic_seg[mask] = current_segment_id segments_info.append( { "id": current_segment_id, "isthing": bool(isthing), "category_id": int(pred_class), } ) return panoptic_seg, segments_info def instance_inference(self, mask_cls, mask_pred): # mask_pred is already processed to have the same shape as original input image_size = mask_pred.shape[-2:] # [Q, K] scores = F.softmax(mask_cls, dim=-1)[:, :-1] labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1) # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False) scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False) labels_per_image = labels[topk_indices] topk_indices = topk_indices // self.sem_seg_head.num_classes # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1) mask_pred = mask_pred[topk_indices] # if this is panoptic segmentation, we only keep the "thing" classes if self.panoptic_on: keep = torch.zeros_like(scores_per_image).bool() for i, lab in enumerate(labels_per_image): keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values() scores_per_image = scores_per_image[keep] labels_per_image = labels_per_image[keep] mask_pred = mask_pred[keep] result = Instances(image_size) # mask (before sigmoid) result.pred_masks = (mask_pred > 0).float() # result.pred_masks = (mask_pred.sigmoid() >= 0.5)*(mask_pred.sigmoid() < 0.75).float() # result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4)) # Uncomment the following to get boxes from masks (this is slow) result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes() # calculate average mask prob mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6) result.scores = scores_per_image * mask_scores_per_image result.pred_classes = labels_per_image return result ================================================ FILE: mask2former/modeling/__init__.py ================================================ from .backbone.swin import D2SwinTransformer from .pixel_decoder.fpn import BasePixelDecoder from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder from .meta_arch.mask_former_head import MaskFormerHead from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead ================================================ FILE: mask2former/modeling/backbone/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
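The training-time colour-similarity path is spread across `MaskFormer.forward` (image downsampling and LAB conversion) and the criterion (pairwise weighting), so the end-to-end flow is easy to lose in the code above. The snippet below is a minimal, self-contained sketch of that flow under stated assumptions: the toy image and mask logits are invented, the import paths are inferred from this repository's layout, and it is illustrative rather than an official entry point of the codebase.

import torch
import torch.nn.functional as F
from skimage import color

# helpers defined in this repository (module paths assumed from the directory layout)
from mask2former.maskformer_model import get_images_color_similarity
from mask2former.modeling.criterion import compute_pairwise_term

# toy inputs: one BGR training image and one predicted mask logit map at 1/4 resolution
image_bgr = torch.randint(0, 256, (3, 64, 64)).float()
mask_logits = torch.randn(1, 1, 16, 16)

# 1) 4x average-pool the raw image, as done in MaskFormer.forward during training
ds = F.avg_pool2d(image_bgr.unsqueeze(0), kernel_size=4, stride=4, padding=0)[0]

# 2) BGR -> RGB -> CIELAB; skimage expects an HxWx3 array
lab = torch.as_tensor(
    color.rgb2lab(ds[[2, 1, 0]].byte().permute(1, 2, 0).numpy()), dtype=torch.float32
).permute(2, 0, 1).unsqueeze(0)

# 3) exp(-||LAB difference|| * 0.5) over the 8 dilated neighbours of every pixel
images_lab_sim = get_images_color_similarity(lab, kernel_size=3, dilation=2)

# 4) pairwise term on the mask logits, kept only where neighbouring pixels look alike;
#    the criterion additionally restricts this to the projected target mask and a warm-up factor
pairwise = compute_pairwise_term(mask_logits, 3, 2)
weights = (images_lab_sim >= 0.3).float()
loss_bound = (pairwise * weights).sum() / weights.sum().clamp(min=1.0)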
================================================ FILE: mask2former/modeling/backbone/__init__.py.new ================================================ ================================================ FILE: mask2former/modeling/backbone/swin.py ================================================ # -------------------------------------------------------- # Swin Transformer # Copyright (c) 2021 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ze Liu, Yutong Lin, Yixuan Wei # -------------------------------------------------------- # Modified by Bowen Cheng from https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/mmseg/models/backbones/swin_transformer.py import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint from timm.models.layers import DropPath, to_2tuple, trunc_normal_ from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec class Mlp(nn.Module): """Multilayer perceptron.""" def __init__( self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x def window_partition(x, window_size): """ Args: x: (B, H, W, C) window_size (int): window size Returns: windows: (num_windows*B, window_size, window_size, C) """ B, H, W, C = x.shape x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) return windows def window_reverse(windows, window_size, H, W): """ Args: windows: (num_windows*B, window_size, window_size, C) window_size (int): Window size H (int): Height of image W (int): Width of image Returns: x: (B, H, W, C) """ B = int(windows.shape[0] / (H * W / window_size / window_size)) x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) return x class WindowAttention(nn.Module): """Window based multi-head self attention (W-MSA) module with relative position bias. It supports both of shifted and non-shifted window. Args: dim (int): Number of input channels. window_size (tuple[int]): The height and width of the window. num_heads (int): Number of attention heads. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 """ def __init__( self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0.0, proj_drop=0.0, ): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim ** -0.5 # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) ) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = torch.arange(self.window_size[0]) coords_w = torch.arange(self.window_size[1]) coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww self.register_buffer("relative_position_index", relative_position_index) self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) trunc_normal_(self.relative_position_bias_table, std=0.02) self.softmax = nn.Softmax(dim=-1) def forward(self, x, mask=None): """Forward function. Args: x: input features with shape of (num_windows*B, N, C) mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None """ B_, N, C = x.shape qkv = ( self.qkv(x) .reshape(B_, N, 3, self.num_heads, C // self.num_heads) .permute(2, 0, 3, 1, 4) ) q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = q @ k.transpose(-2, -1) relative_position_bias = self.relative_position_bias_table[ self.relative_position_index.view(-1) ].view( self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 ) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.permute( 2, 0, 1 ).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: nW = mask.shape[0] attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) attn = attn.view(-1, self.num_heads, N, N) attn = self.softmax(attn) else: attn = self.softmax(attn) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B_, N, C) x = self.proj(x) x = self.proj_drop(x) return x class SwinTransformerBlock(nn.Module): """Swin Transformer Block. Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. window_size (int): Window size. shift_size (int): Shift size for SW-MSA. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float, optional): Stochastic depth rate. Default: 0.0 act_layer (nn.Module, optional): Activation layer. Default: nn.GELU norm_layer (nn.Module, optional): Normalization layer. 
Default: nn.LayerNorm """ def __init__( self, dim, num_heads, window_size=7, shift_size=0, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop=0.0, attn_drop=0.0, drop_path=0.0, act_layer=nn.GELU, norm_layer=nn.LayerNorm, ): super().__init__() self.dim = dim self.num_heads = num_heads self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, ) self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop ) self.H = None self.W = None def forward(self, x, mask_matrix): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. mask_matrix: Attention mask for cyclic shift. """ B, L, C = x.shape H, W = self.H, self.W assert L == H * W, "input feature has wrong size" shortcut = x x = self.norm1(x) x = x.view(B, H, W, C) # pad feature maps to multiples of window size pad_l = pad_t = 0 pad_r = (self.window_size - W % self.window_size) % self.window_size pad_b = (self.window_size - H % self.window_size) % self.window_size x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) _, Hp, Wp, _ = x.shape # cyclic shift if self.shift_size > 0: shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) attn_mask = mask_matrix else: shifted_x = x attn_mask = None # partition windows x_windows = window_partition( shifted_x, self.window_size ) # nW*B, window_size, window_size, C x_windows = x_windows.view( -1, self.window_size * self.window_size, C ) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C # reverse cyclic shift if self.shift_size > 0: x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) else: x = shifted_x if pad_r > 0 or pad_b > 0: x = x[:, :H, :W, :].contiguous() x = x.view(B, H * W, C) # FFN x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class PatchMerging(nn.Module): """Patch Merging Layer Args: dim (int): Number of input channels. norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm """ def __init__(self, dim, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) self.norm = norm_layer(4 * dim) def forward(self, x, H, W): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. 
""" B, L, C = x.shape assert L == H * W, "input feature has wrong size" x = x.view(B, H, W, C) # padding pad_input = (H % 2 == 1) or (W % 2 == 1) if pad_input: x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C x = self.norm(x) x = self.reduction(x) return x class BasicLayer(nn.Module): """A basic Swin Transformer layer for one stage. Args: dim (int): Number of feature channels depth (int): Depths of this stage. num_heads (int): Number of attention head. window_size (int): Local window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__( self, dim, depth, num_heads, window_size=7, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop=0.0, attn_drop=0.0, drop_path=0.0, norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, ): super().__init__() self.window_size = window_size self.shift_size = window_size // 2 self.depth = depth self.use_checkpoint = use_checkpoint # build blocks self.blocks = nn.ModuleList( [ SwinTransformerBlock( dim=dim, num_heads=num_heads, window_size=window_size, shift_size=0 if (i % 2 == 0) else window_size // 2, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, norm_layer=norm_layer, ) for i in range(depth) ] ) # patch merging layer if downsample is not None: self.downsample = downsample(dim=dim, norm_layer=norm_layer) else: self.downsample = None def forward(self, x, H, W): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. 
""" # calculate attention mask for SW-MSA Hp = int(np.ceil(H / self.window_size)) * self.window_size Wp = int(np.ceil(W / self.window_size)) * self.window_size img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 h_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) w_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 mask_windows = window_partition( img_mask, self.window_size ) # nW, window_size, window_size, 1 mask_windows = mask_windows.view(-1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( attn_mask == 0, float(0.0) ) for blk in self.blocks: blk.H, blk.W = H, W if self.use_checkpoint: x = checkpoint.checkpoint(blk, x, attn_mask) else: x = blk(x, attn_mask) if self.downsample is not None: x_down = self.downsample(x, H, W) Wh, Ww = (H + 1) // 2, (W + 1) // 2 return x, H, W, x_down, Wh, Ww else: return x, H, W, x, H, W class PatchEmbed(nn.Module): """Image to Patch Embedding Args: patch_size (int): Patch token size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (nn.Module, optional): Normalization layer. Default: None """ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): super().__init__() patch_size = to_2tuple(patch_size) self.patch_size = patch_size self.in_chans = in_chans self.embed_dim = embed_dim self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: self.norm = None def forward(self, x): """Forward function.""" # padding _, _, H, W = x.size() if W % self.patch_size[1] != 0: x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) if H % self.patch_size[0] != 0: x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) x = self.proj(x) # B C Wh Ww if self.norm is not None: Wh, Ww = x.size(2), x.size(3) x = x.flatten(2).transpose(1, 2) x = self.norm(x) x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) return x class SwinTransformer(nn.Module): """Swin Transformer backbone. A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - https://arxiv.org/pdf/2103.14030 Args: pretrain_img_size (int): Input image size for training the pretrained model, used in absolute postion embedding. Default 224. patch_size (int | tuple(int)): Patch size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. depths (tuple[int]): Depths of each Swin Transformer stage. num_heads (tuple[int]): Number of attention head of each stage. window_size (int): Window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. drop_rate (float): Dropout rate. attn_drop_rate (float): Attention dropout rate. Default: 0. drop_path_rate (float): Stochastic depth rate. Default: 0.2. norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. 
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. patch_norm (bool): If True, add normalization after patch embedding. Default: True. out_indices (Sequence[int]): Output from which stages. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__( self, pretrain_img_size=224, patch_size=4, in_chans=3, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, norm_layer=nn.LayerNorm, ape=False, patch_norm=True, out_indices=(0, 1, 2, 3), frozen_stages=-1, use_checkpoint=False, ): super().__init__() self.pretrain_img_size = pretrain_img_size self.num_layers = len(depths) self.embed_dim = embed_dim self.ape = ape self.patch_norm = patch_norm self.out_indices = out_indices self.frozen_stages = frozen_stages # split image into non-overlapping patches self.patch_embed = PatchEmbed( patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None, ) # absolute position embedding if self.ape: pretrain_img_size = to_2tuple(pretrain_img_size) patch_size = to_2tuple(patch_size) patches_resolution = [ pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1], ] self.absolute_pos_embed = nn.Parameter( torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) ) trunc_normal_(self.absolute_pos_embed, std=0.02) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) ] # stochastic depth decay rule # build layers self.layers = nn.ModuleList() for i_layer in range(self.num_layers): layer = BasicLayer( dim=int(embed_dim * 2 ** i_layer), depth=depths[i_layer], num_heads=num_heads[i_layer], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, use_checkpoint=use_checkpoint, ) self.layers.append(layer) num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] self.num_features = num_features # add a norm layer for each output for i_layer in out_indices: layer = norm_layer(num_features[i_layer]) layer_name = f"norm{i_layer}" self.add_module(layer_name, layer) self._freeze_stages() def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.requires_grad = False if self.frozen_stages >= 1 and self.ape: self.absolute_pos_embed.requires_grad = False if self.frozen_stages >= 2: self.pos_drop.eval() for i in range(0, self.frozen_stages - 1): m = self.layers[i] m.eval() for param in m.parameters(): param.requires_grad = False def init_weights(self, pretrained=None): """Initialize the weights in backbone. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. 
""" def _init_weights(m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) def forward(self, x): """Forward function.""" x = self.patch_embed(x) Wh, Ww = x.size(2), x.size(3) if self.ape: # interpolate the position embedding to the corresponding size absolute_pos_embed = F.interpolate( self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" ) x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C else: x = x.flatten(2).transpose(1, 2) x = self.pos_drop(x) outs = {} for i in range(self.num_layers): layer = self.layers[i] x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) if i in self.out_indices: norm_layer = getattr(self, f"norm{i}") x_out = norm_layer(x_out) out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() outs["res{}".format(i + 2)] = out return outs def train(self, mode=True): """Convert the model into training mode while keep layers freezed.""" super(SwinTransformer, self).train(mode) self._freeze_stages() @BACKBONE_REGISTRY.register() class D2SwinTransformer(SwinTransformer, Backbone): def __init__(self, cfg, input_shape): pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE patch_size = cfg.MODEL.SWIN.PATCH_SIZE in_chans = 3 embed_dim = cfg.MODEL.SWIN.EMBED_DIM depths = cfg.MODEL.SWIN.DEPTHS num_heads = cfg.MODEL.SWIN.NUM_HEADS window_size = cfg.MODEL.SWIN.WINDOW_SIZE mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO qkv_bias = cfg.MODEL.SWIN.QKV_BIAS qk_scale = cfg.MODEL.SWIN.QK_SCALE drop_rate = cfg.MODEL.SWIN.DROP_RATE attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE norm_layer = nn.LayerNorm ape = cfg.MODEL.SWIN.APE patch_norm = cfg.MODEL.SWIN.PATCH_NORM use_checkpoint = cfg.MODEL.SWIN.USE_CHECKPOINT super().__init__( pretrain_img_size, patch_size, in_chans, embed_dim, depths, num_heads, window_size, mlp_ratio, qkv_bias, qk_scale, drop_rate, attn_drop_rate, drop_path_rate, norm_layer, ape, patch_norm, use_checkpoint=use_checkpoint, ) self._out_features = cfg.MODEL.SWIN.OUT_FEATURES self._out_feature_strides = { "res2": 4, "res3": 8, "res4": 16, "res5": 32, } self._out_feature_channels = { "res2": self.num_features[0], "res3": self.num_features[1], "res4": self.num_features[2], "res5": self.num_features[3], } def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: names and the corresponding features """ assert ( x.dim() == 4 ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!" outputs = {} y = super().forward(x) for k in y.keys(): if k in self._out_features: outputs[k] = y[k] return outputs def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } @property def size_divisibility(self): return 32 ================================================ FILE: mask2former/modeling/criterion.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py """ MaskFormer criterion. 
""" import logging import torch import torch.nn.functional as F from torch import nn from detectron2.utils.comm import get_world_size from detectron2.projects.point_rend.point_features import ( get_uncertain_point_coords_with_randomness, point_sample, ) from ..utils.misc import is_dist_avail_and_initialized, nested_tensor_from_tensor_list def unfold_wo_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) # remove the center pixels size = kernel_size ** 2 unfolded_x = torch.cat(( unfolded_x[:, :, :size // 2], unfolded_x[:, :, size // 2 + 1:] ), dim=2) return unfolded_x def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation): assert mask_logits.dim() == 4 log_fg_prob = F.logsigmoid(mask_logits) log_bg_prob = F.logsigmoid(-mask_logits) log_fg_prob_unfold = unfold_wo_center( log_fg_prob, kernel_size=pairwise_size, dilation=pairwise_dilation ) log_bg_prob_unfold = unfold_wo_center( log_bg_prob, kernel_size=pairwise_size, dilation=pairwise_dilation ) # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j) # we compute the the probability in log space to avoid numerical instability log_same_fg_prob = log_fg_prob[:, :, None] + log_fg_prob_unfold log_same_bg_prob = log_bg_prob[:, :, None] + log_bg_prob_unfold max_ = torch.max(log_same_fg_prob, log_same_bg_prob) log_same_prob = torch.log( torch.exp(log_same_fg_prob - max_) + torch.exp(log_same_bg_prob - max_) ) + max_ # loss = -log(prob) return -log_same_prob[:, 0] def get_incoherent_mask(input_masks, sfact): mask = input_masks.float() w = input_masks.shape[-1] h = input_masks.shape[-2] mask_small = F.interpolate(mask, (h//sfact, w//sfact), mode='bilinear') mask_recover = F.interpolate(mask_small, (h, w), mode='bilinear') mask_uncertain = (mask - mask_recover).abs() mask_uncertain = (mask_uncertain > 0.01).float() return mask_uncertain def dice_coefficient(x, target): eps = 1e-5 n_inst = x.size(0) x = x.reshape(n_inst, -1) target = target.reshape(n_inst, -1) intersection = (x * target).sum(dim=1) union = (x ** 2.0).sum(dim=1) + (target ** 2.0).sum(dim=1) + eps loss = 1. - (2 * intersection / union) return loss def compute_project_term(mask_scores, gt_bitmasks): mask_losses_y = dice_coefficient( mask_scores.max(dim=2, keepdim=True)[0], gt_bitmasks.max(dim=2, keepdim=True)[0] ) mask_losses_x = dice_coefficient( mask_scores.max(dim=3, keepdim=True)[0], gt_bitmasks.max(dim=3, keepdim=True)[0] ) return (mask_losses_x + mask_losses_y).mean() def dice_loss( inputs: torch.Tensor, targets: torch.Tensor, num_masks: float, ): """ Compute the DICE loss, similar to generalized IOU for masks Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). 
""" inputs = inputs.sigmoid() inputs = inputs.flatten(1) numerator = 2 * (inputs * targets).sum(-1) denominator = inputs.sum(-1) + targets.sum(-1) loss = 1 - (numerator + 1) / (denominator + 1) return loss.sum() / num_masks dice_loss_jit = torch.jit.script( dice_loss ) # type: torch.jit.ScriptModule def sigmoid_ce_loss( inputs: torch.Tensor, targets: torch.Tensor, num_masks: float, ): """ Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). Returns: Loss tensor """ loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") return loss.mean(1).sum() / num_masks sigmoid_ce_loss_jit = torch.jit.script( sigmoid_ce_loss ) # type: torch.jit.ScriptModule def calculate_uncertainty(logits): """ We estimate uncerainty as L1 distance between 0.0 and the logit prediction in 'logits' for the foreground class in `classes`. Args: logits (Tensor): A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is the number of foreground classes. The values are logits. Returns: scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most uncertain locations having the highest uncertainty score. """ assert logits.shape[1] == 1 gt_class_logits = logits.clone() return -(torch.abs(gt_class_logits)) class SetCriterion(nn.Module): """This class computes the loss for DETR. The process happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) """ def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses, num_points, oversample_ratio, importance_sample_ratio): """Create the criterion. Parameters: num_classes: number of object categories, omitting the special no-object category matcher: module able to compute a matching between targets and proposals weight_dict: dict containing as key the names of the losses and as values their relative weight. eos_coef: relative classification weight applied to the no-object category losses: list of all the losses to be applied. See get_loss for list of available losses. 
""" super().__init__() self.num_classes = num_classes self.matcher = matcher self.weight_dict = weight_dict self.eos_coef = eos_coef self.losses = losses empty_weight = torch.ones(self.num_classes + 1) empty_weight[-1] = self.eos_coef self.register_buffer("empty_weight", empty_weight) # pointwise mask loss parameters self.num_points = num_points self.oversample_ratio = oversample_ratio self.importance_sample_ratio = importance_sample_ratio self.laplacian_kernel = torch.tensor([-1, -1, -1, -1, 8, -1, -1, -1, -1], dtype=torch.float32).reshape(1, 1, 3, 3).requires_grad_(False) self.register_buffer("_iter", torch.zeros([1])) self._warmup_iters = 1000 #20000 def loss_labels(self, outputs, targets, indices, num_masks): """Classification loss (NLL) targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] """ assert "pred_logits" in outputs src_logits = outputs["pred_logits"].float() idx = self._get_src_permutation_idx(indices) target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) target_classes = torch.full( src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device ) target_classes[idx] = target_classes_o loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) losses = {"loss_ce": loss_ce} return losses def loss_masks_proj(self, outputs, targets, indices, num_masks, images_lab_sim): assert "pred_masks" in outputs self._iter += 1 src_idx = self._get_src_permutation_idx(indices) tgt_idx = self._get_tgt_permutation_idx(indices) src_masks = outputs["pred_masks"] src_masks = src_masks[src_idx] masks = [t["masks"] for t in targets] # TODO use valid to mask invalid areas due to padding in loss target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() target_masks = target_masks.to(src_masks) target_masks = target_masks[tgt_idx] if len(src_idx[0].tolist()) > 0: images_lab_sim = torch.cat([images_lab_sim[ind] for ind in src_idx[0].tolist()]) # No need to upsample predictions as we are using normalized coordinates :) # N x 1 x H x W src_masks = src_masks[:, None] target_masks = target_masks[:, None] target_masks = F.interpolate(target_masks, (src_masks.shape[-2], src_masks.shape[-1]), mode='bilinear') if src_masks.shape[0] > 0: loss_prj_term = compute_project_term(src_masks.sigmoid(), target_masks) pairwise_losses = compute_pairwise_term( src_masks, 3, 2 ) inc_mask = get_incoherent_mask(src_masks.detach().sigmoid() > 0.5, 2) #* images_lab_sim).bool() inc_mask = F.conv2d(inc_mask, self.laplacian_kernel.to(inc_mask.device), padding=1).abs() inc_mask = (inc_mask > 0.5).float() weights = (images_lab_sim >= 0.3).float() * target_masks.float() #* inc_mask loss_pairwise = ((pairwise_losses * weights).sum() / weights.sum().clamp(min=1.0)) * 0.25 warmup_factor = min(self._iter.item() / float(self._warmup_iters), 1.0) loss_pairwise = loss_pairwise * warmup_factor #* 0. else: loss_prj_term = src_masks.sum() * 0. loss_pairwise = src_masks.sum() * 0. losses = { "loss_mask": loss_prj_term, "loss_bound": loss_pairwise, } del src_masks del target_masks return losses def loss_masks(self, outputs, targets, indices, num_masks): """Compute the losses related to the masks: the focal loss and the dice loss. 
        targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
        """
        assert "pred_masks" in outputs

        src_idx = self._get_src_permutation_idx(indices)
        tgt_idx = self._get_tgt_permutation_idx(indices)
        src_masks = outputs["pred_masks"]
        src_masks = src_masks[src_idx]
        masks = [t["masks"] for t in targets]
        # TODO use valid to mask invalid areas due to padding in loss
        target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
        target_masks = target_masks.to(src_masks)
        target_masks = target_masks[tgt_idx]

        # No need to upsample predictions as we are using normalized coordinates :)
        # N x 1 x H x W
        src_masks = src_masks[:, None]
        target_masks = target_masks[:, None]

        with torch.no_grad():
            # sample point_coords
            point_coords = get_uncertain_point_coords_with_randomness(
                src_masks,
                lambda logits: calculate_uncertainty(logits),
                self.num_points,
                self.oversample_ratio,
                self.importance_sample_ratio,
            )
            # get gt labels
            point_labels = point_sample(
                target_masks,
                point_coords,
                align_corners=False,
            ).squeeze(1)

        point_logits = point_sample(
            src_masks,
            point_coords,
            align_corners=False,
        ).squeeze(1)

        losses = {
            "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks),
            "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks),
        }

        del src_masks
        del target_masks
        return losses

    def _get_src_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_tgt_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    def get_loss(self, loss, outputs, targets, indices, num_masks, images_lab_sim):
        loss_map = {
            'labels': self.loss_labels,
            'masks': self.loss_masks_proj,
        }
        assert loss in loss_map, f"do you really want to compute {loss} loss?"
        if loss == 'masks':
            return loss_map[loss](outputs, targets, indices, num_masks, images_lab_sim)
        else:
            return loss_map[loss](outputs, targets, indices, num_masks)

    def forward(self, outputs, targets, images_lab_sim):
        """This performs the loss computation.
        Parameters:
            outputs: dict of tensors, see the output specification of the model for the format
            targets: list of dicts, such that len(targets) == batch_size.
                     The expected keys in each dict depend on the losses applied, see each loss' doc
        """
        outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}

        # Retrieve the matching between the outputs of the last layer and the targets
        indices = self.matcher(outputs_without_aux, targets)

        # Compute the average number of target boxes across all nodes, for normalization purposes
        num_masks = sum(len(t["labels"]) for t in targets)
        num_masks = torch.as_tensor(
            [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device
        )
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_masks)
        num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_masks, images_lab_sim))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
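        # The auxiliary predictions from the intermediate decoder layers are matched and scored with
        # the same losses below; their keys receive an "_{i}" suffix so that they pick up the
        # per-layer aux_weight_dict entries built in the model's from_config.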
if "aux_outputs" in outputs: for i, aux_outputs in enumerate(outputs["aux_outputs"]): indices = self.matcher(aux_outputs, targets) for loss in self.losses: l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks, images_lab_sim) l_dict = {k + f"_{i}": v for k, v in l_dict.items()} losses.update(l_dict) return losses def __repr__(self): head = "Criterion " + self.__class__.__name__ body = [ "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)), "losses: {}".format(self.losses), "weight_dict: {}".format(self.weight_dict), "num_classes: {}".format(self.num_classes), "eos_coef: {}".format(self.eos_coef), "num_points: {}".format(self.num_points), "oversample_ratio: {}".format(self.oversample_ratio), "importance_sample_ratio: {}".format(self.importance_sample_ratio), ] _repr_indent = 4 lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: mask2former/modeling/matcher.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py """ Modules to compute the matching cost and solve the corresponding LSAP. """ import torch import torch.nn.functional as F from scipy.optimize import linear_sum_assignment from torch import nn from torch.cuda.amp import autocast from detectron2.projects.point_rend.point_features import point_sample from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, generalized_multi_box_iou def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): """ Compute the DICE loss, similar to generalized IOU for masks Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). """ inputs = inputs #.sigmoid() inputs = inputs.flatten(1) numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] loss = 1 - (numerator + 1) / (denominator + 1) return loss batch_dice_loss_jit = torch.jit.script( batch_dice_loss ) # type: torch.jit.ScriptModule def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): """ Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). Returns: Loss tensor """ hw = inputs.shape[1] pos = F.binary_cross_entropy( inputs, torch.ones_like(inputs), reduction="none" ) neg = F.binary_cross_entropy( inputs, torch.zeros_like(inputs), reduction="none" ) loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( "nc,mc->nm", neg, (1 - targets) ) return loss / hw batch_sigmoid_ce_loss_jit = torch.jit.script( batch_sigmoid_ce_loss ) # type: torch.jit.ScriptModule def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: """ Compute the bounding boxes around the provided masks. Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. Args: masks (Tensor[N, H, W]): masks to transform where N is the number of masks and (H, W) are the spatial dimensions. 
Returns: Tensor[N, 4]: bounding boxes """ if masks.numel() == 0: return masks n = masks.shape[0] for index, mask in enumerate(masks): y, x = torch.where(mask != 0) if len(x) * len(y) == 0: continue h = torch.max(y) - torch.min(y) w = torch.max(x) - torch.min(x) masks[index, torch.min(y):torch.max(y), torch.min(x):torch.max(x)] = 1.0 return masks def masks_to_boxes_cc(masks: torch.Tensor) -> torch.Tensor: """ Compute the bounding boxes around the provided masks. Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. Args: masks (Tensor[N, H, W]): masks to transform where N is the number of masks and (H, W) are the spatial dimensions. Returns: Tensor[N, 4]: bounding boxes """ if masks.numel() == 0: return torch.zeros((0, 4), device=masks.device, dtype=torch.float) n = masks.shape[0] h = masks.shape[1] w = masks.shape[2] bounding_boxes = torch.zeros((n, 4), device=masks.device, dtype=torch.float) for index, mask in enumerate(masks): y, x = torch.where(mask != 0) if len(x) * len(y) == 0: continue bounding_boxes[index, 0] = torch.min(x) / float(w) bounding_boxes[index, 1] = torch.min(y) / float(h) bounding_boxes[index, 2] = torch.max(x) / float(w) bounding_boxes[index, 3] = torch.max(y) / float(h) return bounding_boxes class HungarianMatcher(nn.Module): """This class computes an assignment between the targets and the predictions of the network For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are un-matched (and thus treated as non-objects). """ def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0): """Creates the matcher Params: cost_class: This is the relative weight of the classification error in the matching cost cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost """ super().__init__() self.cost_class = cost_class self.cost_mask = cost_mask self.cost_dice = cost_dice self.cost_giou = 2.0 self.cost_bbox = 5.0 assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0" self.num_points = num_points @torch.no_grad() def memory_efficient_forward(self, outputs, targets): """More memory-friendly matching""" bs, num_queries = outputs["pred_logits"].shape[:2] indices = [] # Iterate through batch size for b in range(bs): out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] tgt_ids = targets[b]["labels"] # Compute the classification cost. Contrary to the loss, we don't use the NLL, # but approximate it in 1 - proba[target class]. # The 1 is a constant that doesn't change the matching, it can be ommitted. 
cost_class = -out_prob[:, tgt_ids] out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred] out_mask_box = masks_to_boxes_cc((out_mask.sigmoid() > 0.5).float()) # gt masks are already padded when preparing target tgt_mask = targets[b]["masks"].to(out_mask) tgt_mask_box = masks_to_boxes_cc(tgt_mask) # print('tgt_mask_box shape:', tgt_mask_box.shape) with autocast(enabled=False): cost_bbox = torch.cdist(out_mask_box, tgt_mask_box) cost_giou = -generalized_box_iou(out_mask_box, tgt_mask_box) if torch.isnan(cost_bbox).any(): print('cost_bbox:', cost_bbox) if torch.isnan(cost_giou).any(): print('cost_giou:', cost_giou) C = ( self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou ) C = C.reshape(num_queries, -1).cpu() indices.append(linear_sum_assignment(C)) return [ (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices ] @torch.no_grad() def forward(self, outputs, targets): """Performs the matching Params: outputs: This is a dict that contains at least these entries: "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth objects in the target) containing the class labels "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks Returns: A list of size batch_size, containing tuples of (index_i, index_j) where: - index_i is the indices of the selected predictions (in order) - index_j is the indices of the corresponding selected targets (in order) For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) """ return self.memory_efficient_forward(outputs, targets) def __repr__(self, _repr_indent=4): head = "Matcher " + self.__class__.__name__ body = [ "cost_class: {}".format(self.cost_class), "cost_mask: {}".format(self.cost_mask), "cost_dice: {}".format(self.cost_dice), ] lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: mask2former/modeling/meta_arch/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
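Because the matching above replaces Mask2Former's mask/dice costs with box costs derived from the masks, a compact sketch of the cost computation may help. It is illustrative only: the toy tensors are invented, the import paths are assumed from the repository layout, and `generalized_box_iou` is the DETR-style helper that matcher.py already imports from `util.box_ops`.

import torch
from scipy.optimize import linear_sum_assignment

from mask2former.modeling.matcher import masks_to_boxes_cc  # assumed import path
from util.box_ops import generalized_box_iou                 # helper assumed available, as imported by matcher.py

num_queries, num_gt, num_classes = 4, 2, 3
pred_logits = torch.randn(num_queries, num_classes + 1)       # per-query class logits (+ no-object)
pred_masks = torch.randn(num_queries, 32, 32)                 # per-query mask logits
tgt_labels = torch.tensor([0, 2])
tgt_masks = (torch.rand(num_gt, 32, 32) > 0.5).float()        # binary ground-truth masks

# classification cost: approximately 1 - p[target class], with the constant dropped
out_prob = pred_logits.softmax(-1)
cost_class = -out_prob[:, tgt_labels]

# reduce predicted and target masks to normalized (x1, y1, x2, y2) boxes and compare them
out_boxes = masks_to_boxes_cc((pred_masks.sigmoid() > 0.5).float())
tgt_boxes = masks_to_boxes_cc(tgt_masks)
cost_bbox = torch.cdist(out_boxes, tgt_boxes)
cost_giou = -generalized_box_iou(out_boxes, tgt_boxes)

# 5.0 and 2.0 mirror cost_bbox / cost_giou in HungarianMatcher.__init__; the class weight comes from the config
C = 5.0 * cost_bbox + 1.0 * cost_class + 2.0 * cost_giou
src_idx, tgt_idx = linear_sum_assignment(C.cpu())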
================================================ FILE: mask2former/modeling/meta_arch/__init__.py.new ================================================ ================================================ FILE: mask2former/modeling/meta_arch/mask_former_head.py ================================================ import logging from copy import deepcopy from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init from torch import nn from torch.nn import functional as F from detectron2.config import configurable from detectron2.layers import Conv2d, ShapeSpec, get_norm from detectron2.modeling import SEM_SEG_HEADS_REGISTRY from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder from ..pixel_decoder.fpn import build_pixel_decoder @SEM_SEG_HEADS_REGISTRY.register() class MaskFormerHead(nn.Module): _version = 2 def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k ''' if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): newk = k.replace(prefix, prefix + "pixel_decoder.") # logger.debug(f"{k} ==> {newk}") ''' if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, num_classes: int, pixel_decoder: nn.Module, loss_weight: float = 1.0, ignore_value: int = -1, # extra parameters transformer_predictor: nn.Module, transformer_in_feature: str, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features num_classes: number of classes to predict pixel_decoder: the pixel decoder module loss_weight: loss weight ignore_value: category id to be ignored during training. 
transformer_predictor: the transformer decoder that makes prediction transformer_in_feature: input feature name to the transformer_predictor """ super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] feature_strides = [v.stride for k, v in input_shape] feature_channels = [v.channels for k, v in input_shape] self.ignore_value = ignore_value self.common_stride = 4 self.loss_weight = loss_weight self.pixel_decoder = pixel_decoder self.predictor = transformer_predictor self.transformer_in_feature = transformer_in_feature self.num_classes = num_classes @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): # figure out in_channels to transformer predictor if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM else: transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels return { "input_shape": { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES }, "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, "pixel_decoder": build_pixel_decoder(cfg, input_shape), "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, "transformer_predictor": build_transformer_decoder( cfg, transformer_predictor_in_channels, mask_classification=True, ), } def forward(self, features, mask=None): return self.layers(features, mask) def layers(self, features, mask=None): mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) if self.transformer_in_feature == "multi_scale_pixel_decoder": predictions = self.predictor(multi_scale_features, mask_features, mask) else: if self.transformer_in_feature == "transformer_encoder": assert ( transformer_encoder_features is not None ), "Please use the TransformerEncoderPixelDecoder." 
predictions = self.predictor(transformer_encoder_features, mask_features, mask) elif self.transformer_in_feature == "pixel_embedding": predictions = self.predictor(mask_features, mask_features, mask) else: predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) return predictions ================================================ FILE: mask2former/modeling/meta_arch/per_pixel_baseline.py ================================================ import logging from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init from torch import nn from torch.nn import functional as F from detectron2.config import configurable from detectron2.layers import Conv2d, ShapeSpec, get_norm from detectron2.modeling import SEM_SEG_HEADS_REGISTRY from ..transformer_decoder.maskformer_transformer_decoder import StandardTransformerDecoder from ..pixel_decoder.fpn import build_pixel_decoder @SEM_SEG_HEADS_REGISTRY.register() class PerPixelBaselineHead(nn.Module): _version = 2 def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: logger = logging.getLogger(__name__) # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): newk = k.replace(prefix, prefix + "pixel_decoder.") # logger.warning(f"{k} ==> {newk}") if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, num_classes: int, pixel_decoder: nn.Module, loss_weight: float = 1.0, ignore_value: int = -1, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features num_classes: number of classes to predict pixel_decoder: the pixel decoder module loss_weight: loss weight ignore_value: category id to be ignored during training. 
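        Example (editorial illustration, not part of the original docstring): with the
        default common_stride of 4, a 640x640 input yields 160x160 mask features; the
        1x1 predictor conv maps them to num_classes logits, training applies a
        cross-entropy loss that skips pixels labelled ignore_value, and inference
        bilinearly upsamples the logits by 4x back to the input resolution.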
""" super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] feature_strides = [v.stride for k, v in input_shape] feature_channels = [v.channels for k, v in input_shape] self.ignore_value = ignore_value self.common_stride = 4 self.loss_weight = loss_weight self.pixel_decoder = pixel_decoder self.predictor = Conv2d( self.pixel_decoder.mask_dim, num_classes, kernel_size=1, stride=1, padding=0 ) weight_init.c2_msra_fill(self.predictor) @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): return { "input_shape": { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES }, "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, "pixel_decoder": build_pixel_decoder(cfg, input_shape), "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, } def forward(self, features, targets=None): """ Returns: In training, returns (None, dict of losses) In inference, returns (CxHxW logits, {}) """ x = self.layers(features) if self.training: return None, self.losses(x, targets) else: x = F.interpolate( x, scale_factor=self.common_stride, mode="bilinear", align_corners=False ) return x, {} def layers(self, features): x, _, _ = self.pixel_decoder.forward_features(features) x = self.predictor(x) return x def losses(self, predictions, targets): predictions = predictions.float() # https://github.com/pytorch/pytorch/issues/48163 predictions = F.interpolate( predictions, scale_factor=self.common_stride, mode="bilinear", align_corners=False ) loss = F.cross_entropy( predictions, targets, reduction="mean", ignore_index=self.ignore_value ) losses = {"loss_sem_seg": loss * self.loss_weight} return losses @SEM_SEG_HEADS_REGISTRY.register() class PerPixelBaselinePlusHead(PerPixelBaselineHead): def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): newk = k.replace(prefix, prefix + "pixel_decoder.") logger.debug(f"{k} ==> {newk}") if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, # extra parameters transformer_predictor: nn.Module, transformer_in_feature: str, deep_supervision: bool, # inherit parameters num_classes: int, pixel_decoder: nn.Module, loss_weight: float = 1.0, ignore_value: int = -1, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features transformer_predictor: the transformer decoder that makes prediction transformer_in_feature: input feature name to the transformer_predictor deep_supervision: whether or not to add supervision to the output of every transformer decoder layer num_classes: number of classes to predict pixel_decoder: the pixel decoder module loss_weight: loss weight ignore_value: category id to be ignored during training. 
""" super().__init__( input_shape, num_classes=num_classes, pixel_decoder=pixel_decoder, loss_weight=loss_weight, ignore_value=ignore_value, ) del self.predictor self.predictor = transformer_predictor self.transformer_in_feature = transformer_in_feature self.deep_supervision = deep_supervision @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = super().from_config(cfg, input_shape) ret["transformer_in_feature"] = cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM else: in_channels = input_shape[ret["transformer_in_feature"]].channels ret["transformer_predictor"] = StandardTransformerDecoder( cfg, in_channels, mask_classification=False ) ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION return ret def forward(self, features, targets=None): """ Returns: In training, returns (None, dict of losses) In inference, returns (CxHxW logits, {}) """ x, aux_outputs = self.layers(features) if self.training: if self.deep_supervision: losses = self.losses(x, targets) for i, aux_output in enumerate(aux_outputs): losses["loss_sem_seg" + f"_{i}"] = self.losses( aux_output["pred_masks"], targets )["loss_sem_seg"] return None, losses else: return None, self.losses(x, targets) else: x = F.interpolate( x, scale_factor=self.common_stride, mode="bilinear", align_corners=False ) return x, {} def layers(self, features): mask_features, transformer_encoder_features, _ = self.pixel_decoder.forward_features(features) if self.transformer_in_feature == "transformer_encoder": assert ( transformer_encoder_features is not None ), "Please use the TransformerEncoderPixelDecoder." predictions = self.predictor(transformer_encoder_features, mask_features) else: predictions = self.predictor(features[self.transformer_in_feature], mask_features) if self.deep_supervision: return predictions["pred_masks"], predictions["aux_outputs"] else: return predictions["pred_masks"], None ================================================ FILE: mask2former/modeling/pixel_decoder/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: mask2former/modeling/pixel_decoder/__init__.py.new ================================================ ================================================ FILE: mask2former/modeling/pixel_decoder/fpn.py ================================================ import logging import numpy as np from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ from torch.cuda.amp import autocast from detectron2.config import configurable from detectron2.layers import Conv2d, DeformConv, ShapeSpec, get_norm from detectron2.modeling import SEM_SEG_HEADS_REGISTRY from ..transformer_decoder.position_encoding import PositionEmbeddingSine from ..transformer_decoder.transformer import TransformerEncoder, TransformerEncoderLayer, _get_clones, _get_activation_fn def build_pixel_decoder(cfg, input_shape): """ Build a pixel decoder from `cfg.MODEL.MASK_FORMER.PIXEL_DECODER_NAME`. 
""" name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape) forward_features = getattr(model, "forward_features", None) if not callable(forward_features): raise ValueError( "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. " f"Please implement forward_features for {name} to only return mask features." ) return model # This is a modified FPN decoder. @SEM_SEG_HEADS_REGISTRY.register() class BasePixelDecoder(nn.Module): @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, conv_dim: int, mask_dim: int, norm: Optional[Union[str, Callable]] = None, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features conv_dims: number of output channels for the intermediate conv layers. mask_dim: number of output channels for the final conv layer. norm (str or callable): normalization for all conv layers """ super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" feature_channels = [v.channels for k, v in input_shape] lateral_convs = [] output_convs = [] use_bias = norm == "" for idx, in_channels in enumerate(feature_channels): if idx == len(self.in_features) - 1: output_norm = get_norm(norm, conv_dim) output_conv = Conv2d( in_channels, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(output_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(None) output_convs.append(output_conv) else: lateral_norm = get_norm(norm, conv_dim) output_norm = get_norm(norm, conv_dim) lateral_conv = Conv2d( in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm ) output_conv = Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(lateral_conv) weight_init.c2_xavier_fill(output_conv) self.add_module("adapter_{}".format(idx + 1), lateral_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(lateral_conv) output_convs.append(output_conv) # Place convs into top-down order (from low to high resolution) # to make the top-down computation in forward clearer. 
self.lateral_convs = lateral_convs[::-1] self.output_convs = output_convs[::-1] self.mask_dim = mask_dim self.mask_features = Conv2d( conv_dim, mask_dim, kernel_size=3, stride=1, padding=1, ) weight_init.c2_xavier_fill(self.mask_features) self.maskformer_num_feature_levels = 3 # always use 3 scales @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = {} ret["input_shape"] = { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES } ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM return ret def forward_features(self, features): multi_scale_features = [] num_cur_levels = 0 # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.in_features[::-1]): x = features[f] lateral_conv = self.lateral_convs[idx] output_conv = self.output_convs[idx] if lateral_conv is None: y = output_conv(x) else: cur_fpn = lateral_conv(x) # Following FPN implementation, we use nearest upsampling here y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest") y = output_conv(y) if num_cur_levels < self.maskformer_num_feature_levels: multi_scale_features.append(y) num_cur_levels += 1 return self.mask_features(y), None, multi_scale_features def forward(self, features, targets=None): logger = logging.getLogger(__name__) logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.") return self.forward_features(features) class TransformerEncoderOnly(nn.Module): def __init__( self, d_model=512, nhead=8, num_encoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, ): super().__init__() encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) encoder_norm = nn.LayerNorm(d_model) if normalize_before else None self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) self._reset_parameters() self.d_model = d_model self.nhead = nhead def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def forward(self, src, mask, pos_embed): # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape src = src.flatten(2).permute(2, 0, 1) pos_embed = pos_embed.flatten(2).permute(2, 0, 1) if mask is not None: mask = mask.flatten(1) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) return memory.permute(1, 2, 0).view(bs, c, h, w) # This is a modified FPN decoder with extra Transformer encoder that processes the lowest-resolution feature map. @SEM_SEG_HEADS_REGISTRY.register() class TransformerEncoderPixelDecoder(BasePixelDecoder): @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, transformer_dropout: float, transformer_nheads: int, transformer_dim_feedforward: int, transformer_enc_layers: int, transformer_pre_norm: bool, conv_dim: int, mask_dim: int, norm: Optional[Union[str, Callable]] = None, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features transformer_dropout: dropout probability in transformer transformer_nheads: number of heads in transformer transformer_dim_feedforward: dimension of feedforward network transformer_enc_layers: number of transformer encoder layers transformer_pre_norm: whether to use pre-layernorm or not conv_dims: number of output channels for the intermediate conv layers. mask_dim: number of output channels for the final conv layer. 
norm (str or callable): normalization for all conv layers """ super().__init__(input_shape, conv_dim=conv_dim, mask_dim=mask_dim, norm=norm) input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" feature_strides = [v.stride for k, v in input_shape] feature_channels = [v.channels for k, v in input_shape] in_channels = feature_channels[len(self.in_features) - 1] self.input_proj = Conv2d(in_channels, conv_dim, kernel_size=1) weight_init.c2_xavier_fill(self.input_proj) self.transformer = TransformerEncoderOnly( d_model=conv_dim, dropout=transformer_dropout, nhead=transformer_nheads, dim_feedforward=transformer_dim_feedforward, num_encoder_layers=transformer_enc_layers, normalize_before=transformer_pre_norm, ) N_steps = conv_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) # update layer use_bias = norm == "" output_norm = get_norm(norm, conv_dim) output_conv = Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(output_conv) delattr(self, "layer_{}".format(len(self.in_features))) self.add_module("layer_{}".format(len(self.in_features)), output_conv) self.output_convs[0] = output_conv @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = super().from_config(cfg, input_shape) ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD ret[ "transformer_enc_layers" ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS # a separate config ret["transformer_pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM return ret def forward_features(self, features): multi_scale_features = [] num_cur_levels = 0 # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.in_features[::-1]): x = features[f] lateral_conv = self.lateral_convs[idx] output_conv = self.output_convs[idx] if lateral_conv is None: transformer = self.input_proj(x) pos = self.pe_layer(x) transformer = self.transformer(transformer, None, pos) y = output_conv(transformer) # save intermediate feature as input to Transformer decoder transformer_encoder_features = transformer else: cur_fpn = lateral_conv(x) # Following FPN implementation, we use nearest upsampling here y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest") y = output_conv(y) if num_cur_levels < self.maskformer_num_feature_levels: multi_scale_features.append(y) num_cur_levels += 1 return self.mask_features(y), transformer_encoder_features, multi_scale_features def forward(self, features, targets=None): logger = logging.getLogger(__name__) logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.") return self.forward_features(features) ================================================ FILE: mask2former/modeling/pixel_decoder/msdeformattn.py ================================================ import logging import numpy as np from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ from torch.cuda.amp import autocast from detectron2.config import configurable from detectron2.layers import Conv2d, ShapeSpec, get_norm from detectron2.modeling import 
SEM_SEG_HEADS_REGISTRY from ..transformer_decoder.position_encoding import PositionEmbeddingSine from ..transformer_decoder.transformer import _get_clones, _get_activation_fn from .ops.modules import MSDeformAttn # MSDeformAttn Transformer encoder in deformable detr class MSDeformAttnTransformerEncoderOnly(nn.Module): def __init__(self, d_model=256, nhead=8, num_encoder_layers=6, dim_feedforward=1024, dropout=0.1, activation="relu", num_feature_levels=4, enc_n_points=4, ): super().__init__() self.d_model = d_model self.nhead = nhead encoder_layer = MSDeformAttnTransformerEncoderLayer(d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, enc_n_points) self.encoder = MSDeformAttnTransformerEncoder(encoder_layer, num_encoder_layers) self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, MSDeformAttn): m._reset_parameters() normal_(self.level_embed) def get_valid_ratio(self, mask): _, H, W = mask.shape valid_H = torch.sum(~mask[:, :, 0], 1) valid_W = torch.sum(~mask[:, 0, :], 1) valid_ratio_h = valid_H.float() / H valid_ratio_w = valid_W.float() / W valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) return valid_ratio def forward(self, srcs, pos_embeds): masks = [torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) for x in srcs] # prepare input for encoder src_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): bs, c, h, w = src.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) src = src.flatten(2).transpose(1, 2) mask = mask.flatten(1) pos_embed = pos_embed.flatten(2).transpose(1, 2) lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) lvl_pos_embed_flatten.append(lvl_pos_embed) src_flatten.append(src) mask_flatten.append(mask) src_flatten = torch.cat(src_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) # encoder memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten) return memory, spatial_shapes, level_start_index class MSDeformAttnTransformerEncoderLayer(nn.Module): def __init__(self, d_model=256, d_ffn=1024, dropout=0.1, activation="relu", n_levels=4, n_heads=8, n_points=4): super().__init__() # self attention self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm(d_model) # ffn self.linear1 = nn.Linear(d_model, d_ffn) self.activation = _get_activation_fn(activation) self.dropout2 = nn.Dropout(dropout) self.linear2 = nn.Linear(d_ffn, d_model) self.dropout3 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm(d_model) @staticmethod def with_pos_embed(tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, src): src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) src = src + self.dropout3(src2) src = self.norm2(src) return src def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, 
padding_mask=None): # self attention src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask) src = src + self.dropout1(src2) src = self.norm1(src) # ffn src = self.forward_ffn(src) return src class MSDeformAttnTransformerEncoder(nn.Module): def __init__(self, encoder_layer, num_layers): super().__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers @staticmethod def get_reference_points(spatial_shapes, valid_ratios, device): reference_points_list = [] for lvl, (H_, W_) in enumerate(spatial_shapes): ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_) ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) ref = torch.stack((ref_x, ref_y), -1) reference_points_list.append(ref) reference_points = torch.cat(reference_points_list, 1) reference_points = reference_points[:, :, None] * valid_ratios[:, None] return reference_points def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None): output = src reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device) for _, layer in enumerate(self.layers): output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask) return output @SEM_SEG_HEADS_REGISTRY.register() class MSDeformAttnPixelDecoder(nn.Module): @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, transformer_dropout: float, transformer_nheads: int, transformer_dim_feedforward: int, transformer_enc_layers: int, conv_dim: int, mask_dim: int, norm: Optional[Union[str, Callable]] = None, # deformable transformer encoder args transformer_in_features: List[str], common_stride: int, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features transformer_dropout: dropout probability in transformer transformer_nheads: number of heads in transformer transformer_dim_feedforward: dimension of feedforward network transformer_enc_layers: number of transformer encoder layers conv_dims: number of output channels for the intermediate conv layers. mask_dim: number of output channels for the final conv layer. 
norm (str or callable): normalization for all conv layers """ super().__init__() transformer_input_shape = { k: v for k, v in input_shape.items() if k in transformer_in_features } # this is the input shape of pixel decoder input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" self.feature_strides = [v.stride for k, v in input_shape] self.feature_channels = [v.channels for k, v in input_shape] # this is the input shape of transformer encoder (could use less features than pixel decoder transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: x[1].stride) self.transformer_in_features = [k for k, v in transformer_input_shape] # starting from "res2" to "res5" transformer_in_channels = [v.channels for k, v in transformer_input_shape] self.transformer_feature_strides = [v.stride for k, v in transformer_input_shape] # to decide extra FPN layers self.transformer_num_feature_levels = len(self.transformer_in_features) if self.transformer_num_feature_levels > 1: input_proj_list = [] # from low resolution to high resolution (res5 -> res2) for in_channels in transformer_in_channels[::-1]: input_proj_list.append(nn.Sequential( nn.Conv2d(in_channels, conv_dim, kernel_size=1), nn.GroupNorm(32, conv_dim), )) self.input_proj = nn.ModuleList(input_proj_list) else: self.input_proj = nn.ModuleList([ nn.Sequential( nn.Conv2d(transformer_in_channels[-1], conv_dim, kernel_size=1), nn.GroupNorm(32, conv_dim), )]) for proj in self.input_proj: nn.init.xavier_uniform_(proj[0].weight, gain=1) nn.init.constant_(proj[0].bias, 0) self.transformer = MSDeformAttnTransformerEncoderOnly( d_model=conv_dim, dropout=transformer_dropout, nhead=transformer_nheads, dim_feedforward=transformer_dim_feedforward, num_encoder_layers=transformer_enc_layers, num_feature_levels=self.transformer_num_feature_levels, ) N_steps = conv_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) self.mask_dim = mask_dim # use 1x1 conv instead self.mask_features = Conv2d( conv_dim, mask_dim, kernel_size=1, stride=1, padding=0, ) weight_init.c2_xavier_fill(self.mask_features) self.maskformer_num_feature_levels = 3 # always use 3 scales self.common_stride = common_stride # extra fpn levels stride = min(self.transformer_feature_strides) self.num_fpn_levels = int(np.log2(stride) - np.log2(self.common_stride)) lateral_convs = [] output_convs = [] use_bias = norm == "" for idx, in_channels in enumerate(self.feature_channels[:self.num_fpn_levels]): lateral_norm = get_norm(norm, conv_dim) output_norm = get_norm(norm, conv_dim) lateral_conv = Conv2d( in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm ) output_conv = Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(lateral_conv) weight_init.c2_xavier_fill(output_conv) self.add_module("adapter_{}".format(idx + 1), lateral_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(lateral_conv) output_convs.append(output_conv) # Place convs into top-down order (from low to high resolution) # to make the top-down computation in forward clearer. 
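# Editorial note (not part of the original file): num_fpn_levels above counts the
# backbone levels finer than the deformable encoder's finest stride. With the usual
# Mask2Former setup (encoder on res3/res4/res5, strides 8/16/32, common_stride = 4)
# it is int(log2(8) - log2(4)) = 1, so only "res2" gets an adapter/output conv pair
# here; forward_features fuses it with the upsampled encoder output using bilinear
# interpolation (unlike the nearest-neighbour FPN in fpn.py).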
self.lateral_convs = lateral_convs[::-1] self.output_convs = output_convs[::-1] @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = {} ret["input_shape"] = { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES } ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS # ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD ret["transformer_dim_feedforward"] = 1024 # use 1024 for deformable transformer encoder ret[ "transformer_enc_layers" ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS # a separate config ret["transformer_in_features"] = cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES ret["common_stride"] = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE return ret @autocast(enabled=False) def forward_features(self, features): srcs = [] pos = [] # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.transformer_in_features[::-1]): x = features[f].float() # deformable detr does not support half precision srcs.append(self.input_proj[idx](x)) pos.append(self.pe_layer(x)) y, spatial_shapes, level_start_index = self.transformer(srcs, pos) bs = y.shape[0] split_size_or_sections = [None] * self.transformer_num_feature_levels for i in range(self.transformer_num_feature_levels): if i < self.transformer_num_feature_levels - 1: split_size_or_sections[i] = level_start_index[i + 1] - level_start_index[i] else: split_size_or_sections[i] = y.shape[1] - level_start_index[i] y = torch.split(y, split_size_or_sections, dim=1) out = [] multi_scale_features = [] num_cur_levels = 0 for i, z in enumerate(y): out.append(z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], spatial_shapes[i][1])) # append `out` with extra FPN levels # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]): x = features[f].float() lateral_conv = self.lateral_convs[idx] output_conv = self.output_convs[idx] cur_fpn = lateral_conv(x) # Following FPN implementation, we use nearest upsampling here y = cur_fpn + F.interpolate(out[-1], size=cur_fpn.shape[-2:], mode="bilinear", align_corners=False) y = output_conv(y) out.append(y) for o in out: if num_cur_levels < self.maskformer_num_feature_levels: multi_scale_features.append(o) num_cur_levels += 1 return self.mask_features(out[-1]), out[0], multi_scale_features ================================================ FILE: mask2former/modeling/pixel_decoder/ops/functions/__init__.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. 
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from .ms_deform_attn_func import MSDeformAttnFunction ================================================ FILE: mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function from __future__ import division import torch import torch.nn.functional as F from torch.autograd import Function from torch.autograd.function import once_differentiable try: import MultiScaleDeformableAttention as MSDA except ModuleNotFoundError as e: info_string = ( "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" "\t`cd mask2former/modeling/pixel_decoder/ops`\n" "\t`sh make.sh`\n" ) raise ModuleNotFoundError(info_string) class MSDeformAttnFunction(Function): @staticmethod def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): ctx.im2col_step = im2col_step output = MSDA.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable def backward(ctx, grad_output): value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors grad_value, grad_sampling_loc, grad_attn_weight = \ MSDA.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): # for debug and test only, # need to use cuda version instead N_, S_, M_, D_ = value.shape _, Lq_, M_, L_, P_, _ = sampling_locations.shape value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for lid_, (H_, W_) in enumerate(value_spatial_shapes): # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) # N_*M_, 
D_, Lq_, P_ sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False) sampling_value_list.append(sampling_value_l_) # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) return output.transpose(1, 2).contiguous() ================================================ FILE: mask2former/modeling/pixel_decoder/ops/make.sh ================================================ #!/usr/bin/env bash # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR python setup.py build install ================================================ FILE: mask2former/modeling/pixel_decoder/ops/modules/__init__.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from .ms_deform_attn import MSDeformAttn ================================================ FILE: mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. 
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function from __future__ import division import warnings import math import torch from torch import nn import torch.nn.functional as F from torch.nn.init import xavier_uniform_, constant_ from ..functions import MSDeformAttnFunction from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) return (n & (n-1) == 0) and n != 0 class MSDeformAttn(nn.Module): def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): """ Multi-Scale Deformable Attention Module :param d_model hidden dimension :param n_levels number of feature levels :param n_heads number of attention heads :param n_points number of sampling points per attention head per feature level """ super().__init__() if d_model % n_heads != 0: raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) _d_per_head = d_model // n_heads # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation if not _is_power_of_2(_d_per_head): warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " "which is more efficient in our CUDA implementation.") self.im2col_step = 128 self.d_model = d_model self.n_levels = n_levels self.n_heads = n_heads self.n_points = n_points self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) self.value_proj = nn.Linear(d_model, d_model) self.output_proj = nn.Linear(d_model, d_model) self._reset_parameters() def _reset_parameters(self): constant_(self.sampling_offsets.weight.data, 0.) thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) for i in range(self.n_points): grid_init[:, :, i, :] *= i + 1 with torch.no_grad(): self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) constant_(self.attention_weights.weight.data, 0.) constant_(self.attention_weights.bias.data, 0.) xavier_uniform_(self.value_proj.weight.data) constant_(self.value_proj.bias.data, 0.) xavier_uniform_(self.output_proj.weight.data) constant_(self.output_proj.bias.data, 0.) 
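    # Editorial usage sketch (not part of the original file; assumes the
    # MultiScaleDeformableAttention extension built via ops/make.sh; on CPU tensors
    # the try/except in forward() below falls back to ms_deform_attn_core_pytorch):
    #
    #   attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4)
    #   spatial_shapes = torch.as_tensor([[8, 8], [4, 4]], dtype=torch.long)   # two levels
    #   level_start_index = torch.as_tensor([0, 64], dtype=torch.long)         # 8*8 = 64
    #   value = torch.rand(1, 80, 256)    # (N, sum(H_l*W_l), C) = (1, 64 + 16, 256)
    #   query = torch.rand(1, 100, 256)   # (N, Len_q, C)
    #   ref = torch.rand(1, 100, 2, 2)    # (N, Len_q, n_levels, 2), normalized to [0, 1]
    #   out = attn(query, ref, value, spatial_shapes, level_start_index)       # (1, 100, 256)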
def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): """ :param query (N, Length_{query}, C) :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements :return output (N, Length_{query}, C) """ N, Len_q, _ = query.shape N, Len_in, _ = input_flatten.shape assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in value = self.value_proj(input_flatten) if input_padding_mask is not None: value = value.masked_fill(input_padding_mask[..., None], float(0)) value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) # N, Len_q, n_heads, n_levels, n_points, 2 if reference_points.shape[-1] == 2: offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) sampling_locations = reference_points[:, :, None, :, None, :] \ + sampling_offsets / offset_normalizer[None, None, None, :, None, :] elif reference_points.shape[-1] == 4: sampling_locations = reference_points[:, :, None, :, None, :2] \ + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 else: raise ValueError( 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) try: output = MSDeformAttnFunction.apply( value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) except: # CPU output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) # # For FLOPs calculation only # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) output = self.output_proj(output) return output ================================================ FILE: mask2former/modeling/pixel_decoder/ops/setup.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. 
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR import os import glob import torch from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CppExtension from torch.utils.cpp_extension import CUDAExtension from setuptools import find_packages from setuptools import setup requirements = ["torch", "torchvision"] def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "src") main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) sources = main_file + source_cpu extension = CppExtension extra_compile_args = {"cxx": []} define_macros = [] # Force cuda since torch ask for a device, not if cuda is in fact available. if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: extension = CUDAExtension sources += source_cuda define_macros += [("WITH_CUDA", None)] extra_compile_args["nvcc"] = [ "-DCUDA_HAS_FP16=1", "-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__", "-D__CUDA_NO_HALF2_OPERATORS__", ] else: if CUDA_HOME is None: raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') else: raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') sources = [os.path.join(extensions_dir, s) for s in sources] include_dirs = [extensions_dir] ext_modules = [ extension( "MultiScaleDeformableAttention", sources, include_dirs=include_dirs, define_macros=define_macros, extra_compile_args=extra_compile_args, ) ] return ext_modules setup( name="MultiScaleDeformableAttention", version="1.0", author="Weijie Su", url="https://github.com/fundamentalvision/Deformable-DETR", description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", packages=find_packages(exclude=("configs", "tests",)), ext_modules=get_extensions(), cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, ) ================================================ FILE: mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. 
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/

#include <vector>

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>


at::Tensor
ms_deform_attn_cpu_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    AT_ERROR("Not implement on cpu");
}

std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{
    AT_ERROR("Not implement on cpu");
}

================================================ FILE: mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h ================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/

#pragma once
#include <torch/extension.h>

at::Tensor
ms_deform_attn_cpu_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step);

std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step);

================================================ FILE: mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu ================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include #include "cuda/ms_deform_im2col_cuda.cuh" #include #include #include #include at::Tensor ms_deform_attn_cuda_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step) { AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); const int batch = value.size(0); const int spatial_size = value.size(1); const int num_heads = value.size(2); const int channels = value.size(3); const int num_levels = spatial_shapes.size(0); const int num_query = sampling_loc.size(1); const int num_point = sampling_loc.size(4); const int im2col_step_ = std::min(batch, im2col_step); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); const int batch_n = im2col_step_; auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); auto per_value_size = spatial_size * num_heads * channels; auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; for (int n = 0; n < batch/im2col_step_; ++n) { auto columns = output_n.select(0, n); AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), value.data() + n * im2col_step_ * per_value_size, spatial_shapes.data(), level_start_index.data(), sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, attn_weight.data() + n * im2col_step_ * per_attn_weight_size, batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, columns.data()); })); } output = output.view({batch, num_query, num_heads*channels}); return output; } std::vector ms_deform_attn_cuda_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step) { AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); const int batch = value.size(0); const int spatial_size = value.size(1); const int num_heads = value.size(2); const int channels = value.size(3); const int num_levels = spatial_shapes.size(0); const int num_query = sampling_loc.size(1); const int num_point = sampling_loc.size(4); const int im2col_step_ = std::min(batch, im2col_step); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); auto grad_value = at::zeros_like(value); auto grad_sampling_loc = at::zeros_like(sampling_loc); auto grad_attn_weight = at::zeros_like(attn_weight); const int batch_n = im2col_step_; auto per_value_size = spatial_size * num_heads * channels; auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); for (int n = 0; n < batch/im2col_step_; ++n) { auto grad_output_g = grad_output_n.select(0, n); AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), grad_output_g.data(), value.data() + n * im2col_step_ * per_value_size, spatial_shapes.data(), level_start_index.data(), sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, attn_weight.data() + n * im2col_step_ * per_attn_weight_size, batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value.data() + n * im2col_step_ * per_value_size, grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); })); } return { grad_value, grad_sampling_loc, grad_attn_weight }; } ================================================ FILE: mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. 
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #pragma once #include at::Tensor ms_deform_attn_cuda_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step); std::vector ms_deform_attn_cuda_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step); ================================================ FILE: mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh ================================================ /*! ************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************** * Modified from DCN (https://github.com/msracver/Deformable-ConvNets) * Copyright (c) 2018 Microsoft ************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include #include #include #include #include #include #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } template __device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; } const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t* &grad_value, 
scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value+ptr1, w1*top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value+ptr2, w2*top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value+ptr3, w3*top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value+ptr4, w4*top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); *grad_attn_weight = top_grad * val; *grad_sampling_loc = width * grad_w_weight * top_grad_value; *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; } template __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t* &grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value+ptr1, w1*top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; 
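The weights w1..w4 and the h/w gradient terms in these kernels are the standard bilinear-interpolation factors. As a plain-Python cross-check of what ms_deform_attn_im2col_bilinear computes for a single channel (a sketch with our own helper name, not part of the extension):

import math
import torch

def bilinear_sample(value_hw: torch.Tensor, h: float, w: float) -> torch.Tensor:
    """Sample a (H, W) map at a fractional location (h, w); out-of-range corners
    contribute zero, matching the border handling of the CUDA kernel."""
    H, W = value_hw.shape
    h_low, w_low = math.floor(h), math.floor(w)
    lh, lw = h - h_low, w - w_low
    hh, hw = 1 - lh, 1 - lw

    def corner(y, x):
        return value_hw[y, x] if 0 <= y < H and 0 <= x < W else value_hw.new_zeros(())

    v1, v2 = corner(h_low, w_low), corner(h_low, w_low + 1)
    v3, v4 = corner(h_low + 1, w_low), corner(h_low + 1, w_low + 1)
    # w1..w4 mirror the CUDA kernel: hh*hw, hh*lw, lh*hw, lh*lw
    return hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4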
atomicAdd(grad_value+ptr2, w2*top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value+ptr3, w3*top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value+ptr4, w4*top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); atomicAdd(grad_attn_weight, top_grad * val); atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); } template __global__ void ms_deformable_im2col_gpu_kernel(const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *data_col) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; scalar_t *data_col_ptr = data_col + index; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; scalar_t col = 0; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; } data_weight_ptr += 1; data_loc_w_ptr += 2; } } *data_col_ptr = col; } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int 
sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; int sid=2; for (unsigned int tid = 1; tid < blockSize; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = 
data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockSize/2; s>0; s>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int 
grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; int sid=2; for (unsigned int tid = 1; tid < blockDim.x; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = 
data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int 
data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * 
qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, grad_sampling_loc, grad_attn_weight); } data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template void ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t* data_value, const int64_t* data_spatial_shapes, const int64_t* data_level_start_index, const scalar_t* data_sampling_loc, const scalar_t* data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t* data_col) { const int num_kernels = batch_size * num_query * num_heads * channels; const int num_actual_kernels = batch_size * num_query * num_heads * channels; const int num_threads = CUDA_NUM_THREADS; ms_deformable_im2col_gpu_kernel <<>>( num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); } } template void ms_deformable_col2im_cuda(cudaStream_t stream, const scalar_t* grad_col, const scalar_t* data_value, const int64_t * data_spatial_shapes, const int64_t * data_level_start_index, const scalar_t * data_sampling_loc, const scalar_t * data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t* grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; const int num_kernels = batch_size * num_query * num_heads * channels; const int num_actual_kernels = batch_size * num_query * num_heads * channels; if (channels > 1024) { if ((channels & 1023) == 0) { ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } else { ms_deformable_col2im_gpu_kernel_gm <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, 
grad_attn_weight); } } else{ switch(channels) { case 1: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 2: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 4: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 8: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 16: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 32: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 64: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 128: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 256: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 512: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 1024: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, 
data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; default: if (channels < 64) { ms_deformable_col2im_gpu_kernel_shm_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } else { ms_deformable_col2im_gpu_kernel_shm_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } } } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); } } ================================================ FILE: mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #pragma once #include "cpu/ms_deform_attn_cpu.h" #ifdef WITH_CUDA #include "cuda/ms_deform_attn_cuda.h" #endif at::Tensor ms_deform_attn_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step) { if (value.type().is_cuda()) { #ifdef WITH_CUDA return ms_deform_attn_cuda_forward( value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } std::vector ms_deform_attn_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step) { if (value.type().is_cuda()) { #ifdef WITH_CUDA return ms_deform_attn_cuda_backward( value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: mask2former/modeling/pixel_decoder/ops/src/vision.cpp ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. 
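On the Python side, the ms_deform_attn_forward/backward entry points declared above are reached through a torch.autograd.Function; the actual wrapper lives in functions/ms_deform_attn_func.py in this ops/ directory (it is imported by test.py below). The following is only a simplified sketch of that pattern, assuming the extension was built by setup.py under the usual module name MultiScaleDeformableAttention:

import torch
from torch.autograd import Function
from torch.autograd.function import once_differentiable
import MultiScaleDeformableAttention as MSDA  # module name assumed from setup.py

class MSDeformAttnFunctionSketch(Function):
    @staticmethod
    def forward(ctx, value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step):
        ctx.im2col_step = im2col_step
        output = MSDA.ms_deform_attn_forward(
            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step)
        ctx.save_for_backward(value, spatial_shapes, level_start_index, sampling_loc, attn_weight)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        value, spatial_shapes, level_start_index, sampling_loc, attn_weight = ctx.saved_tensors
        grad_value, grad_sampling_loc, grad_attn_weight = MSDA.ms_deform_attn_backward(
            value, spatial_shapes, level_start_index, sampling_loc, attn_weight,
            grad_output.contiguous(), ctx.im2col_step)
        # no gradients for spatial_shapes, level_start_index, or im2col_step
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None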
* Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include "ms_deform_attn.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); } ================================================ FILE: mask2former/modeling/pixel_decoder/ops/test.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function from __future__ import division import time import torch import torch.nn as nn from torch.autograd import gradcheck from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch N, M, D = 1, 2, 2 Lq, L, P = 2, 2, 2 shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) S = sum([(H*W).item() for H, W in shapes]) torch.manual_seed(3) @torch.no_grad() def check_forward_equal_with_pytorch_double(): value = torch.rand(N, S, M, D).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') @torch.no_grad() def check_forward_equal_with_pytorch_float(): value = torch.rand(N, S, M, D).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, 
sampling_locations, attention_weights, im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): value = torch.rand(N, S, M, channels).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 func = MSDeformAttnFunction.apply value.requires_grad = grad_value sampling_locations.requires_grad = grad_sampling_loc attention_weights.requires_grad = grad_attn_weight gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) print(f'* {gradok} check_gradient_numerical(D={channels})') if __name__ == '__main__': check_forward_equal_with_pytorch_double() check_forward_equal_with_pytorch_float() for channels in [30, 32, 64, 71, 1025, 2048, 3096]: check_gradient_numerical(channels, True, True, True) ================================================ FILE: mask2former/modeling/transformer_decoder/__init__.py ================================================ from .maskformer_transformer_decoder import StandardTransformerDecoder from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder ================================================ FILE: mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py ================================================ # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py import logging import fvcore.nn.weight_init as weight_init from typing import Optional import torch from torch import nn, Tensor from torch.nn import functional as F from detectron2.config import configurable from detectron2.layers import Conv2d from .position_encoding import PositionEmbeddingSine from .maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY class SelfAttentionLayer(nn.Module): def __init__(self, d_model, nhead, dropout=0.0, activation="relu", normalize_before=False): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.norm = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): q = k = self.with_pos_embed(tgt, query_pos) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.norm(tgt) q = k = self.with_pos_embed(tgt2, query_pos) tgt2 = self.self_attn(q, k, value=tgt2, 
attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): if self.normalize_before: return self.forward_pre(tgt, tgt_mask, tgt_key_padding_mask, query_pos) return self.forward_post(tgt, tgt_mask, tgt_key_padding_mask, query_pos) class CrossAttentionLayer(nn.Module): def __init__(self, d_model, nhead, dropout=0.0, activation="relu", normalize_before=False): super().__init__() self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.norm = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.norm(tgt) tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): if self.normalize_before: return self.forward_pre(tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos) return self.forward_post(tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos) class FFNLayer(nn.Module): def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, activation="relu", normalize_before=False): super().__init__() # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm = nn.LayerNorm(d_model) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt): tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt): tgt2 = self.norm(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt): if self.normalize_before: return self.forward_pre(tgt) return self.forward_post(tgt) def _get_activation_fn(activation): """Return an activation function given a string""" if activation == "relu": return 
F.relu if activation == "gelu": return F.gelu if activation == "glu": return F.glu raise RuntimeError(F"activation should be relu/gelu, not {activation}.") class MLP(nn.Module): """ Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) def forward(self, x): for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x @TRANSFORMER_DECODER_REGISTRY.register() class MultiScaleMaskedTransformerDecoder(nn.Module): _version = 2 def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k if "static_query" in k: newk = k.replace("static_query", "query_feat") if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, in_channels, mask_classification=True, *, num_classes: int, hidden_dim: int, num_queries: int, nheads: int, dim_feedforward: int, dec_layers: int, pre_norm: bool, mask_dim: int, enforce_input_project: bool, ): """ NOTE: this interface is experimental. Args: in_channels: channels of the input features mask_classification: whether to add mask classifier or not num_classes: number of classes hidden_dim: Transformer feature dimension num_queries: number of queries nheads: number of heads dim_feedforward: feature dimension in feedforward network enc_layers: number of Transformer encoder layers dec_layers: number of Transformer decoder layers pre_norm: whether to use pre-LayerNorm or not mask_dim: mask feature dimension enforce_input_project: add input project 1x1 conv even if input channels and hidden dim is identical """ super().__init__() assert mask_classification, "Only support mask classification model" self.mask_classification = mask_classification # positional encoding N_steps = hidden_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) # define Transformer decoder here self.num_heads = nheads self.num_layers = dec_layers self.transformer_self_attention_layers = nn.ModuleList() self.transformer_cross_attention_layers = nn.ModuleList() self.transformer_ffn_layers = nn.ModuleList() for _ in range(self.num_layers): self.transformer_self_attention_layers.append( SelfAttentionLayer( d_model=hidden_dim, nhead=nheads, dropout=0.0, normalize_before=pre_norm, ) ) self.transformer_cross_attention_layers.append( CrossAttentionLayer( d_model=hidden_dim, nhead=nheads, dropout=0.0, normalize_before=pre_norm, ) ) self.transformer_ffn_layers.append( FFNLayer( d_model=hidden_dim, dim_feedforward=dim_feedforward, dropout=0.0, normalize_before=pre_norm, ) ) self.decoder_norm = nn.LayerNorm(hidden_dim) self.num_queries = num_queries # learnable query features self.query_feat = nn.Embedding(num_queries, hidden_dim) # learnable query p.e. 
self.query_embed = nn.Embedding(num_queries, hidden_dim) # level embedding (we always use 3 scales) self.num_feature_levels = 3 self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim) self.input_proj = nn.ModuleList() for _ in range(self.num_feature_levels): if in_channels != hidden_dim or enforce_input_project: self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1)) weight_init.c2_xavier_fill(self.input_proj[-1]) else: self.input_proj.append(nn.Sequential()) # output FFNs if self.mask_classification: self.class_embed = nn.Linear(hidden_dim, num_classes + 1) self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) @classmethod def from_config(cls, cfg, in_channels, mask_classification): ret = {} ret["in_channels"] = in_channels ret["mask_classification"] = mask_classification ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES # Transformer parameters: ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD # NOTE: because we add learnable query features which requires supervision, # we add minus 1 to decoder layers to be consistent with our loss # implementation: that is, number of auxiliary losses is always # equal to number of decoder layers. With learnable query features, the number of # auxiliary losses equals number of decoders plus 1. assert cfg.MODEL.MASK_FORMER.DEC_LAYERS >= 1 ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS - 1 ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM return ret def forward(self, x, mask_features, mask = None): # x is a list of multi-scale feature assert len(x) == self.num_feature_levels src = [] pos = [] size_list = [] # disable mask, it does not affect performance del mask for i in range(self.num_feature_levels): size_list.append(x[i].shape[-2:]) pos.append(self.pe_layer(x[i], None).flatten(2)) src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None]) # flatten NxCxHxW to HWxNxC pos[-1] = pos[-1].permute(2, 0, 1) src[-1] = src[-1].permute(2, 0, 1) _, bs, _ = src[0].shape # QxNxC query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) # query_embed = None # print('come here==========') output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1) predictions_class = [] predictions_mask = [] # prediction heads on learnable query features outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0]) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) for i in range(self.num_layers): level_index = i % self.num_feature_levels attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False # attention: cross-attention first output = self.transformer_cross_attention_layers[i]( output, src[level_index], memory_mask=attn_mask, memory_key_padding_mask=None, # here we do not apply masking on padded region pos=pos[level_index], query_pos=query_embed ) output = self.transformer_self_attention_layers[i]( output, tgt_mask=None, tgt_key_padding_mask=None, query_pos=query_embed ) # FFN output = self.transformer_ffn_layers[i]( output ) outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels]) 
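The guard earlier in this decoder loop, attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False, handles queries whose predicted mask is empty at the current scale: a row that is entirely True would block attention to every key and produce NaNs in the masked softmax, so such rows are flipped back to unmasked. A small standalone illustration with made-up shapes:

import torch

# (batch*heads, queries, keys) boolean attention mask; True means "do not attend".
attn_mask = torch.zeros(2, 3, 4, dtype=torch.bool)
attn_mask[0, 1] = True   # query 1 in the first group would be blocked from every key

# Same fix as in the decoder loop: rows that mask out everything are reset to all-False.
full_rows = attn_mask.sum(-1) == attn_mask.shape[-1]
attn_mask[torch.where(full_rows)] = False

assert not (attn_mask.sum(-1) == attn_mask.shape[-1]).any()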
predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) assert len(predictions_class) == self.num_layers + 1 # print('len mask predictions:', len(predictions_mask)) out = { 'pred_logits': predictions_class[-1], 'pred_masks': predictions_mask[-1], 'aux_outputs': self._set_aux_loss( predictions_class if self.mask_classification else None, predictions_mask ) } return out def forward_prediction_heads(self, output, mask_features, attn_mask_target_size): decoder_output = self.decoder_norm(output) decoder_output = decoder_output.transpose(0, 1) outputs_class = self.class_embed(decoder_output) mask_embed = self.mask_embed(decoder_output) outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) # NOTE: prediction is of higher-resolution # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW] attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False) # must use bool type # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged. attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool() attn_mask = attn_mask.detach() return outputs_class, outputs_mask, attn_mask @torch.jit.unused def _set_aux_loss(self, outputs_class, outputs_seg_masks): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. if self.mask_classification: return [ {"pred_logits": a, "pred_masks": b} for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) ] else: return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] ================================================ FILE: mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py ================================================ # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from detectron2.config import configurable from detectron2.layers import Conv2d from detectron2.utils.registry import Registry from .position_encoding import PositionEmbeddingSine from .transformer import Transformer TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") TRANSFORMER_DECODER_REGISTRY.__doc__ = """ Registry for transformer module in MaskFormer. """ def build_transformer_decoder(cfg, in_channels, mask_classification=True): """ Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. """ name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) @TRANSFORMER_DECODER_REGISTRY.register() class StandardTransformerDecoder(nn.Module): @configurable def __init__( self, in_channels, mask_classification=True, *, num_classes: int, hidden_dim: int, num_queries: int, nheads: int, dropout: float, dim_feedforward: int, enc_layers: int, dec_layers: int, pre_norm: bool, deep_supervision: bool, mask_dim: int, enforce_input_project: bool, ): """ NOTE: this interface is experimental. 
Args: in_channels: channels of the input features mask_classification: whether to add mask classifier or not num_classes: number of classes hidden_dim: Transformer feature dimension num_queries: number of queries nheads: number of heads dropout: dropout in Transformer dim_feedforward: feature dimension in feedforward network enc_layers: number of Transformer encoder layers dec_layers: number of Transformer decoder layers pre_norm: whether to use pre-LayerNorm or not deep_supervision: whether to add supervision to every decoder layers mask_dim: mask feature dimension enforce_input_project: add input project 1x1 conv even if input channels and hidden dim is identical """ super().__init__() self.mask_classification = mask_classification # positional encoding N_steps = hidden_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) transformer = Transformer( d_model=hidden_dim, dropout=dropout, nhead=nheads, dim_feedforward=dim_feedforward, num_encoder_layers=enc_layers, num_decoder_layers=dec_layers, normalize_before=pre_norm, return_intermediate_dec=deep_supervision, ) self.num_queries = num_queries self.transformer = transformer hidden_dim = transformer.d_model self.query_embed = nn.Embedding(num_queries, hidden_dim) if in_channels != hidden_dim or enforce_input_project: self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) weight_init.c2_xavier_fill(self.input_proj) else: self.input_proj = nn.Sequential() self.aux_loss = deep_supervision # output FFNs if self.mask_classification: self.class_embed = nn.Linear(hidden_dim, num_classes + 1) self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) @classmethod def from_config(cls, cfg, in_channels, mask_classification): ret = {} ret["in_channels"] = in_channels ret["mask_classification"] = mask_classification ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES # Transformer parameters: ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM return ret def forward(self, x, mask_features, mask=None): if mask is not None: mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] pos = self.pe_layer(x, mask) src = x hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) if self.mask_classification: outputs_class = self.class_embed(hs) out = {"pred_logits": outputs_class[-1]} else: out = {} if self.aux_loss: # [l, bs, queries, embed] mask_embed = self.mask_embed(hs) outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) out["pred_masks"] = outputs_seg_masks[-1] out["aux_outputs"] = self._set_aux_loss( outputs_class if self.mask_classification else None, outputs_seg_masks ) else: # FIXME h_boxes takes the last one computed, keep this in mind # [bs, queries, embed] mask_embed = self.mask_embed(hs[-1]) outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) out["pred_masks"] = outputs_seg_masks return out @torch.jit.unused def _set_aux_loss(self, outputs_class, 
outputs_seg_masks): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. if self.mask_classification: return [ {"pred_logits": a, "pred_masks": b} for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) ] else: return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] class MLP(nn.Module): """Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.ModuleList( nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) ) def forward(self, x): for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x ================================================ FILE: mask2former/modeling/transformer_decoder/position_encoding.py ================================================ # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py """ Various positional encodings for the transformer. """ import math import torch from torch import nn class PositionEmbeddingSine(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. """ def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): super().__init__() self.num_pos_feats = num_pos_feats self.temperature = temperature self.normalize = normalize if scale is not None and normalize is False: raise ValueError("normalize should be True if scale is passed") if scale is None: scale = 2 * math.pi self.scale = scale def forward(self, x, mask=None): if mask is None: mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) not_mask = ~mask y_embed = not_mask.cumsum(1, dtype=torch.float32) x_embed = not_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) pos_x = x_embed[:, :, :, None] / dim_t pos_y = y_embed[:, :, :, None] / dim_t pos_x = torch.stack( (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 ).flatten(3) pos_y = torch.stack( (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 ).flatten(3) pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) return pos def __repr__(self, _repr_indent=4): head = "Positional encoding " + self.__class__.__name__ body = [ "num_pos_feats: {}".format(self.num_pos_feats), "temperature: {}".format(self.temperature), "normalize: {}".format(self.normalize), "scale: {}".format(self.scale), ] # _repr_indent = 4 lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: mask2former/modeling/transformer_decoder/transformer.py ================================================ # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/transformer.py """ Transformer class. 
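A minimal usage sketch (not from the repository) for the PositionEmbeddingSine module defined in position_encoding.py above; the feature-map size and num_pos_feats are assumed values, chosen so the output channel count matches a 256-dim model.

import torch

pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
feats = torch.randn(2, 256, 32, 32)       # (N, C, H, W) dummy backbone features
pos = pe(feats)                           # sine/cosine encodings with 2 * num_pos_feats channels
assert pos.shape == (2, 256, 32, 32)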
Copy-paste from torch.nn.Transformer with modifications: * positional encodings are passed in MHattention * extra LN at the end of encoder is removed * decoder returns a stack of activations from all decoding layers """ import copy from typing import List, Optional import torch import torch.nn.functional as F from torch import Tensor, nn class Transformer(nn.Module): def __init__( self, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, return_intermediate_dec=False, ): super().__init__() encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) encoder_norm = nn.LayerNorm(d_model) if normalize_before else None self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) decoder_layer = TransformerDecoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) decoder_norm = nn.LayerNorm(d_model) self.decoder = TransformerDecoder( decoder_layer, num_decoder_layers, decoder_norm, return_intermediate=return_intermediate_dec, ) self._reset_parameters() self.d_model = d_model self.nhead = nhead def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def forward(self, src, mask, query_embed, pos_embed): # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape src = src.flatten(2).permute(2, 0, 1) pos_embed = pos_embed.flatten(2).permute(2, 0, 1) query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) if mask is not None: mask = mask.flatten(1) tgt = torch.zeros_like(query_embed) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) hs = self.decoder( tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed ) return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) class TransformerEncoder(nn.Module): def __init__(self, encoder_layer, num_layers, norm=None): super().__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers self.norm = norm def forward( self, src, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): output = src for layer in self.layers: output = layer( output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos ) if self.norm is not None: output = self.norm(output) return output class TransformerDecoder(nn.Module): def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): super().__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.norm = norm self.return_intermediate = return_intermediate def forward( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): output = tgt intermediate = [] for layer in self.layers: output = layer( output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask, pos=pos, query_pos=query_pos, ) if self.return_intermediate: intermediate.append(self.norm(output)) if self.norm is not None: output = self.norm(output) if self.return_intermediate: intermediate.pop() intermediate.append(output) if self.return_intermediate: return torch.stack(intermediate) return output.unsqueeze(0) class 
TransformerEncoderLayer(nn.Module): def __init__( self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, ): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post( self, src, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): q = k = self.with_pos_embed(src, pos) src2 = self.self_attn( q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask )[0] src = src + self.dropout1(src2) src = self.norm1(src) src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = src + self.dropout2(src2) src = self.norm2(src) return src def forward_pre( self, src, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): src2 = self.norm1(src) q = k = self.with_pos_embed(src2, pos) src2 = self.self_attn( q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask )[0] src = src + self.dropout1(src2) src2 = self.norm2(src) src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) src = src + self.dropout2(src2) return src def forward( self, src, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): if self.normalize_before: return self.forward_pre(src, src_mask, src_key_padding_mask, pos) return self.forward_post(src, src_mask, src_key_padding_mask, pos) class TransformerDecoderLayer(nn.Module): def __init__( self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, ): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.dropout3 = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): q = k = self.with_pos_embed(tgt, query_pos) tgt2 = self.self_attn( q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask )[0] tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) tgt2 = self.multihead_attn( query=self.with_pos_embed(tgt, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, 
key_padding_mask=memory_key_padding_mask, )[0] tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout3(tgt2) tgt = self.norm3(tgt) return tgt def forward_pre( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): tgt2 = self.norm1(tgt) q = k = self.with_pos_embed(tgt2, query_pos) tgt2 = self.self_attn( q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask )[0] tgt = tgt + self.dropout1(tgt2) tgt2 = self.norm2(tgt) tgt2 = self.multihead_attn( query=self.with_pos_embed(tgt2, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask, )[0] tgt = tgt + self.dropout2(tgt2) tgt2 = self.norm3(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) tgt = tgt + self.dropout3(tgt2) return tgt def forward( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): if self.normalize_before: return self.forward_pre( tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos, ) return self.forward_post( tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos, ) def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) def _get_activation_fn(activation): """Return an activation function given a string""" if activation == "relu": return F.relu if activation == "gelu": return F.gelu if activation == "glu": return F.glu raise RuntimeError(f"activation should be relu/gelu, not {activation}.") ================================================ FILE: mask2former/test_time_augmentation.py ================================================ import copy import logging from itertools import count import numpy as np import torch from fvcore.transforms import HFlipTransform from torch import nn from torch.nn.parallel import DistributedDataParallel from detectron2.data.detection_utils import read_image from detectron2.modeling import DatasetMapperTTA __all__ = [ "SemanticSegmentorWithTTA", ] class SemanticSegmentorWithTTA(nn.Module): """ A SemanticSegmentor with test-time augmentation enabled. Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. """ def __init__(self, cfg, model, tta_mapper=None, batch_size=1): """ Args: cfg (CfgNode): model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. tta_mapper (callable): takes a dataset dict and returns a list of augmented versions of the dataset dict. Defaults to `DatasetMapperTTA(cfg)`. batch_size (int): batch the augmented images into this batch size for inference. 
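A minimal shape-check sketch (not from the repository) for the Transformer defined in transformer.py above, with assumed toy dimensions; it illustrates that with return_intermediate_dec=True the decoder output hs stacks one activation per decoder layer, while memory is reshaped back to the (N, C, H, W) feature map.

import torch

transformer = Transformer(d_model=32, nhead=4, num_encoder_layers=1,
                          num_decoder_layers=2, dim_feedforward=64,
                          return_intermediate_dec=True)
src = torch.randn(2, 32, 8, 8)            # (N, C, H, W) dummy features
pos = torch.zeros_like(src)               # dummy positional encodings
query_embed = torch.randn(5, 32)          # (num_queries, d_model)
hs, memory = transformer(src, None, query_embed, pos)
assert hs.shape == (2, 2, 5, 32)          # (decoder layers, batch, queries, d_model)
assert memory.shape == (2, 32, 8, 8)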
""" super().__init__() if isinstance(model, DistributedDataParallel): model = model.module self.cfg = cfg.clone() self.model = model if tta_mapper is None: tta_mapper = DatasetMapperTTA(cfg) self.tta_mapper = tta_mapper self.batch_size = batch_size def __call__(self, batched_inputs): """ Same input/output format as :meth:`SemanticSegmentor.forward` """ def _maybe_read_image(dataset_dict): ret = copy.copy(dataset_dict) if "image" not in ret: image = read_image(ret.pop("file_name"), self.model.input_format) image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW ret["image"] = image if "height" not in ret and "width" not in ret: ret["height"] = image.shape[1] ret["width"] = image.shape[2] return ret processed_results = [] for x in batched_inputs: result = self._inference_one_image(_maybe_read_image(x)) processed_results.append(result) return processed_results def _inference_one_image(self, input): """ Args: input (dict): one dataset dict with "image" field being a CHW tensor Returns: dict: one output dict """ orig_shape = (input["height"], input["width"]) augmented_inputs, tfms = self._get_augmented_inputs(input) final_predictions = None count_predictions = 0 for input, tfm in zip(augmented_inputs, tfms): count_predictions += 1 with torch.no_grad(): if final_predictions is None: if any(isinstance(t, HFlipTransform) for t in tfm.transforms): final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) else: final_predictions = self.model([input])[0].pop("sem_seg") else: if any(isinstance(t, HFlipTransform) for t in tfm.transforms): final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) else: final_predictions += self.model([input])[0].pop("sem_seg") final_predictions = final_predictions / count_predictions return {"sem_seg": final_predictions} def _get_augmented_inputs(self, input): augmented_inputs = self.tta_mapper(input) tfms = [x.pop("transforms") for x in augmented_inputs] return augmented_inputs, tfms ================================================ FILE: mask2former/utils/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: mask2former/utils/__init__.py.new ================================================ ================================================ FILE: mask2former/utils/misc.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py """ Misc functions, including distributed helpers. Mostly copy-paste from torchvision references. 
""" from typing import List, Optional import torch import torch.distributed as dist import torchvision from torch import Tensor def _max_by_axis(the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] for sublist in the_list[1:]: for index, item in enumerate(sublist): maxes[index] = max(maxes[index], item) return maxes class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors self.mask = mask def to(self, device): # type: (Device) -> NestedTensor # noqa cast_tensor = self.tensors.to(device) mask = self.mask if mask is not None: assert mask is not None cast_mask = mask.to(device) else: cast_mask = None return NestedTensor(cast_tensor, cast_mask) def decompose(self): return self.tensors, self.mask def __repr__(self): return str(self.tensors) def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): # TODO make this more general if tensor_list[0].ndim == 3: if torchvision._is_tracing(): # nested_tensor_from_tensor_list() does not export well to ONNX # call _onnx_nested_tensor_from_tensor_list() instead return _onnx_nested_tensor_from_tensor_list(tensor_list) # TODO make it support different-sized images max_size = _max_by_axis([list(img.shape) for img in tensor_list]) # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) batch_shape = [len(tensor_list)] + max_size b, c, h, w = batch_shape dtype = tensor_list[0].dtype device = tensor_list[0].device tensor = torch.zeros(batch_shape, dtype=dtype, device=device) mask = torch.ones((b, h, w), dtype=torch.bool, device=device) for img, pad_img, m in zip(tensor_list, tensor, mask): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) m[: img.shape[1], : img.shape[2]] = False else: raise ValueError("not supported") return NestedTensor(tensor, mask) # _onnx_nested_tensor_from_tensor_list() is an implementation of # nested_tensor_from_tensor_list() that is supported by ONNX tracing. @torch.jit.unused def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: max_size = [] for i in range(tensor_list[0].dim()): max_size_i = torch.max( torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) ).to(torch.int64) max_size.append(max_size_i) max_size = tuple(max_size) # work around for # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) # m[: img.shape[1], :img.shape[2]] = False # which is not yet supported in onnx padded_imgs = [] padded_masks = [] for img in tensor_list: padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) padded_imgs.append(padded_img) m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) padded_masks.append(padded_mask.to(torch.bool)) tensor = torch.stack(padded_imgs) mask = torch.stack(padded_masks) return NestedTensor(tensor, mask=mask) def is_dist_avail_and_initialized(): if not dist.is_available(): return False if not dist.is_initialized(): return False return True ================================================ FILE: mask2former_video/__init__.py ================================================ from . 
import modeling # config from .config import add_maskformer2_video_config # models from .video_maskformer_model import VideoMaskFormer # video from .data_video import ( YTVISDatasetMapper, CocoClipDatasetMapper, YTVISEvaluator, build_detection_train_loader, build_detection_test_loader, build_combined_loader, get_detection_dataset_dicts, ) ================================================ FILE: mask2former_video/config.py ================================================ # -*- coding: utf-8 -*- from detectron2.config import CfgNode as CN def add_maskformer2_video_config(cfg): # video data # DataLoader cfg.INPUT.SAMPLING_FRAME_NUM = 3 cfg.INPUT.SAMPLING_FRAME_RANGE = 5 cfg.INPUT.SAMPLING_FRAME_SHUFFLE = True cfg.INPUT.AUGMENTATIONS = [] cfg.INPUT.PSEUDO = CN() cfg.INPUT.PSEUDO.AUGMENTATIONS = ['rotation'] cfg.INPUT.PSEUDO.MIN_SIZE_TRAIN = (480, 512, 544, 576, 608, 640, 672, 704, 736, 768) cfg.INPUT.PSEUDO.MAX_SIZE_TRAIN = 768 cfg.INPUT.PSEUDO.MIN_SIZE_TRAIN_SAMPLING = "choice_by_clip" cfg.INPUT.PSEUDO.SAMPLING_FRAME_NUM = 4 cfg.INPUT.PSEUDO.SAMPLING_FRAME_RANGE = 20 cfg.INPUT.PSEUDO.CROP = CN() cfg.INPUT.PSEUDO.CROP.ENABLED = False cfg.INPUT.PSEUDO.CROP.TYPE = "absolute_range" cfg.INPUT.PSEUDO.CROP.SIZE = (384, 600) # LSJ cfg.INPUT.LSJ_AUG = CN() cfg.INPUT.LSJ_AUG.ENABLED = False cfg.INPUT.LSJ_AUG.IMAGE_SIZE = 1024 cfg.INPUT.LSJ_AUG.MIN_SCALE = 0.1 cfg.INPUT.LSJ_AUG.MAX_SCALE = 2.0 ================================================ FILE: mask2former_video/data_video/__init__.py ================================================ # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper from .build import * from .datasets import * from .ytvis_eval import YTVISEvaluator ================================================ FILE: mask2former_video/data_video/augmentation.py ================================================ # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import numpy as np import logging import sys from fvcore.transforms.transform import ( HFlipTransform, NoOpTransform, VFlipTransform, ) from PIL import Image from typing import Tuple from detectron2.data import transforms as T class RandomApplyClip(T.Augmentation): """ Randomly apply an augmentation with a given probability. """ def __init__(self, tfm_or_aug, prob=0.5, clip_frame_cnt=1): """ Args: tfm_or_aug (Transform, Augmentation): the transform or augmentation to be applied. It can either be a `Transform` or `Augmentation` instance. prob (float): probability between 0.0 and 1.0 that the wrapper transformation is applied """ super().__init__() self.aug = T.augmentation._transform_to_aug(tfm_or_aug) assert 0.0 <= prob <= 1.0, f"Probablity must be between 0.0 and 1.0 (given: {prob})" self.prob = prob self._cnt = 0 self.clip_frame_cnt = clip_frame_cnt def get_transform(self, *args): if self._cnt % self.clip_frame_cnt == 0: self.do = self._rand_range() < self.prob self._cnt = 0 # avoiding overflow self._cnt += 1 if self.do: return self.aug.get_transform(*args) else: return NoOpTransform() def __call__(self, aug_input): if self._cnt % self.clip_frame_cnt == 0: self.do = self._rand_range() < self.prob self._cnt = 0 # avoiding overflow self._cnt += 1 if self.do: return self.aug(aug_input) else: return NoOpTransform() class RandomRotationClip(T.Augmentation): """ This method returns a copy of this image, rotated the given number of degrees counter clockwise around the given center. 
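A minimal sketch (not from the repository) of how add_maskformer2_video_config above is typically wired into a detectron2 config; get_cfg() is the standard detectron2 entry point, and the printed defaults follow the assignments in the function.

from detectron2.config import get_cfg

cfg = get_cfg()                                  # base detectron2 config
add_maskformer2_video_config(cfg)                # adds INPUT.SAMPLING_FRAME_*, INPUT.PSEUDO.*, INPUT.LSJ_AUG.*
print(cfg.INPUT.SAMPLING_FRAME_NUM)              # 3
print(cfg.INPUT.PSEUDO.SAMPLING_FRAME_NUM)       # 4
print(cfg.INPUT.LSJ_AUG.ENABLED)                 # False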
""" def __init__(self, angle, prob=0.5, expand=True, center=None, interp=None, clip_frame_cnt=1): """ Args: angle (list[float]): If ``sample_style=="range"``, a [min, max] interval from which to sample the angle (in degrees). If ``sample_style=="choice"``, a list of angles to sample from expand (bool): choose if the image should be resized to fit the whole rotated image (default), or simply cropped center (list[[float, float]]): If ``sample_style=="range"``, a [[minx, miny], [maxx, maxy]] relative interval from which to sample the center, [0, 0] being the top left of the image and [1, 1] the bottom right. If ``sample_style=="choice"``, a list of centers to sample from Default: None, which means that the center of rotation is the center of the image center has no effect if expand=True because it only affects shifting """ super().__init__() if isinstance(angle, (float, int)): angle = (angle, angle) if center is not None and isinstance(center[0], (float, int)): center = (center, center) self.angle_save = None self.center_save = None self._cnt = 0 self._init(locals()) def get_transform(self, image): h, w = image.shape[:2] if self._cnt % self.clip_frame_cnt == 0: center = None angle = np.random.uniform(self.angle[0], self.angle[1], size=self.clip_frame_cnt) if self.center is not None: center = ( np.random.uniform(self.center[0][0], self.center[1][0]), np.random.uniform(self.center[0][1], self.center[1][1]), ) angle = np.sort(angle) if self._rand_range() < self.prob: angle = angle[::-1] self.angle_save = angle self.center_save = center self._cnt = 0 # avoiding overflow angle = self.angle_save[self._cnt] center = self.center_save self._cnt += 1 if center is not None: center = (w * center[0], h * center[1]) # Convert to absolute coordinates if angle % 360 == 0: return NoOpTransform() return T.RotationTransform(h, w, angle, expand=self.expand, center=center, interp=self.interp) class ResizeShortestEdge(T.Augmentation): """ Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. """ def __init__( self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 ): """ Args: short_edge_length (list[int]): If ``sample_style=="range"``, a [min, max] interval from which to sample the shortest edge length. If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. max_size (int): maximum allowed longest edge length. sample_style (str): either "range" or "choice". """ super().__init__() assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style self.is_range = ("range" in sample_style) if isinstance(short_edge_length, int): short_edge_length = (short_edge_length, short_edge_length) if self.is_range: assert len(short_edge_length) == 2, ( "short_edge_length must be two values using 'range' sample style." f" Got {short_edge_length}!" 
) self._cnt = 0 self._init(locals()) def get_transform(self, image): if self._cnt % self.clip_frame_cnt == 0: if self.is_range: self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) else: self.size = np.random.choice(self.short_edge_length) if self.size == 0: return NoOpTransform() self._cnt = 0 # avoiding overflow self._cnt += 1 h, w = image.shape[:2] scale = self.size * 1.0 / min(h, w) if h < w: newh, neww = self.size, scale * w else: newh, neww = scale * h, self.size if max(newh, neww) > self.max_size: scale = self.max_size * 1.0 / max(newh, neww) newh = newh * scale neww = neww * scale neww = int(neww + 0.5) newh = int(newh + 0.5) return T.ResizeTransform(h, w, newh, neww, self.interp) class RandomFlip(T.Augmentation): """ Flip the image horizontally or vertically with the given probability. """ def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): """ Args: prob (float): probability of flip. horizontal (boolean): whether to apply horizontal flipping vertical (boolean): whether to apply vertical flipping """ super().__init__() if horizontal and vertical: raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.") if not horizontal and not vertical: raise ValueError("At least one of horiz or vert has to be True!") self._cnt = 0 self._init(locals()) def get_transform(self, image): if self._cnt % self.clip_frame_cnt == 0: self.do = self._rand_range() < self.prob self._cnt = 0 # avoiding overflow self._cnt += 1 h, w = image.shape[:2] if self.do: if self.horizontal: return HFlipTransform(w) elif self.vertical: return VFlipTransform(h) else: return NoOpTransform() class RandomCropClip(T.Augmentation): """ Randomly crop a rectangle region out of an image. """ def __init__(self, crop_type: str, crop_size, clip_frame_cnt=1): """ Args: crop_type (str): one of "relative_range", "relative", "absolute", "absolute_range". crop_size (tuple[float, float]): two floats, explained below. - "relative": crop a (H * crop_size[0], W * crop_size[1]) region from an input image of size (H, W). crop size should be in (0, 1] - "relative_range": uniformly sample two values from [crop_size[0], 1] and [crop_size[1]], 1], and use them as in "relative" crop type. - "absolute" crop a (crop_size[0], crop_size[1]) region from input image. crop_size must be smaller than the input image size. - "absolute_range", for an input of size (H, W), uniformly sample H_crop in [crop_size[0], min(H, crop_size[1])] and W_crop in [crop_size[0], min(W, crop_size[1])]. Then crop a region (H_crop, W_crop). 
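A minimal sketch (not from the repository, dummy frames assumed) of the clip-consistent behaviour of the RandomFlip variant above: with clip_frame_cnt=3 the flip decision is re-sampled only once every three consecutive calls, so all frames of a clip receive the same transform.

import numpy as np

flip = RandomFlip(prob=0.5, horizontal=True, clip_frame_cnt=3)
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(3)]
tfms = [flip.get_transform(f) for f in frames]
assert len({type(t) for t in tfms}) == 1          # all HFlipTransform, or all NoOpTransform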
""" # TODO style of relative_range and absolute_range are not consistent: # one takes (h, w) but another takes (min, max) super().__init__() assert crop_type in ["relative_range", "relative", "absolute", "absolute_range"] self._init(locals()) self._cnt = 0 def get_transform(self, image): h, w = image.shape[:2] # 667, 500 if self._cnt % self.clip_frame_cnt == 0: croph, cropw = self.get_crop_size((h, w)) assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self) h0 = np.random.randint(h - croph + 1) # rand(124) -> 5 w0 = np.random.randint(w - cropw + 1) # rand(111) -> 634 h1 = np.random.randint(h0, h - croph + 1) w1 = np.random.randint(w0, w - cropw + 1) x = np.sort(np.random.rand(self.clip_frame_cnt)) h = h0 * x + h1 * (1-x) w = w0 * x + w1 * (1-x) h = np.round_(h).astype(np.int) w = np.round_(w).astype(np.int) if self._rand_range() < 0.5: h = h[::-1] w = w[::-1] self.hw_save = (h, w) self.crop_h_save, self.crop_w_save = croph, cropw self._cnt = 0 # avoiding overflow _h, _w = self.hw_save[0][self._cnt], self.hw_save[1][self._cnt] self._cnt += 1 return T.CropTransform(_w, _h, self.crop_w_save, self.crop_h_save) def get_crop_size(self, image_size): """ Args: image_size (tuple): height, width Returns: crop_size (tuple): height, width in absolute pixels """ h, w = image_size if self.crop_type == "relative": ch, cw = self.crop_size return int(h * ch + 0.5), int(w * cw + 0.5) elif self.crop_type == "relative_range": crop_size = np.asarray(self.crop_size, dtype=np.float32) ch, cw = crop_size + np.random.rand(2) * (1 - crop_size) return int(h * ch + 0.5), int(w * cw + 0.5) elif self.crop_type == "absolute": return (min(self.crop_size[0], h), min(self.crop_size[1], w)) elif self.crop_type == "absolute_range": assert self.crop_size[0] <= self.crop_size[1] ch = np.random.randint(min(h, self.crop_size[0]), min(h, self.crop_size[1]) + 1) cw = np.random.randint(min(w, self.crop_size[0]), min(w, self.crop_size[1]) + 1) return ch, cw else: raise NotImplementedError("Unknown crop type {}".format(self.crop_type)) class FixedSizeCropClip(T.Augmentation): """ If `crop_size` is smaller than the input image size, then it uses a random crop of the crop size. If `crop_size` is larger than the input image size, then it pads the right and the bottom of the image to the crop size if `pad` is True, otherwise it returns the smaller image. """ def __init__(self, crop_size: Tuple[int], pad: bool = True, pad_value: float = 128.0, clip_frame_cnt=1): """ Args: crop_size: target image (height, width). pad: if True, will pad images smaller than `crop_size` up to `crop_size` pad_value: the padding value. """ super().__init__() self._init(locals()) self._cnt = 0 def _get_crop(self, image: np.ndarray): # Compute the image scale and scaled size. input_size = image.shape[:2] output_size = self.crop_size # Add random crop if the image is scaled up. max_offset = np.subtract(input_size, output_size) max_offset = np.maximum(max_offset, 0) if self._cnt % self.clip_frame_cnt == 0: offset = np.multiply(max_offset, np.random.uniform(0.0, 1.0)) offset = np.round(offset).astype(int) self.offset_save = offset self._cnt = 0 # avoiding overflow self._cnt += 1 offset = self.offset_save return CropTransform( offset[1], offset[0], output_size[1], output_size[0], input_size[1], input_size[0] ) def _get_pad(self, image: np.ndarray): # Compute the image scale and scaled size. input_size = image.shape[:2] output_size = self.crop_size # Add padding if the image is scaled down. 
pad_size = np.subtract(output_size, input_size) pad_size = np.maximum(pad_size, 0) original_size = np.minimum(input_size, output_size) return PadTransform( 0, 0, pad_size[1], pad_size[0], original_size[1], original_size[0], self.pad_value ) def get_transform(self, image: np.ndarray): transforms = [self._get_crop(image)] if self.pad: transforms.append(self._get_pad(image)) return TransformList(transforms) class ResizeShortestEdgeClip(T.Augmentation): """ Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. """ def __init__( self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 ): """ Args: short_edge_length (list[int]): If ``sample_style=="range"``, a [min, max] interval from which to sample the shortest edge length. If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. max_size (int): maximum allowed longest edge length. sample_style (str): either "range" or "choice". """ super().__init__() assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style self.is_range = ("range" in sample_style) if isinstance(short_edge_length, int): short_edge_length = (short_edge_length, short_edge_length) if self.is_range: assert len(short_edge_length) == 2, ( "short_edge_length must be two values using 'range' sample style." f" Got {short_edge_length}!" ) self._cnt = 0 self._init(locals()) def get_transform(self, image): if self._cnt % self.clip_frame_cnt == 0: if self.is_range: self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) else: self.size = np.random.choice(self.short_edge_length) self._cnt = 0 # avoiding overflow if self.size == 0: return NoOpTransform() self._cnt += 1 h, w = image.shape[:2] scale = self.size * 1.0 / min(h, w) if h < w: newh, neww = self.size, scale * w else: newh, neww = scale * h, self.size if max(newh, neww) > self.max_size: scale = self.max_size * 1.0 / max(newh, neww) newh = newh * scale neww = neww * scale neww = int(neww + 0.5) newh = int(newh + 0.5) return T.ResizeTransform(h, w, newh, neww, self.interp) class RandomFlipClip(T.Augmentation): """ Flip the image horizontally or vertically with the given probability. """ def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): """ Args: prob (float): probability of flip. horizontal (boolean): whether to apply horizontal flipping vertical (boolean): whether to apply vertical flipping """ super().__init__() if horizontal and vertical: raise ValueError("Cannot do both horiz and vert. 
Please use two Flip instead.") if not horizontal and not vertical: raise ValueError("At least one of horiz or vert has to be True!") self._cnt = 0 self._init(locals()) def get_transform(self, image): if self._cnt % self.clip_frame_cnt == 0: self.do = self._rand_range() < self.prob self._cnt = 0 # avoiding overflow self._cnt += 1 h, w = image.shape[:2] if self.do: if self.horizontal: return HFlipTransform(w) elif self.vertical: return VFlipTransform(h) else: return NoOpTransform() def build_augmentation(cfg, is_train): logger = logging.getLogger(__name__) aug_list = [] if is_train: # Crop if cfg.INPUT.CROP.ENABLED: aug_list.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) # Resize min_size = cfg.INPUT.MIN_SIZE_TRAIN max_size = cfg.INPUT.MAX_SIZE_TRAIN sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING ms_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1 aug_list.append(ResizeShortestEdge(min_size, max_size, sample_style, clip_frame_cnt=ms_clip_frame_cnt)) # Flip if cfg.INPUT.RANDOM_FLIP != "none": if cfg.INPUT.RANDOM_FLIP == "flip_by_clip": flip_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM else: flip_clip_frame_cnt = 1 aug_list.append( # NOTE using RandomFlip modified for the support of flip maintenance RandomFlip( horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), vertical=cfg.INPUT.RANDOM_FLIP == "vertical", clip_frame_cnt=flip_clip_frame_cnt, ) ) # Additional augmentations : brightness, contrast, saturation, rotation augmentations = cfg.INPUT.AUGMENTATIONS if "brightness" in augmentations: aug_list.append(T.RandomBrightness(0.9, 1.1)) if "contrast" in augmentations: aug_list.append(T.RandomContrast(0.9, 1.1)) if "saturation" in augmentations: aug_list.append(T.RandomSaturation(0.9, 1.1)) if "rotation" in augmentations: # print('not come here' * 10) aug_list.append( T.RandomRotation( [-10, 10], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range" ) ) else: # Resize min_size = cfg.INPUT.MIN_SIZE_TEST max_size = cfg.INPUT.MAX_SIZE_TEST sample_style = "choice" aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) return aug_list def build_pseudo_augmentation(cfg, is_train): logger = logging.getLogger(__name__) aug_list = [] if is_train: use_lsj = cfg.INPUT.LSJ_AUG.ENABLED if use_lsj: image_size = cfg.INPUT.LSJ_AUG.IMAGE_SIZE min_scale = cfg.INPUT.LSJ_AUG.MIN_SCALE max_scale = cfg.INPUT.LSJ_AUG.MAX_SCALE if cfg.INPUT.RANDOM_FLIP != "none": if cfg.INPUT.RANDOM_FLIP == "flip_by_clip": clip_frame_cnt = cfg.INPUT.PSEUDO.SAMPLING_FRAME_NUM else: clip_frame_cnt = 1 aug_list.append( # NOTE using RandomFlip modified for the support of flip maintenance RandomFlipClip( horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), vertical=cfg.INPUT.RANDOM_FLIP == "vertical", clip_frame_cnt=clip_frame_cnt, ) ) # Additional augmentations : brightness, contrast, saturation, rotation augmentations = cfg.INPUT.PSEUDO.AUGMENTATIONS if "brightness" in augmentations: aug_list.append(T.RandomBrightness(0.9, 1.1)) if "contrast" in augmentations: aug_list.append(T.RandomContrast(0.9, 1.1)) if "saturation" in augmentations: aug_list.append(T.RandomSaturation(0.9, 1.1)) if "rotation" in augmentations: aug_list.append( RandomRotationClip( [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], clip_frame_cnt=clip_frame_cnt, ) ) aug_list.extend([ ResizeScaleClip( min_scale=min_scale, max_scale=max_scale, target_height=image_size, 
target_width=image_size, clip_frame_cnt=clip_frame_cnt, ), FixedSizeCropClip(crop_size=(image_size, image_size), clip_frame_cnt=clip_frame_cnt), ]) else: min_size = cfg.INPUT.PSEUDO.MIN_SIZE_TRAIN max_size = cfg.INPUT.PSEUDO.MAX_SIZE_TRAIN sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING clip_frame_cnt = cfg.INPUT.PSEUDO.SAMPLING_FRAME_NUM # Crop if cfg.INPUT.PSEUDO.CROP.ENABLED: crop_aug = RandomApplyClip( T.AugmentationList([ ResizeShortestEdgeClip([400, 500, 600], 1333, sample_style, clip_frame_cnt=clip_frame_cnt), RandomCropClip(cfg.INPUT.PSEUDO.CROP.TYPE, cfg.INPUT.PSEUDO.CROP.SIZE, clip_frame_cnt=clip_frame_cnt) ]), clip_frame_cnt=clip_frame_cnt ) aug_list.append(crop_aug) # Resize aug_list.append(ResizeShortestEdgeClip(min_size, max_size, sample_style, clip_frame_cnt=clip_frame_cnt)) # Flip aug_list.append( # NOTE using RandomFlip modified for the support of flip maintenance RandomFlipClip( horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), vertical=cfg.INPUT.RANDOM_FLIP == "vertical", clip_frame_cnt=clip_frame_cnt, ) ) # Additional augmentations : brightness, contrast, saturation, rotation augmentations = cfg.INPUT.PSEUDO.AUGMENTATIONS if "brightness" in augmentations: aug_list.append(T.RandomBrightness(0.9, 1.1)) if "contrast" in augmentations: aug_list.append(T.RandomContrast(0.9, 1.1)) if "saturation" in augmentations: aug_list.append(T.RandomSaturation(0.9, 1.1)) if "rotation" in augmentations: aug_list.append( RandomRotationClip( [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], clip_frame_cnt=clip_frame_cnt, ) ) else: # Resize min_size = cfg.INPUT.MIN_SIZE_TEST max_size = cfg.INPUT.MAX_SIZE_TEST sample_style = "choice" aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) return aug_list ================================================ FILE: mask2former_video/data_video/build.py ================================================ # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import itertools import logging import torch.utils.data from typing import Collection, Sequence from detectron2.config import CfgNode, configurable from detectron2.data.build import ( build_batch_data_loader, load_proposals_into_dataset, trivial_batch_collator, ) from detectron2.data.catalog import DatasetCatalog from detectron2.data.common import DatasetFromList, MapDataset from detectron2.data.dataset_mapper import DatasetMapper from detectron2.data.samplers import InferenceSampler, TrainingSampler from detectron2.utils.comm import get_world_size from .combined_loader import CombinedDataLoader, Loader def _compute_num_images_per_worker(cfg: CfgNode): num_workers = get_world_size() images_per_batch = cfg.SOLVER.IMS_PER_BATCH assert ( images_per_batch % num_workers == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format( images_per_batch, num_workers ) assert ( images_per_batch >= num_workers ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format( images_per_batch, num_workers ) images_per_worker = images_per_batch // num_workers return images_per_worker def filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names): """ Filter out images with none annotations or only crowd annotations (i.e., images without non-crowd annotations). A common training-time preprocessing on COCO dataset. Args: dataset_dicts (list[dict]): annotations in Detectron2 Dataset format. Returns: list[dict]: the same format, but filtered. 
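A minimal sketch (not from the repository) combining the video config with build_augmentation from augmentation.py above; with default detectron2 settings (no cropping, RANDOM_FLIP="horizontal", no extra photometric augmentations) the resulting training list typically contains the clip-aware ResizeShortestEdge and RandomFlip defined earlier.

from detectron2.config import get_cfg

cfg = get_cfg()
add_maskformer2_video_config(cfg)                 # provides INPUT.AUGMENTATIONS etc. read below
train_augs = build_augmentation(cfg, is_train=True)
print([type(a).__name__ for a in train_augs])     # e.g. ['ResizeShortestEdge', 'RandomFlip']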
""" num_before = len(dataset_dicts) def valid(anns): for ann in anns: if isinstance(ann, list): for instance in ann: if instance.get("iscrowd", 0) == 0: return True else: if ann.get("iscrowd", 0) == 0: return True return False dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])] num_after = len(dataset_dicts) logger = logging.getLogger(__name__) logger.info( "Removed {} images with no usable annotations. {} images left.".format( num_before - num_after, num_after ) ) return dataset_dicts def get_detection_dataset_dicts( dataset_names, filter_empty=True, proposal_files=None ): """ Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation. Args: dataset_names (str or list[str]): a dataset name or a list of dataset names filter_empty (bool): whether to filter out images without instance annotations proposal_files (list[str]): if given, a list of object proposal files that match each dataset in `dataset_names`. Returns: list[dict]: a list of dicts following the standard dataset dict format. """ if isinstance(dataset_names, str): dataset_names = [dataset_names] assert len(dataset_names) dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names] for dataset_name, dicts in zip(dataset_names, dataset_dicts): assert len(dicts), "Dataset '{}' is empty!".format(dataset_name) if proposal_files is not None: assert len(dataset_names) == len(proposal_files) # load precomputed proposals from proposal files dataset_dicts = [ load_proposals_into_dataset(dataset_i_dicts, proposal_file) for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files) ] dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) has_instances = "annotations" in dataset_dicts[0] if filter_empty and has_instances: dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names) assert len(dataset_dicts), "No valid data found in {}.".format(",".join(dataset_names)) return dataset_dicts def _train_loader_from_config(cfg, mapper, dataset_name=None, *, dataset=None, sampler=None): if dataset is None: dataset = get_detection_dataset_dicts( dataset_name, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) if mapper is None: mapper = DatasetMapper(cfg, True) if sampler is None: sampler_name = cfg.DATALOADER.SAMPLER_TRAIN logger = logging.getLogger(__name__) logger.info("Using training sampler {}".format(sampler_name)) sampler = TrainingSampler(len(dataset)) return { "dataset": dataset, "sampler": sampler, "mapper": mapper, "total_batch_size": cfg.SOLVER.IMS_PER_BATCH, "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING, "num_workers": cfg.DATALOADER.NUM_WORKERS, } # TODO can allow dataset as an iterable or IterableDataset to make this function more general @configurable(from_config=_train_loader_from_config) def build_detection_train_loader( dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0 ): """ Build a dataloader for object detection with some default features. This interface is experimental. Args: dataset (list or torch.utils.data.Dataset): a list of dataset dicts, or a map-style pytorch dataset. They can be obtained by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. mapper (callable): a callable which takes a sample (dict) from dataset and returns the format to be consumed by the model. 
When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``. sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces indices to be applied on ``dataset``. Default to :class:`TrainingSampler`, which coordinates a random shuffle sequence across all workers. total_batch_size (int): total batch size across all workers. Batching simply puts data into a list. aspect_ratio_grouping (bool): whether to group images with similar aspect ratio for efficiency. When enabled, it requires each element in dataset be a dict with keys "width" and "height". num_workers (int): number of parallel data loading workers Returns: torch.utils.data.DataLoader: a dataloader. Each output from it is a ``list[mapped_element]`` of length ``total_batch_size / num_workers``, where ``mapped_element`` is produced by the ``mapper``. """ if isinstance(dataset, list): dataset = DatasetFromList(dataset, copy=False) if mapper is not None: dataset = MapDataset(dataset, mapper) if sampler is None: sampler = TrainingSampler(len(dataset)) assert isinstance(sampler, torch.utils.data.sampler.Sampler) return build_batch_data_loader( dataset, sampler, total_batch_size, aspect_ratio_grouping=aspect_ratio_grouping, num_workers=num_workers, ) def build_combined_loader(cfg: CfgNode, loaders: Collection[Loader], ratios: Sequence[float]): images_per_worker = _compute_num_images_per_worker(cfg) return CombinedDataLoader(loaders, images_per_worker, ratios) def _test_loader_from_config(cfg, dataset_name, mapper=None): """ Uses the given `dataset_name` argument (instead of the names in cfg), because the standard practice is to evaluate each test set individually (not combining them). """ dataset = get_detection_dataset_dicts( [dataset_name], filter_empty=False, proposal_files=[ cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)] ] if cfg.MODEL.LOAD_PROPOSALS else None, ) if mapper is None: mapper = DatasetMapper(cfg, False) return {"dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS} @configurable(from_config=_test_loader_from_config) def build_detection_test_loader(dataset, *, mapper, num_workers=0): """ Similar to `build_detection_train_loader`, but uses a batch size of 1. This interface is experimental. Args: dataset (list or torch.utils.data.Dataset): a list of dataset dicts, or a map-style pytorch dataset. They can be obtained by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. mapper (callable): a callable which takes a sample (dict) from dataset and returns the format to be consumed by the model. When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. num_workers (int): number of parallel data loading workers Returns: DataLoader: a torch DataLoader, that loads the given detection dataset, with test-time transformation and batching. Examples: :: data_loader = build_detection_test_loader( DatasetRegistry.get("my_test"), mapper=DatasetMapper(...)) # or, instantiate with a CfgNode: data_loader = build_detection_test_loader(cfg, "my_test") """ if isinstance(dataset, list): dataset = DatasetFromList(dataset, copy=False) if mapper is not None: dataset = MapDataset(dataset, mapper) sampler = InferenceSampler(len(dataset)) # Always use 1 image per worker during inference since this is the # standard when reporting inference time in papers. 
batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False) data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=trivial_batch_collator, ) return data_loader ================================================ FILE: mask2former_video/data_video/combined_loader.py ================================================ import random from collections import deque from typing import Any, Collection, Deque, Iterable, Iterator, List, Sequence Loader = Iterable[Any] def _pooled_next(iterator: Iterator[Any], pool: Deque[Any]): if not pool: pool.extend(next(iterator)) return pool.popleft() class CombinedDataLoader: """ Combines data loaders using the provided sampling ratios """ BATCH_COUNT = 100 def __init__(self, loaders: Collection[Loader], batch_size: int, ratios: Sequence[float]): self.loaders = loaders self.batch_size = batch_size self.ratios = ratios def __iter__(self) -> Iterator[List[Any]]: iters = [iter(loader) for loader in self.loaders] indices = [] pool = [deque()] * len(iters) # infinite iterator, as in D2 while True: if not indices: # just a buffer of indices, its size doesn't matter # as long as it's a multiple of batch_size k = self.batch_size * self.BATCH_COUNT indices = random.choices(range(len(self.loaders)), self.ratios, k=k) try: batch = [_pooled_next(iters[i], pool[i]) for i in indices[: self.batch_size]] except StopIteration: break indices = indices[self.batch_size :] yield batch ================================================ FILE: mask2former_video/data_video/dataset_mapper.py ================================================ # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import copy import logging import random import numpy as np from typing import List, Union import torch from detectron2.config import configurable from detectron2.structures import ( BitMasks, Boxes, BoxMode, Instances, ) from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from detectron2.data import MetadataCatalog from .augmentation import build_augmentation, build_pseudo_augmentation #build_coco_augmentation from .datasets.ytvis import COCO_TO_YTVIS_2019, COCO_TO_YTVIS_2021 import os from pycocotools import mask as coco_mask __all__ = ["YTVISDatasetMapper", "CocoClipDatasetMapper"] def seed_everything(seed): random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) np.random.seed(seed) torch.manual_seed(seed) def filter_empty_instances(instances, by_box=True, by_mask=True, box_threshold=1e-5): """ Filter out empty instances in an `Instances` object. Args: instances (Instances): by_box (bool): whether to filter out instances with empty boxes by_mask (bool): whether to filter out instances with empty masks box_threshold (float): minimum width and height to be considered non-empty Returns: Instances: the filtered instances. """ assert by_box or by_mask r = [] if by_box: r.append(instances.gt_boxes.nonempty(threshold=box_threshold)) if instances.has("gt_masks") and by_mask: r.append(instances.gt_masks.nonempty()) if not r: return instances m = r[0] for x in r[1:]: m = m & x instances.gt_ids[~m] = -1 return instances def _get_dummy_anno(): return { "iscrowd": 0, "category_id": -1, "id": -1, "bbox": np.array([0, 0, 0, 0]), "bbox_mode": BoxMode.XYXY_ABS, "segmentation": [np.array([0.0] * 6)] } def ytvis_annotations_to_instances(annos, image_size): """ Create an :class:`Instances` object used by the models, from instance annotations in the dataset dict. 
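A minimal sketch (not from the repository) of CombinedDataLoader above, using hand-made infinite loaders; each wrapped loader is expected to yield lists (batches) of samples, since _pooled_next refills its pool with next(iterator), and the combined loader emits lists of batch_size samples drawn according to the given ratios.

def fake_loader(tag):
    while True:
        yield [f"{tag}_{i}" for i in range(2)]    # every next() returns a small batch of items

combined = CombinedDataLoader([fake_loader("video"), fake_loader("image")],
                              batch_size=4, ratios=[0.7, 0.3])
first_batch = next(iter(combined))
assert len(first_batch) == 4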
Args: annos (list[dict]): a list of instance annotations in one image, each element for one instance. image_size (tuple): height, width Returns: Instances: It will contain fields "gt_boxes", "gt_classes", "gt_ids", "gt_masks", if they can be obtained from `annos`. This is the format that builtin models expect. """ boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos] target = Instances(image_size) target.gt_boxes = Boxes(boxes) classes = [int(obj["category_id"]) for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes ids = [int(obj["id"]) for obj in annos] ids = torch.tensor(ids, dtype=torch.int64) target.gt_ids = ids if len(annos) and "segmentation" in annos[0]: segms = [obj["segmentation"] for obj in annos] masks = [] for segm in segms: assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( segm.ndim ) # mask array masks.append(segm) # torch.from_numpy does not support array with negative stride. masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks]) ) target.gt_masks = masks return target def convert_coco_poly_to_mask(segmentations, height, width): masks = [] for polygons in segmentations: rles = coco_mask.frPyObjects(polygons, height, width) mask = coco_mask.decode(rles) if len(mask.shape) < 3: mask = mask[..., None] mask = torch.as_tensor(mask, dtype=torch.uint8) mask = mask.any(dim=2) masks.append(mask) if masks: masks = torch.stack(masks, dim=0) else: masks = torch.zeros((0, height, width), dtype=torch.uint8) return masks class YTVISDatasetMapper: """ A callable which takes a dataset dict in YouTube-VIS Dataset format, and map it into a format used by the model. """ @configurable def __init__( self, is_train: bool, is_tgt: bool, *, augmentations: List[Union[T.Augmentation, T.Transform]], image_format: str, use_instance_mask: bool = False, sampling_frame_num: int = 2, sampling_frame_range: int = 5, sampling_frame_shuffle: bool = False, num_classes: int = 40, src_dataset_name: str = "", tgt_dataset_name: str = "", ): """ NOTE: this interface is experimental. Args: is_train: whether it's used in training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. 
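A minimal sketch (not from the repository, made-up coordinates) of convert_coco_poly_to_mask above: a single instance described by one square polygon is rasterized into a (1, H, W) binary mask via pycocotools.

square = [[10.0, 10.0, 50.0, 10.0, 50.0, 50.0, 10.0, 50.0]]    # one polygon, as flat xy pairs
masks = convert_coco_poly_to_mask([square], height=64, width=64)
assert masks.shape == (1, 64, 64)
assert masks[0, 20, 20] == 1 and masks[0, 5, 5] == 0           # inside vs. outside the square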
use_instance_mask: whether to process instance segmentation annotations, if available """ # fmt: off self.is_train = is_train self.is_tgt = is_tgt self.augmentations = T.AugmentationList(augmentations) self.image_format = image_format self.use_instance_mask = use_instance_mask self.sampling_frame_num = sampling_frame_num self.sampling_frame_range = sampling_frame_range self.sampling_frame_shuffle = sampling_frame_shuffle self.num_classes = num_classes if not is_tgt: self.src_metadata = MetadataCatalog.get(src_dataset_name) self.tgt_metadata = MetadataCatalog.get(tgt_dataset_name) print('tgt_dataset_name:', tgt_dataset_name) if tgt_dataset_name.startswith("ytvis_2019"): src2tgt = OVIS_TO_YTVIS_2019 elif tgt_dataset_name.startswith("ytvis_2021"): src2tgt = OVIS_TO_YTVIS_2021 elif tgt_dataset_name.startswith("ovis"): if src_dataset_name.startswith("ytvis_2019"): src2tgt = YTVIS_2019_TO_OVIS elif src_dataset_name.startswith("ytvis_2021"): src2tgt = YTVIS_2021_TO_OVIS else: raise NotImplementedError else: raise NotImplementedError self.src2tgt = {} for k, v in src2tgt.items(): self.src2tgt[ self.src_metadata.thing_dataset_id_to_contiguous_id[k] ] = self.tgt_metadata.thing_dataset_id_to_contiguous_id[v] # fmt: on logger = logging.getLogger(__name__) mode = "training" if is_train else "inference" logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}") @classmethod def from_config(cls, cfg, is_train: bool = True, is_tgt: bool = True): augs = build_augmentation(cfg, is_train) sampling_frame_num = cfg.INPUT.SAMPLING_FRAME_NUM sampling_frame_range = cfg.INPUT.SAMPLING_FRAME_RANGE sampling_frame_shuffle = cfg.INPUT.SAMPLING_FRAME_SHUFFLE ret = { "is_train": is_train, "is_tgt": is_tgt, "augmentations": augs, "image_format": cfg.INPUT.FORMAT, "use_instance_mask": cfg.MODEL.MASK_ON, "sampling_frame_num": sampling_frame_num, "sampling_frame_range": sampling_frame_range, "sampling_frame_shuffle": sampling_frame_shuffle, "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, "tgt_dataset_name": cfg.DATASETS.TRAIN[-1], } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one video, in YTVIS Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ # TODO consider examining below deepcopy as it costs huge amount of computations. 
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below video_length = dataset_dict["length"] if self.is_train: ref_frame = random.randrange(video_length) start_idx = max(0, ref_frame-self.sampling_frame_range) end_idx = min(video_length, ref_frame+self.sampling_frame_range + 1) selected_idx = np.random.choice( np.array(list(range(start_idx, ref_frame)) + list(range(ref_frame+1, end_idx))), self.sampling_frame_num - 1, ) selected_idx = selected_idx.tolist() + [ref_frame] selected_idx = sorted(selected_idx) if self.sampling_frame_shuffle: random.shuffle(selected_idx) else: selected_idx = range(video_length) video_annos = dataset_dict.pop("annotations", None) file_names = dataset_dict.pop("file_names", None) if self.is_train: _ids = set() for frame_idx in selected_idx: _ids.update([anno["id"] for anno in video_annos[frame_idx]]) ids = dict() for i, _id in enumerate(_ids): ids[_id] = i dataset_dict["video_len"] = len(video_annos) dataset_dict["frame_idx"] = list(selected_idx) dataset_dict["image"] = [] dataset_dict["instances"] = [] dataset_dict["file_names"] = [] for frame_idx in selected_idx: dataset_dict["file_names"].append(file_names[frame_idx]) # Read image image = utils.read_image(file_names[frame_idx], format=self.image_format) utils.check_image_size(dataset_dict, image) aug_input = T.AugInput(image) transforms = self.augmentations(aug_input) image = aug_input.image image_shape = image.shape[:2] # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))) if (video_annos is None) or (not self.is_train): continue # NOTE copy() is to prevent annotations getting changed from applying augmentations _frame_annos = [] for anno in video_annos[frame_idx]: _anno = {} for k, v in anno.items(): _anno[k] = copy.deepcopy(v) _frame_annos.append(_anno) # USER: Implement additional transformations if you have other types of data annos = [ utils.transform_instance_annotations(obj, transforms, image_shape) for obj in _frame_annos if obj.get("iscrowd", 0) == 0 ] sorted_annos = [_get_dummy_anno() for _ in range(len(ids))] for _anno in annos: idx = ids[_anno["id"]] sorted_annos[idx] = _anno _gt_ids = [_anno["id"] for _anno in sorted_annos] instances = utils.annotations_to_instances(sorted_annos, image_shape, mask_format="bitmask") if not self.is_tgt: instances.gt_classes = torch.tensor( [self.src2tgt[c] if c in self.src2tgt else -1 for c in instances.gt_classes.tolist()] ) instances.gt_ids = torch.tensor(_gt_ids) instances = filter_empty_instances(instances) # if instances.has("gt_masks"): # instances.gt_boxes = instances.gt_masks.get_bounding_boxes() # instances = filter_empty_instances(instances) if not instances.has("gt_masks"): instances.gt_masks = BitMasks(torch.empty((0, *image_shape))) dataset_dict["instances"].append(instances) return dataset_dict class CocoClipDatasetMapper: """ A callable which takes a COCO image which converts into multiple frames, and map it into a format used by the model. """ @configurable def __init__( self, is_train: bool, is_tgt: bool, *, augmentations: List[Union[T.Augmentation, T.Transform]], image_format: str, sampling_frame_num: int = 2, sampling_frame_range: int = 5, src_dataset_name: str = "", tgt_dataset_name: str = "", ): """ NOTE: this interface is experimental. 
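A minimal sketch (not from the repository, made-up numbers) of the training-time frame sampling used in YTVISDatasetMapper.__call__ above: a reference frame is drawn first, then the remaining frames are sampled (with replacement) from a window of +/- sampling_frame_range around it.

import random
import numpy as np

video_length, sampling_frame_num, sampling_frame_range = 30, 3, 5
ref_frame = random.randrange(video_length)
start_idx = max(0, ref_frame - sampling_frame_range)
end_idx = min(video_length, ref_frame + sampling_frame_range + 1)
candidates = list(range(start_idx, ref_frame)) + list(range(ref_frame + 1, end_idx))
others = np.random.choice(np.array(candidates), sampling_frame_num - 1).tolist()
selected_idx = sorted(others + [ref_frame])
assert len(selected_idx) == sampling_frame_num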
Args: is_train: whether it's used in training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. """ # fmt: off self.is_train = is_train self.is_tgt = is_tgt self.augmentations = T.AugmentationList(augmentations) self.image_format = image_format self.sampling_frame_num = sampling_frame_num self.sampling_frame_range = sampling_frame_range if not is_tgt: self.src_metadata = MetadataCatalog.get(src_dataset_name) self.tgt_metadata = MetadataCatalog.get(tgt_dataset_name) if tgt_dataset_name.startswith("ytvis_2019"): src2tgt = COCO_TO_YTVIS_2019 elif tgt_dataset_name.startswith("ytvis_2021"): src2tgt = COCO_TO_YTVIS_2021 elif tgt_dataset_name.startswith("ovis"): src2tgt = COCO_TO_OVIS else: raise NotImplementedError self.src2tgt = {} for k, v in src2tgt.items(): self.src2tgt[ self.src_metadata.thing_dataset_id_to_contiguous_id[k] ] = self.tgt_metadata.thing_dataset_id_to_contiguous_id[v] # fmt: on logger = logging.getLogger(__name__) mode = "training" if is_train else "inference" logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}") @classmethod def from_config(cls, cfg, is_train: bool = True, is_tgt: bool = True): if is_tgt: augs = build_augmentation(cfg, is_train) else: # print('come here') augs = build_pseudo_augmentation(cfg, is_train) sampling_frame_num = cfg.INPUT.PSEUDO.SAMPLING_FRAME_NUM sampling_frame_range = cfg.INPUT.PSEUDO.SAMPLING_FRAME_RANGE ret = { "is_train": is_train, "is_tgt": is_tgt, "augmentations": augs, "image_format": cfg.INPUT.FORMAT, "sampling_frame_num": sampling_frame_num, "sampling_frame_range": sampling_frame_range, "tgt_dataset_name": cfg.DATASETS.TRAIN[-1], } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below img_annos = dataset_dict.pop("annotations", None) file_name = dataset_dict.pop("file_name", None) original_image = utils.read_image(file_name, format=self.image_format) if self.is_train: video_length = random.randrange(16, 49) ref_frame = random.randrange(video_length) start_idx = max(0, ref_frame-self.sampling_frame_range) end_idx = min(video_length, ref_frame+self.sampling_frame_range + 1) selected_idx = np.random.choice( np.array(list(range(start_idx, ref_frame)) + list(range(ref_frame+1, end_idx))), self.sampling_frame_num - 1, ) selected_idx = selected_idx.tolist() + [ref_frame] selected_idx = sorted(selected_idx) else: video_length = self.sampling_frame_num selected_idx = list(range(self.sampling_frame_num)) dataset_dict["video_len"] = video_length dataset_dict["frame_idx"] = selected_idx dataset_dict["image"] = [] dataset_dict["instances"] = [] dataset_dict["file_names"] = [file_name] * self.sampling_frame_num for _ in range(self.sampling_frame_num): utils.check_image_size(dataset_dict, original_image) aug_input = T.AugInput(original_image) transforms = self.augmentations(aug_input) image = aug_input.image image_shape = image.shape[:2] # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. 
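# --- Editor's aside (illustrative sketch, not part of the original file) ---
# The conversion on the next line turns an H x W x C uint8 frame into a contiguous
# C x H x W tensor, which the dataloader can pass between worker processes via shared memory.
# A tiny self-contained example with a hypothetical 480 x 640 RGB frame:
import numpy as _np
import torch as _torch
_toy_frame = _np.zeros((480, 640, 3), dtype=_np.uint8)  # assumed H, W, C
_toy_chw = _torch.as_tensor(_np.ascontiguousarray(_toy_frame.transpose(2, 0, 1)))
assert _toy_chw.shape == (3, 480, 640) and _toy_chw.dtype == _torch.uint8
# --- end aside ---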
dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))) if (img_annos is None) or (not self.is_train): continue _img_annos = [] for anno in img_annos: _anno = {} for k, v in anno.items(): _anno[k] = copy.deepcopy(v) _img_annos.append(_anno) # USER: Implement additional transformations if you have other types of data annos = [ utils.transform_instance_annotations(obj, transforms, image_shape) for obj in _img_annos if obj.get("iscrowd", 0) == 0 ] _gt_ids = list(range(len(annos))) for idx in range(len(annos)): if len(annos[idx]["segmentation"]) == 0: annos[idx]["segmentation"] = [np.array([0.0] * 6)] instances = utils.annotations_to_instances(annos, image_shape) if not self.is_tgt: instances.gt_classes = torch.tensor( [self.src2tgt[c] if c in self.src2tgt else -1 for c in instances.gt_classes.tolist()] ) instances.gt_ids = torch.tensor(_gt_ids) # instances.gt_boxes = instances.gt_masks.get_bounding_boxes() # NOTE we don't need boxes instances = filter_empty_instances(instances) h, w = instances.image_size if hasattr(instances, 'gt_masks'): gt_masks = instances.gt_masks gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) instances.gt_masks = gt_masks else: instances.gt_masks = torch.zeros((0, h, w), dtype=torch.uint8) dataset_dict["instances"].append(instances) return dataset_dict ================================================ FILE: mask2former_video/data_video/datasets/__init__.py ================================================ # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC from . import builtin # ensure the builtin datasets are registered __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] ================================================ FILE: mask2former_video/data_video/datasets/builtin.py ================================================ # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import os from .ytvis import ( register_ytvis_instances, _get_ytvis_2019_instances_meta, _get_ytvis_2021_instances_meta, ) from detectron2.data.datasets.coco import register_coco_instances from detectron2.data.datasets.builtin_meta import _get_builtin_metadata _PREDEFINED_SPLITS_COCO = {} _PREDEFINED_SPLITS_COCO["coco"] = { "coco_2017_train_fake": ("coco/train2017", "coco/annotations/coco2ytvis2019_train.json"), } # ==== Predefined splits for YTVIS 2019 =========== _PREDEFINED_SPLITS_YTVIS_2019 = { "ytvis_2019_train": ("ytvis_2019/train/JPEGImages", "ytvis_2019/train.json"), "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages", "ytvis_2019/valid.json"), "ytvis_2019_test": ("ytvis_2019/test/JPEGImages", "ytvis_2019/test.json"), } # ==== Predefined splits for YTVIS 2021 =========== _PREDEFINED_SPLITS_YTVIS_2021 = { "ytvis_2021_train": ("ytvis_2021/train/JPEGImages", "ytvis_2021/train.json"), "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages", "ytvis_2021/valid.json"), "ytvis_2021_test": ("ytvis_2021/test/JPEGImages", "ytvis_2021/test.json"), } def register_all_ytvis_2019(root): for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): # Assume pre-defined datasets live in `./datasets`. register_ytvis_instances( key, _get_ytvis_2019_instances_meta(), os.path.join(root, json_file) if "://" not in json_file else json_file, os.path.join(root, image_root), ) def register_all_ytvis_2021(root): for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): # Assume pre-defined datasets live in `./datasets`. 
register_ytvis_instances( key, _get_ytvis_2021_instances_meta(), os.path.join(root, json_file) if "://" not in json_file else json_file, os.path.join(root, image_root), ) def register_all_coco(root): for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO.items(): for key, (image_root, json_file) in splits_per_dataset.items(): # Assume pre-defined datasets live in `./datasets`. register_coco_instances( key, _get_builtin_metadata(dataset_name), os.path.join(root, json_file) if "://" not in json_file else json_file, os.path.join(root, image_root), ) if __name__.endswith(".builtin"): # Assume pre-defined datasets live in `./datasets`. _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_ytvis_2019(_root) register_all_ytvis_2021(_root) register_all_coco(_root) ================================================ FILE: mask2former_video/data_video/datasets/ytvis.py ================================================ # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import contextlib import io import json import logging import numpy as np import os import pycocotools.mask as mask_util from fvcore.common.file_io import PathManager from fvcore.common.timer import Timer from detectron2.structures import Boxes, BoxMode, PolygonMasks from detectron2.data import DatasetCatalog, MetadataCatalog """ This file contains functions to parse YTVIS dataset of COCO-format annotations into dicts in "Detectron2 format". """ logger = logging.getLogger(__name__) __all__ = ["load_ytvis_json", "register_ytvis_instances"] COCO_TO_YTVIS_2019 = { 1:1, 2:21, 3:6, 4:21, 5:28, 7:17, 8:29, 9:34, 17:14, 18:8, 19:18, 21:15, 22:32, 23:20, 24:30, 25:22, 35:33, 36:33, 41:5, 42:27, 43:40 } COCO_TO_YTVIS_2021 = { 1:26, 2:23, 3:5, 4:23, 5:1, 7:36, 8:37, 9:4, 16:3, 17:6, 18:9, 19:19, 21:7, 22:12, 23:2, 24:40, 25:18, 34:14, 35:31, 36:31, 41:29, 42:33, 43:34 } YTVIS_CATEGORIES_2019 = [ {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, {"color": [0, 82, 0], "isthing": 1, "id": 2, "name": "giant_panda"}, {"color": [119, 11, 32], "isthing": 1, "id": 3, "name": "lizard"}, {"color": [165, 42, 42], "isthing": 1, "id": 4, "name": "parrot"}, {"color": [134, 134, 103], "isthing": 1, "id": 5, "name": "skateboard"}, {"color": [0, 0, 142], "isthing": 1, "id": 6, "name": "sedan"}, {"color": [255, 109, 65], "isthing": 1, "id": 7, "name": "ape"}, {"color": [0, 226, 252], "isthing": 1, "id": 8, "name": "dog"}, {"color": [5, 121, 0], "isthing": 1, "id": 9, "name": "snake"}, {"color": [0, 60, 100], "isthing": 1, "id": 10, "name": "monkey"}, {"color": [250, 170, 30], "isthing": 1, "id": 11, "name": "hand"}, {"color": [100, 170, 30], "isthing": 1, "id": 12, "name": "rabbit"}, {"color": [179, 0, 194], "isthing": 1, "id": 13, "name": "duck"}, {"color": [255, 77, 255], "isthing": 1, "id": 14, "name": "cat"}, {"color": [120, 166, 157], "isthing": 1, "id": 15, "name": "cow"}, {"color": [73, 77, 174], "isthing": 1, "id": 16, "name": "fish"}, {"color": [0, 80, 100], "isthing": 1, "id": 17, "name": "train"}, {"color": [182, 182, 255], "isthing": 1, "id": 18, "name": "horse"}, {"color": [0, 143, 149], "isthing": 1, "id": 19, "name": "turtle"}, {"color": [174, 57, 255], "isthing": 1, "id": 20, "name": "bear"}, {"color": [0, 0, 230], "isthing": 1, "id": 21, "name": "motorbike"}, {"color": [72, 0, 118], "isthing": 1, "id": 22, "name": "giraffe"}, {"color": [255, 179, 240], "isthing": 1, "id": 23, "name": "leopard"}, {"color": [0, 125, 92], "isthing": 1, "id": 24, "name": "fox"}, {"color": [209, 0, 151], "isthing": 1, 
"id": 25, "name": "deer"}, {"color": [188, 208, 182], "isthing": 1, "id": 26, "name": "owl"}, {"color": [145, 148, 174], "isthing": 1, "id": 27, "name": "surfboard"}, {"color": [106, 0, 228], "isthing": 1, "id": 28, "name": "airplane"}, {"color": [0, 0, 70], "isthing": 1, "id": 29, "name": "truck"}, {"color": [199, 100, 0], "isthing": 1, "id": 30, "name": "zebra"}, {"color": [166, 196, 102], "isthing": 1, "id": 31, "name": "tiger"}, {"color": [110, 76, 0], "isthing": 1, "id": 32, "name": "elephant"}, {"color": [133, 129, 255], "isthing": 1, "id": 33, "name": "snowboard"}, {"color": [0, 0, 192], "isthing": 1, "id": 34, "name": "boat"}, {"color": [183, 130, 88], "isthing": 1, "id": 35, "name": "shark"}, {"color": [130, 114, 135], "isthing": 1, "id": 36, "name": "mouse"}, {"color": [107, 142, 35], "isthing": 1, "id": 37, "name": "frog"}, {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "eagle"}, {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "earless_seal"}, {"color": [255, 208, 186], "isthing": 1, "id": 40, "name": "tennis_racket"}, ] YTVIS_CATEGORIES_2021 = [ {"color": [106, 0, 228], "isthing": 1, "id": 1, "name": "airplane"}, {"color": [174, 57, 255], "isthing": 1, "id": 2, "name": "bear"}, {"color": [255, 109, 65], "isthing": 1, "id": 3, "name": "bird"}, {"color": [0, 0, 192], "isthing": 1, "id": 4, "name": "boat"}, {"color": [0, 0, 142], "isthing": 1, "id": 5, "name": "car"}, {"color": [255, 77, 255], "isthing": 1, "id": 6, "name": "cat"}, {"color": [120, 166, 157], "isthing": 1, "id": 7, "name": "cow"}, {"color": [209, 0, 151], "isthing": 1, "id": 8, "name": "deer"}, {"color": [0, 226, 252], "isthing": 1, "id": 9, "name": "dog"}, {"color": [179, 0, 194], "isthing": 1, "id": 10, "name": "duck"}, {"color": [174, 255, 243], "isthing": 1, "id": 11, "name": "earless_seal"}, {"color": [110, 76, 0], "isthing": 1, "id": 12, "name": "elephant"}, {"color": [73, 77, 174], "isthing": 1, "id": 13, "name": "fish"}, {"color": [250, 170, 30], "isthing": 1, "id": 14, "name": "flying_disc"}, {"color": [0, 125, 92], "isthing": 1, "id": 15, "name": "fox"}, {"color": [107, 142, 35], "isthing": 1, "id": 16, "name": "frog"}, {"color": [0, 82, 0], "isthing": 1, "id": 17, "name": "giant_panda"}, {"color": [72, 0, 118], "isthing": 1, "id": 18, "name": "giraffe"}, {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, {"color": [255, 179, 240], "isthing": 1, "id": 20, "name": "leopard"}, {"color": [119, 11, 32], "isthing": 1, "id": 21, "name": "lizard"}, {"color": [0, 60, 100], "isthing": 1, "id": 22, "name": "monkey"}, {"color": [0, 0, 230], "isthing": 1, "id": 23, "name": "motorbike"}, {"color": [130, 114, 135], "isthing": 1, "id": 24, "name": "mouse"}, {"color": [165, 42, 42], "isthing": 1, "id": 25, "name": "parrot"}, {"color": [220, 20, 60], "isthing": 1, "id": 26, "name": "person"}, {"color": [100, 170, 30], "isthing": 1, "id": 27, "name": "rabbit"}, {"color": [183, 130, 88], "isthing": 1, "id": 28, "name": "shark"}, {"color": [134, 134, 103], "isthing": 1, "id": 29, "name": "skateboard"}, {"color": [5, 121, 0], "isthing": 1, "id": 30, "name": "snake"}, {"color": [133, 129, 255], "isthing": 1, "id": 31, "name": "snowboard"}, {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "squirrel"}, {"color": [145, 148, 174], "isthing": 1, "id": 33, "name": "surfboard"}, {"color": [255, 208, 186], "isthing": 1, "id": 34, "name": "tennis_racket"}, {"color": [166, 196, 102], "isthing": 1, "id": 35, "name": "tiger"}, {"color": [0, 80, 100], "isthing": 1, "id": 36, "name": "train"}, 
{"color": [0, 0, 70], "isthing": 1, "id": 37, "name": "truck"}, {"color": [0, 143, 149], "isthing": 1, "id": 38, "name": "turtle"}, {"color": [0, 228, 0], "isthing": 1, "id": 39, "name": "whale"}, {"color": [199, 100, 0], "isthing": 1, "id": 40, "name": "zebra"}, ] def _get_ytvis_2019_instances_meta(): thing_ids = [k["id"] for k in YTVIS_CATEGORIES_2019 if k["isthing"] == 1] thing_colors = [k["color"] for k in YTVIS_CATEGORIES_2019 if k["isthing"] == 1] assert len(thing_ids) == 40, len(thing_ids) # Mapping from the incontiguous YTVIS category id to an id in [0, 39] thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} thing_classes = [k["name"] for k in YTVIS_CATEGORIES_2019 if k["isthing"] == 1] ret = { "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes, "thing_colors": thing_colors, } return ret def _get_ytvis_2021_instances_meta(): thing_ids = [k["id"] for k in YTVIS_CATEGORIES_2021 if k["isthing"] == 1] thing_colors = [k["color"] for k in YTVIS_CATEGORIES_2021 if k["isthing"] == 1] assert len(thing_ids) == 40, len(thing_ids) # Mapping from the incontiguous YTVIS category id to an id in [0, 39] thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} thing_classes = [k["name"] for k in YTVIS_CATEGORIES_2021 if k["isthing"] == 1] ret = { "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes, "thing_colors": thing_colors, } return ret def load_ytvis_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None): from .ytvis_api.ytvos import YTVOS timer = Timer() json_file = PathManager.get_local_path(json_file) with contextlib.redirect_stdout(io.StringIO()): ytvis_api = YTVOS(json_file) if timer.seconds() > 1: logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) id_map = None if dataset_name is not None: meta = MetadataCatalog.get(dataset_name) cat_ids = sorted(ytvis_api.getCatIds()) cats = ytvis_api.loadCats(cat_ids) # The categories in a custom json file may not be sorted. thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])] meta.thing_classes = thing_classes # In COCO, certain category ids are artificially removed, # and by convention they are always ignored. # We deal with COCO's id issue and translate # the category ids to contiguous ids in [0, 80). # It works by looking at the "categories" field in the json, therefore # if users' own json also have incontiguous ids, we'll # apply this mapping as well but print a warning. if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)): if "coco" not in dataset_name: logger.warning( """ Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you. 
""" ) id_map = {v: i for i, v in enumerate(cat_ids)} meta.thing_dataset_id_to_contiguous_id = id_map # sort indices for reproducible results vid_ids = sorted(ytvis_api.vids.keys()) # vids is a list of dicts, each looks something like: # {'license': 1, # 'flickr_url': ' ', # 'file_names': ['ff25f55852/00000.jpg', 'ff25f55852/00005.jpg', ..., 'ff25f55852/00175.jpg'], # 'height': 720, # 'width': 1280, # 'length': 36, # 'date_captured': '2019-04-11 00:55:41.903902', # 'id': 2232} vids = ytvis_api.loadVids(vid_ids) anns = [ytvis_api.vidToAnns[vid_id] for vid_id in vid_ids] total_num_valid_anns = sum([len(x) for x in anns]) total_num_anns = len(ytvis_api.anns) if total_num_valid_anns < total_num_anns: logger.warning( f"{json_file} contains {total_num_anns} annotations, but only " f"{total_num_valid_anns} of them match to images in the file." ) vids_anns = list(zip(vids, anns)) logger.info("Loaded {} videos in YTVIS format from {}".format(len(vids_anns), json_file)) dataset_dicts = [] ann_keys = ["iscrowd", "category_id", "id"] + (extra_annotation_keys or []) num_instances_without_valid_segmentation = 0 for (vid_dict, anno_dict_list) in vids_anns: record = {} record["file_names"] = [os.path.join(image_root, vid_dict["file_names"][i]) for i in range(vid_dict["length"])] record["height"] = vid_dict["height"] record["width"] = vid_dict["width"] record["length"] = vid_dict["length"] video_id = record["video_id"] = vid_dict["id"] video_objs = [] for frame_idx in range(record["length"]): frame_objs = [] for anno in anno_dict_list: assert anno["video_id"] == video_id obj = {key: anno[key] for key in ann_keys if key in anno} _bboxes = anno.get("bboxes", None) _segm = anno.get("segmentations", None) if not (_bboxes and _segm and _bboxes[frame_idx] and _segm[frame_idx]): continue bbox = _bboxes[frame_idx] segm = _segm[frame_idx] obj["bbox"] = bbox obj["bbox_mode"] = BoxMode.XYWH_ABS if isinstance(segm, dict): if isinstance(segm["counts"], list): # convert to compressed RLE segm = mask_util.frPyObjects(segm, *segm["size"]) elif segm: # filter out invalid polygons (< 3 points) segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] if len(segm) == 0: num_instances_without_valid_segmentation += 1 continue # ignore this instance obj["segmentation"] = segm if id_map: obj["category_id"] = id_map[obj["category_id"]] frame_objs.append(obj) video_objs.append(frame_objs) record["annotations"] = video_objs dataset_dicts.append(record) if num_instances_without_valid_segmentation > 0: logger.warning( "Filtered out {} instances without valid segmentation. ".format( num_instances_without_valid_segmentation ) + "There might be issues in your dataset generation process. " "A valid polygon should be a list[float] with even length >= 6." ) return dataset_dicts def register_ytvis_instances(name, metadata, json_file, image_root): """ Register a dataset in YTVIS's json annotation format for instance tracking. Args: name (str): the name that identifies a dataset, e.g. "ytvis_train". metadata (dict): extra metadata associated with this dataset. You can leave it as an empty dict. json_file (str): path to the json instance annotation file. image_root (str or path-like): directory which contains all the images. """ assert isinstance(name, str), name assert isinstance(json_file, (str, os.PathLike)), json_file assert isinstance(image_root, (str, os.PathLike)), image_root # 1. register a function which returns dicts DatasetCatalog.register(name, lambda: load_ytvis_json(json_file, image_root, name)) # 2. 
Optionally, add metadata about this dataset, # since they might be useful in evaluation, visualization or logging MetadataCatalog.get(name).set( json_file=json_file, image_root=image_root, evaluator_type="ytvis", **metadata ) if __name__ == "__main__": """ Test the YTVIS json dataset loader. """ from detectron2.utils.logger import setup_logger from detectron2.utils.visualizer import Visualizer import detectron2.data.datasets # noqa # add pre-defined metadata import sys from PIL import Image logger = setup_logger(name=__name__) #assert sys.argv[3] in DatasetCatalog.list() meta = MetadataCatalog.get("ytvis_2019_train") json_file = "./datasets/ytvis/instances_train_sub.json" image_root = "./datasets/ytvis/train/JPEGImages" dicts = load_ytvis_json(json_file, image_root, dataset_name="ytvis_2019_train") logger.info("Done loading {} samples.".format(len(dicts))) dirname = "ytvis-data-vis" os.makedirs(dirname, exist_ok=True) def extract_frame_dic(dic, frame_idx): import copy frame_dic = copy.deepcopy(dic) annos = frame_dic.get("annotations", None) if annos: frame_dic["annotations"] = annos[frame_idx] return frame_dic for d in dicts: vid_name = d["file_names"][0].split('/')[-2] os.makedirs(os.path.join(dirname, vid_name), exist_ok=True) for idx, file_name in enumerate(d["file_names"]): img = np.array(Image.open(file_name)) visualizer = Visualizer(img, metadata=meta) vis = visualizer.draw_dataset_dict(extract_frame_dic(d, idx)) fpath = os.path.join(dirname, vid_name, file_name.split('/')[-1]) vis.save(fpath) ================================================ FILE: mask2former_video/data_video/datasets/ytvis_api/__init__.py ================================================ # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi ================================================ FILE: mask2former_video/data_video/datasets/ytvis_api/ytvos.py ================================================ # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi __author__ = 'ychfan' # Interface for accessing the YouTubeVIS dataset. # The following API functions are defined: # YTVOS - YTVOS api class that loads YouTubeVIS annotation file and prepare data structures. # decodeMask - Decode binary mask M encoded via run-length encoding. # encodeMask - Encode binary mask M using run-length encoding. # getAnnIds - Get ann ids that satisfy given filter conditions. # getCatIds - Get cat ids that satisfy given filter conditions. # getImgIds - Get img ids that satisfy given filter conditions. # loadAnns - Load anns with the specified ids. # loadCats - Load cats with the specified ids. # loadImgs - Load imgs with the specified ids. # annToMask - Convert segmentation in an annotation to binary mask. # loadRes - Load algorithm results and create API for accessing them. # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2014. 
# Licensed under the Simplified BSD License [see bsd.txt] import json import time import matplotlib.pyplot as plt from matplotlib.collections import PatchCollection from matplotlib.patches import Polygon import numpy as np import copy import itertools from pycocotools import mask as maskUtils import os from collections import defaultdict import sys PYTHON_VERSION = sys.version_info[0] if PYTHON_VERSION == 2: from urllib import urlretrieve elif PYTHON_VERSION == 3: from urllib.request import urlretrieve def _isArrayLike(obj): return hasattr(obj, '__iter__') and hasattr(obj, '__len__') class YTVOS: def __init__(self, annotation_file=None): """ Constructor of Microsoft COCO helper class for reading and visualizing annotations. :param annotation_file (str): location of annotation file :param image_folder (str): location to the folder that hosts images. :return: """ # load dataset self.dataset,self.anns,self.cats,self.vids = dict(),dict(),dict(),dict() self.vidToAnns, self.catToVids = defaultdict(list), defaultdict(list) if not annotation_file == None: print('loading annotations into memory...') tic = time.time() dataset = json.load(open(annotation_file, 'r')) assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) print('Done (t={:0.2f}s)'.format(time.time()- tic)) self.dataset = dataset self.createIndex() def createIndex(self): # create index print('creating index...') anns, cats, vids = {}, {}, {} vidToAnns,catToVids = defaultdict(list),defaultdict(list) if 'annotations' in self.dataset: for ann in self.dataset['annotations']: vidToAnns[ann['video_id']].append(ann) anns[ann['id']] = ann if 'videos' in self.dataset: for vid in self.dataset['videos']: vids[vid['id']] = vid if 'categories' in self.dataset: for cat in self.dataset['categories']: cats[cat['id']] = cat if 'annotations' in self.dataset and 'categories' in self.dataset: for ann in self.dataset['annotations']: catToVids[ann['category_id']].append(ann['video_id']) print('index created!') # create class members self.anns = anns self.vidToAnns = vidToAnns self.catToVids = catToVids self.vids = vids self.cats = cats def info(self): """ Print information about the annotation file. :return: """ for key, value in self.dataset['info'].items(): print('{}: {}'.format(key, value)) def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None): """ Get ann ids that satisfy given filter conditions. default skips that filter :param vidIds (int array) : get anns for given vids catIds (int array) : get anns for given cats areaRng (float array) : get anns for given area range (e.g. 
[0 inf]) iscrowd (boolean) : get anns for given crowd label (False or True) :return: ids (int array) : integer array of ann ids """ vidIds = vidIds if _isArrayLike(vidIds) else [vidIds] catIds = catIds if _isArrayLike(catIds) else [catIds] if len(vidIds) == len(catIds) == len(areaRng) == 0: anns = self.dataset['annotations'] else: if not len(vidIds) == 0: lists = [self.vidToAnns[vidId] for vidId in vidIds if vidId in self.vidToAnns] anns = list(itertools.chain.from_iterable(lists)) else: anns = self.dataset['annotations'] anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['avg_area'] > areaRng[0] and ann['avg_area'] < areaRng[1]] if not iscrowd == None: ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] else: ids = [ann['id'] for ann in anns] return ids def getCatIds(self, catNms=[], supNms=[], catIds=[]): """ filtering parameters. default skips that filter. :param catNms (str array) : get cats for given cat names :param supNms (str array) : get cats for given supercategory names :param catIds (int array) : get cats for given cat ids :return: ids (int array) : integer array of cat ids """ catNms = catNms if _isArrayLike(catNms) else [catNms] supNms = supNms if _isArrayLike(supNms) else [supNms] catIds = catIds if _isArrayLike(catIds) else [catIds] if len(catNms) == len(supNms) == len(catIds) == 0: cats = self.dataset['categories'] else: cats = self.dataset['categories'] cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] ids = [cat['id'] for cat in cats] return ids def getVidIds(self, vidIds=[], catIds=[]): ''' Get vid ids that satisfy given filter conditions. :param vidIds (int array) : get vids for given ids :param catIds (int array) : get vids with all given cats :return: ids (int array) : integer array of vid ids ''' vidIds = vidIds if _isArrayLike(vidIds) else [vidIds] catIds = catIds if _isArrayLike(catIds) else [catIds] if len(vidIds) == len(catIds) == 0: ids = self.vids.keys() else: ids = set(vidIds) for i, catId in enumerate(catIds): if i == 0 and len(ids) == 0: ids = set(self.catToVids[catId]) else: ids &= set(self.catToVids[catId]) return list(ids) def loadAnns(self, ids=[]): """ Load anns with the specified ids. :param ids (int array) : integer ids specifying anns :return: anns (object array) : loaded ann objects """ if _isArrayLike(ids): return [self.anns[id] for id in ids] elif type(ids) == int: return [self.anns[ids]] def loadCats(self, ids=[]): """ Load cats with the specified ids. :param ids (int array) : integer ids specifying cats :return: cats (object array) : loaded cat objects """ if _isArrayLike(ids): return [self.cats[id] for id in ids] elif type(ids) == int: return [self.cats[ids]] def loadVids(self, ids=[]): """ Load anns with the specified ids. :param ids (int array) : integer ids specifying vid :return: vids (object array) : loaded vid objects """ if _isArrayLike(ids): return [self.vids[id] for id in ids] elif type(ids) == int: return [self.vids[ids]] def loadRes(self, resFile): """ Load result file and return a result api object. 
:param resFile (str) : file name of result file :return: res (obj) : result api object """ res = YTVOS() res.dataset['videos'] = [img for img in self.dataset['videos']] print('Loading and preparing results...') tic = time.time() if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode): anns = json.load(open(resFile)) elif type(resFile) == np.ndarray: anns = self.loadNumpyAnnotations(resFile) else: anns = resFile assert type(anns) == list, 'results in not an array of objects' annsVidIds = [ann['video_id'] for ann in anns] assert set(annsVidIds) == (set(annsVidIds) & set(self.getVidIds())), \ 'Results do not correspond to current coco set' if 'segmentations' in anns[0]: res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) for id, ann in enumerate(anns): ann['areas'] = [] if not 'bboxes' in ann: ann['bboxes'] = [] for seg in ann['segmentations']: # now only support compressed RLE format as segmentation results if seg: ann['areas'].append(maskUtils.area(seg)) if len(ann['bboxes']) < len(ann['areas']): ann['bboxes'].append(maskUtils.toBbox(seg)) else: ann['areas'].append(None) if len(ann['bboxes']) < len(ann['areas']): ann['bboxes'].append(None) ann['id'] = id+1 l = [a for a in ann['areas'] if a] if len(l)==0: ann['avg_area'] = 0 else: ann['avg_area'] = np.array(l).mean() ann['iscrowd'] = 0 print('DONE (t={:0.2f}s)'.format(time.time()- tic)) res.dataset['annotations'] = anns res.createIndex() return res def annToRLE(self, ann, frameId): """ Convert annotation which can be polygons, uncompressed RLE to RLE. :return: binary mask (numpy 2D array) """ t = self.vids[ann['video_id']] h, w = t['height'], t['width'] segm = ann['segmentations'][frameId] if type(segm) == list: # polygon -- a single object might consist of multiple parts # we merge all parts into one mask rle code rles = maskUtils.frPyObjects(segm, h, w) rle = maskUtils.merge(rles) elif type(segm['counts']) == list: # uncompressed RLE rle = maskUtils.frPyObjects(segm, h, w) else: # rle rle = segm return rle def annToMask(self, ann, frameId): """ Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. :return: binary mask (numpy 2D array) """ rle = self.annToRLE(ann, frameId) m = maskUtils.decode(rle) return m ================================================ FILE: mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py ================================================ # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi __author__ = 'ychfan' import numpy as np import datetime import time from collections import defaultdict from pycocotools import mask as maskUtils import copy class YTVOSeval: # Interface for evaluating video instance segmentation on the YouTubeVIS dataset. # # The usage for YTVOSeval is as follows: # cocoGt=..., cocoDt=... # load dataset and results # E = YTVOSeval(cocoGt,cocoDt); # initialize YTVOSeval object # E.params.recThrs = ...; # set parameters as desired # E.evaluate(); # run per image evaluation # E.accumulate(); # accumulate per image results # E.summarize(); # display summary metrics of results # For example usage see evalDemo.m and http://mscoco.org/. # # The evaluation parameters are as follows (defaults in brackets): # imgIds - [all] N img ids to use for evaluation # catIds - [all] K cat ids to use for evaluation # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation # recThrs - [0:.01:1] R=101 recall thresholds for evaluation # areaRng - [...] 
A=4 object area ranges for evaluation # maxDets - [1 10 100] M=3 thresholds on max detections per image # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints' # iouType replaced the now DEPRECATED useSegm parameter. # useCats - [1] if true use category labels for evaluation # Note: if useCats=0 category labels are ignored as in proposal scoring. # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. # # evaluate(): evaluates detections on every image and every category and # concats the results into the "evalImgs" with fields: # dtIds - [1xD] id for each of the D detections (dt) # gtIds - [1xG] id for each of the G ground truths (gt) # dtMatches - [TxD] matching gt id at each IoU or 0 # gtMatches - [TxG] matching dt id at each IoU or 0 # dtScores - [1xD] confidence of each dt # gtIgnore - [1xG] ignore flag for each gt # dtIgnore - [TxD] ignore flag for each dt at each IoU # # accumulate(): accumulates the per-image, per-category evaluation # results in "evalImgs" into the dictionary "eval" with fields: # params - parameters used for evaluation # date - date evaluation was performed # counts - [T,R,K,A,M] parameter dimensions (see above) # precision - [TxRxKxAxM] precision for every evaluation setting # recall - [TxKxAxM] max recall for every evaluation setting # Note: precision and recall==-1 for settings with no gt objects. # # See also coco, mask, pycocoDemo, pycocoEvalDemo # # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. # Licensed under the Simplified BSD License [see coco/license.txt] def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'): ''' Initialize CocoEval using coco APIs for gt and dt :param cocoGt: coco object with ground truth annotations :param cocoDt: coco object with detection results :return: None ''' if not iouType: print('iouType not specified. 
use default iouType segm') self.cocoGt = cocoGt # ground truth COCO API self.cocoDt = cocoDt # detections COCO API self.params = {} # evaluation parameters self.evalVids = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements self.eval = {} # accumulated evaluation results self._gts = defaultdict(list) # gt for evaluation self._dts = defaultdict(list) # dt for evaluation self.params = Params(iouType=iouType) # parameters self._paramsEval = {} # parameters for evaluation self.stats = [] # result summarization self.ious = {} # ious between all gts and dts if not cocoGt is None: self.params.vidIds = sorted(cocoGt.getVidIds()) self.params.catIds = sorted(cocoGt.getCatIds()) def _prepare(self): ''' Prepare ._gts and ._dts for evaluation based on params :return: None ''' def _toMask(anns, coco): # modify ann['segmentation'] by reference for ann in anns: for i, a in enumerate(ann['segmentations']): if a: rle = coco.annToRLE(ann, i) ann['segmentations'][i] = rle l = [a for a in ann['areas'] if a] if len(l)==0: ann['avg_area'] = 0 else: ann['avg_area'] = np.array(l).mean() p = self.params if p.useCats: gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds)) dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds)) else: gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds)) dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds)) # convert ground truth to mask if iouType == 'segm' if p.iouType == 'segm': _toMask(gts, self.cocoGt) _toMask(dts, self.cocoDt) # set ignore flag for gt in gts: gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] if p.iouType == 'keypoints': gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] self._gts = defaultdict(list) # gt for evaluation self._dts = defaultdict(list) # dt for evaluation for gt in gts: self._gts[gt['video_id'], gt['category_id']].append(gt) for dt in dts: self._dts[dt['video_id'], dt['category_id']].append(dt) self.evalVids = defaultdict(list) # per-image per-category evaluation results self.eval = {} # accumulated evaluation results def evaluate(self): ''' Run per image evaluation on given images and store results (a list of dict) in self.evalVids :return: None ''' tic = time.time() print('Running per image evaluation...') p = self.params # add backward compatibility if useSegm is specified in params if not p.useSegm is None: p.iouType = 'segm' if p.useSegm == 1 else 'bbox' print('useSegm (deprecated) is not None. 
Running {} evaluation'.format(p.iouType)) print('Evaluate annotation type *{}*'.format(p.iouType)) p.vidIds = list(np.unique(p.vidIds)) if p.useCats: p.catIds = list(np.unique(p.catIds)) p.maxDets = sorted(p.maxDets) self.params=p self._prepare() # loop through images, area range, max detection number catIds = p.catIds if p.useCats else [-1] if p.iouType == 'segm' or p.iouType == 'bbox': computeIoU = self.computeIoU elif p.iouType == 'keypoints': computeIoU = self.computeOks self.ious = {(vidId, catId): computeIoU(vidId, catId) \ for vidId in p.vidIds for catId in catIds} evaluateVid = self.evaluateVid maxDet = p.maxDets[-1] self.evalImgs = [evaluateVid(vidId, catId, areaRng, maxDet) for catId in catIds for areaRng in p.areaRng for vidId in p.vidIds ] self._paramsEval = copy.deepcopy(self.params) toc = time.time() print('DONE (t={:0.2f}s).'.format(toc-tic)) def computeIoU(self, vidId, catId): p = self.params if p.useCats: gt = self._gts[vidId,catId] dt = self._dts[vidId,catId] else: gt = [_ for cId in p.catIds for _ in self._gts[vidId,cId]] dt = [_ for cId in p.catIds for _ in self._dts[vidId,cId]] if len(gt) == 0 and len(dt) ==0: return [] inds = np.argsort([-d['score'] for d in dt], kind='mergesort') dt = [dt[i] for i in inds] if len(dt) > p.maxDets[-1]: dt=dt[0:p.maxDets[-1]] if p.iouType == 'segm': g = [g['segmentations'] for g in gt] d = [d['segmentations'] for d in dt] elif p.iouType == 'bbox': g = [g['bboxes'] for g in gt] d = [d['bboxes'] for d in dt] else: raise Exception('unknown iouType for iou computation') # compute iou between each dt and gt region iscrowd = [int(o['iscrowd']) for o in gt] #ious = maskUtils.iou(d,g,iscrowd) def iou_seq(d_seq, g_seq): i = .0 u = .0 for d, g in zip(d_seq, g_seq): if d and g: i += maskUtils.area(maskUtils.merge([d, g], True)) u += maskUtils.area(maskUtils.merge([d, g], False)) elif not d and g: u += maskUtils.area(g) elif d and not g: u += maskUtils.area(d) if not u > .0: print("Mask sizes in video {} and category {} may not match!".format(vidId, catId)) iou = i / u if u > .0 else .0 return iou ious = np.zeros([len(d), len(g)]) for i, j in np.ndindex(ious.shape): ious[i, j] = iou_seq(d[i], g[j]) #print(vidId, catId, ious.shape, ious) return ious def computeOks(self, imgId, catId): p = self.params # dimention here should be Nxm gts = self._gts[imgId, catId] dts = self._dts[imgId, catId] inds = np.argsort([-d['score'] for d in dts], kind='mergesort') dts = [dts[i] for i in inds] if len(dts) > p.maxDets[-1]: dts = dts[0:p.maxDets[-1]] # if len(gts) == 0 and len(dts) == 0: if len(gts) == 0 or len(dts) == 0: return [] ious = np.zeros((len(dts), len(gts))) sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0 vars = (sigmas * 2)**2 k = len(sigmas) # compute oks between each detection and ground truth object for j, gt in enumerate(gts): # create bounds for ignore regions(double the gt bbox) g = np.array(gt['keypoints']) xg = g[0::3]; yg = g[1::3]; vg = g[2::3] k1 = np.count_nonzero(vg > 0) bb = gt['bbox'] x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2 y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2 for i, dt in enumerate(dts): d = np.array(dt['keypoints']) xd = d[0::3]; yd = d[1::3] if k1>0: # measure the per-keypoint distance if keypoints visible dx = xd - xg dy = yd - yg else: # measure minimum distance to keypoints in (x0,y0) & (x1,y1) z = np.zeros((k)) dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0) dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0) e = (dx**2 + dy**2) / vars / 
(gt['avg_area']+np.spacing(1)) / 2 if k1 > 0: e=e[vg > 0] ious[i, j] = np.sum(np.exp(-e)) / e.shape[0] return ious def evaluateVid(self, vidId, catId, aRng, maxDet): ''' perform evaluation for a single category and video :return: dict (single video results) ''' p = self.params if p.useCats: gt = self._gts[vidId,catId] dt = self._dts[vidId,catId] else: gt = [_ for cId in p.catIds for _ in self._gts[vidId,cId]] dt = [_ for cId in p.catIds for _ in self._dts[vidId,cId]] if len(gt) == 0 and len(dt) ==0: return None for g in gt: if g['ignore'] or (g['avg_area']<aRng[0] or g['avg_area']>aRng[1]): g['_ignore'] = 1 else: g['_ignore'] = 0 # sort dt highest score first, sort gt ignore last gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort') gt = [gt[i] for i in gtind] dtind = np.argsort([-d['score'] for d in dt], kind='mergesort') dt = [dt[i] for i in dtind[0:maxDet]] iscrowd = [int(o['iscrowd']) for o in gt] # load computed ious ious = self.ious[vidId, catId][:, gtind] if len(self.ious[vidId, catId]) > 0 else self.ious[vidId, catId] T = len(p.iouThrs) G = len(gt) D = len(dt) gtm = np.zeros((T,G)) dtm = np.zeros((T,D)) gtIg = np.array([g['_ignore'] for g in gt]) dtIg = np.zeros((T,D)) if not len(ious)==0: for tind, t in enumerate(p.iouThrs): for dind, d in enumerate(dt): # information about best match so far (m=-1 -> unmatched) iou = min([t,1-1e-10]) m = -1 for gind, g in enumerate(gt): # if this gt already matched, and not a crowd, continue if gtm[tind,gind]>0 and not iscrowd[gind]: continue # if dt matched to reg gt, and on ignore gt, stop if m>-1 and gtIg[m]==0 and gtIg[gind]==1: break # continue to next gt unless better match made if ious[dind,gind] < iou: continue # if match successful and best so far, store appropriately iou=ious[dind,gind] m=gind # if match made store id of match for both dt and gt if m ==-1: continue dtIg[tind,dind] = gtIg[m] dtm[tind,dind] = gt[m]['id'] gtm[tind,m] = d['id'] # set unmatched detections outside of area range to ignore a = np.array([d['avg_area']<aRng[0] or d['avg_area']>aRng[1] for d in dt]).reshape((1, len(dt))) dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0))) # store results for given video and category return { 'video_id': vidId, 'category_id': catId, 'aRng': aRng, 'maxDet': maxDet, 'dtIds': [d['id'] for d in dt], 'gtIds': [g['id'] for g in gt], 'dtMatches': dtm, 'gtMatches': gtm, 'dtScores': [d['score'] for d in dt], 'gtIgnore': gtIg, 'dtIgnore': dtIg, } def accumulate(self, p = None): ''' Accumulate per image evaluation results and store the result in self.eval :param p: input params for evaluation :return: None ''' print('Accumulating evaluation results...') tic = time.time() if not self.evalImgs: print('Please run evaluate() first') # allows input customized parameters if p is None: p = self.params p.catIds = p.catIds if p.useCats == 1 else [-1] T = len(p.iouThrs) R = len(p.recThrs) K = len(p.catIds) if p.useCats else 1 A = len(p.areaRng) M = len(p.maxDets) precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories recall = -np.ones((T,K,A,M)) scores = -np.ones((T,R,K,A,M)) # create dictionary for future indexing _pe = self._paramsEval catIds = _pe.catIds if _pe.useCats else [-1] setK = set(catIds) setA = set(map(tuple, _pe.areaRng)) setM = set(_pe.maxDets) setI = set(_pe.vidIds) # get inds to evaluate k_list = [n for n, k in enumerate(p.catIds) if k in setK] m_list = [m for n, m in enumerate(p.maxDets) if m in setM] a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA] i_list = [n for n, i in enumerate(p.vidIds) if i in
setI] I0 = len(_pe.vidIds) A0 = len(_pe.areaRng) # retrieve E at each category, area range, and max number of detections for k, k0 in enumerate(k_list): Nk = k0*A0*I0 for a, a0 in enumerate(a_list): Na = a0*I0 for m, maxDet in enumerate(m_list): E = [self.evalImgs[Nk + Na + i] for i in i_list] E = [e for e in E if not e is None] if len(E) == 0: continue dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E]) # different sorting method generates slightly different results. # mergesort is used to be consistent as Matlab implementation. inds = np.argsort(-dtScores, kind='mergesort') dtScoresSorted = dtScores[inds] dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds] dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds] gtIg = np.concatenate([e['gtIgnore'] for e in E]) npig = np.count_nonzero(gtIg==0 ) if npig == 0: continue tps = np.logical_and( dtm, np.logical_not(dtIg) ) fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) ) tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): tp = np.array(tp) fp = np.array(fp) nd = len(tp) rc = tp / npig pr = tp / (fp+tp+np.spacing(1)) q = np.zeros((R,)) ss = np.zeros((R,)) if nd: recall[t,k,a,m] = rc[-1] else: recall[t,k,a,m] = 0 # numpy is slow without cython optimization for accessing elements # use python array gets significant speed improvement pr = pr.tolist(); q = q.tolist() for i in range(nd-1, 0, -1): if pr[i] > pr[i-1]: pr[i-1] = pr[i] inds = np.searchsorted(rc, p.recThrs, side='left') try: for ri, pi in enumerate(inds): q[ri] = pr[pi] ss[ri] = dtScoresSorted[pi] except: pass precision[t,:,k,a,m] = np.array(q) scores[t,:,k,a,m] = np.array(ss) self.eval = { 'params': p, 'counts': [T, R, K, A, M], 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'precision': precision, 'recall': recall, 'scores': scores, } toc = time.time() print('DONE (t={:0.2f}s).'.format( toc-tic)) def summarize(self): ''' Compute and display summary metrics for evaluation results. 
Note this functin can *only* be applied on the default parameter setting ''' def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ): p = self.params iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}' titleStr = 'Average Precision' if ap == 1 else 'Average Recall' typeStr = '(AP)' if ap==1 else '(AR)' iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \ if iouThr is None else '{:0.2f}'.format(iouThr) aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] if ap == 1: # dimension of precision: [TxRxKxAxM] s = self.eval['precision'] # IoU if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:,:,:,aind,mind] else: # dimension of recall: [TxKxAxM] s = self.eval['recall'] if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:,:,aind,mind] if len(s[s>-1])==0: mean_s = -1 else: mean_s = np.mean(s[s>-1]) print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) return mean_s def _summarizeDets(): stats = np.zeros((12,)) stats[0] = _summarize(1) stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2]) stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2]) stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2]) stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2]) stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2]) stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2]) stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2]) stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2]) return stats def _summarizeKps(): stats = np.zeros((10,)) stats[0] = _summarize(1, maxDets=20) stats[1] = _summarize(1, maxDets=20, iouThr=.5) stats[2] = _summarize(1, maxDets=20, iouThr=.75) stats[3] = _summarize(1, maxDets=20, areaRng='medium') stats[4] = _summarize(1, maxDets=20, areaRng='large') stats[5] = _summarize(0, maxDets=20) stats[6] = _summarize(0, maxDets=20, iouThr=.5) stats[7] = _summarize(0, maxDets=20, iouThr=.75) stats[8] = _summarize(0, maxDets=20, areaRng='medium') stats[9] = _summarize(0, maxDets=20, areaRng='large') return stats if not self.eval: raise Exception('Please run accumulate() first') iouType = self.params.iouType if iouType == 'segm' or iouType == 'bbox': summarize = _summarizeDets elif iouType == 'keypoints': summarize = _summarizeKps self.stats = summarize() def __str__(self): self.summarize() class Params: ''' Params for coco evaluation api ''' def setDetParams(self): self.vidIds = [] self.catIds = [] # np.arange causes trouble. 
the data point on arange is slightly larger than the true value #self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True) #self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True) self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True) self.maxDets = [1, 10, 100] self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 128 ** 2], [ 128 ** 2, 256 ** 2], [256 ** 2, 1e5 ** 2]] self.areaRngLbl = ['all', 'small', 'medium', 'large'] self.useCats = 1 def setKpParams(self): self.vidIds = [] self.catIds = [] # np.arange causes trouble. the data point on arange is slightly larger than the true value self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True) self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True) self.maxDets = [20] self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] self.areaRngLbl = ['all', 'medium', 'large'] self.useCats = 1 def __init__(self, iouType='segm'): if iouType == 'segm' or iouType == 'bbox': self.setDetParams() elif iouType == 'keypoints': self.setKpParams() else: raise Exception('iouType not supported') self.iouType = iouType # useSegm is deprecated self.useSegm = None ================================================ FILE: mask2former_video/data_video/ytvis_eval.py ================================================ # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import contextlib import copy import io import itertools import json import logging import numpy as np import os from collections import OrderedDict import pycocotools.mask as mask_util import torch from .datasets.ytvis_api.ytvos import YTVOS from .datasets.ytvis_api.ytvoseval import YTVOSeval from tabulate import tabulate import detectron2.utils.comm as comm from detectron2.config import CfgNode from detectron2.data import MetadataCatalog from detectron2.evaluation import DatasetEvaluator from detectron2.utils.file_io import PathManager from detectron2.utils.logger import create_small_table class YTVISEvaluator(DatasetEvaluator): """ Evaluate AR for object proposals, AP for instance detection/segmentation, AP for keypoint detection outputs using COCO's metrics. See http://cocodataset.org/#detection-eval and http://cocodataset.org/#keypoints-eval to understand its metrics. In addition to COCO, this evaluator is able to support any bounding box detection, instance segmentation, or keypoint detection dataset. """ def __init__( self, dataset_name, tasks=None, distributed=True, output_dir=None, *, use_fast_impl=True, ): """ Args: dataset_name (str): name of the dataset to be evaluated. It must have either the following corresponding metadata: "json_file": the path to the COCO format annotation Or it must be in detectron2's standard dataset format so it can be converted to COCO format automatically. tasks (tuple[str]): tasks that can be evaluated under the given configuration. A task is one of "bbox", "segm", "keypoints". By default, will infer this automatically from predictions. distributed (True): if True, will collect results from all ranks and run evaluation in the main process. Otherwise, will only evaluate the results in the current process. output_dir (str): optional, an output directory to dump all results predicted on the dataset. The dump contains two files: 1. 
"instances_predictions.pth" a file in torch serialization format that contains all the raw original predictions. 2. "coco_instances_results.json" a json file in COCO's result format. use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP. Although the results should be very close to the official implementation in COCO API, it is still recommended to compute results with the official API for use in papers. The faster implementation also uses more RAM. """ self._logger = logging.getLogger(__name__) self._distributed = distributed self._output_dir = output_dir self._use_fast_impl = use_fast_impl if tasks is not None and isinstance(tasks, CfgNode): self._logger.warning( "COCO Evaluator instantiated using config, this is deprecated behavior." " Please pass in explicit arguments instead." ) self._tasks = None # Infering it from predictions should be better else: self._tasks = tasks self._cpu_device = torch.device("cpu") self._metadata = MetadataCatalog.get(dataset_name) json_file = PathManager.get_local_path(self._metadata.json_file) with contextlib.redirect_stdout(io.StringIO()): self._ytvis_api = YTVOS(json_file) # Test set json files do not contain annotations (evaluation must be # performed using the COCO evaluation server). self._do_evaluation = "annotations" in self._ytvis_api.dataset def reset(self): self._predictions = [] def process(self, inputs, outputs): """ Args: inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). It is a list of dict. Each dict corresponds to an image and contains keys like "height", "width", "file_name", "image_id". outputs: the outputs of a COCO model. It is a list of dicts with key "instances" that contains :class:`Instances`. """ prediction = instances_to_coco_json_video(inputs, outputs) self._predictions.extend(prediction) def evaluate(self): """ Args: img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset """ if self._distributed: comm.synchronize() predictions = comm.gather(self._predictions, dst=0) predictions = list(itertools.chain(*predictions)) if not comm.is_main_process(): return {} else: predictions = self._predictions if len(predictions) == 0: self._logger.warning("[COCOEvaluator] Did not receive valid predictions.") return {} if self._output_dir: PathManager.mkdirs(self._output_dir) file_path = os.path.join(self._output_dir, "instances_predictions.pth") with PathManager.open(file_path, "wb") as f: torch.save(predictions, f) self._results = OrderedDict() self._eval_predictions(predictions) # Copy so the caller can do whatever with results return copy.deepcopy(self._results) def _eval_predictions(self, predictions): """ Evaluate predictions. Fill self._results with the metrics of the tasks. """ self._logger.info("Preparing results for YTVIS format ...") # unmap the category ids for COCO if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) num_classes = len(all_contiguous_ids) assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} for result in predictions: category_id = result["category_id"] assert category_id < num_classes, ( f"A prediction has class={category_id}, " f"but the dataset only has {num_classes} classes and " f"predicted class id should be in [0, {num_classes - 1}]." 
) result["category_id"] = reverse_id_mapping[category_id] if self._output_dir: file_path = os.path.join(self._output_dir, "results.json") self._logger.info("Saving results to {}".format(file_path)) with PathManager.open(file_path, "w") as f: f.write(json.dumps(predictions)) f.flush() if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return coco_eval = ( _evaluate_predictions_on_coco( self._ytvis_api, predictions, ) if len(predictions) > 0 else None # cocoapi does not handle empty results very well ) res = self._derive_coco_results( coco_eval, class_names=self._metadata.get("thing_classes") ) self._results["segm"] = res def _derive_coco_results(self, coco_eval, class_names=None): """ Derive the desired score numbers from summarized COCOeval. Args: coco_eval (None or COCOEval): None represents no predictions from model. iou_type (str): class_names (None or list[str]): if provided, will use it to predict per-category AP. Returns: a dict of {metric name: score} """ metrics = ["AP", "AP50", "AP75", "APs", "APm", "APl", "AR1", "AR10"] if coco_eval is None: self._logger.warn("No predictions from the model!") return {metric: float("nan") for metric in metrics} # the standard metrics results = { metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan") for idx, metric in enumerate(metrics) } self._logger.info( "Evaluation results for {}: \n".format("segm") + create_small_table(results) ) if not np.isfinite(sum(results.values())): self._logger.info("Some metrics cannot be computed and is shown as NaN.") if class_names is None or len(class_names) <= 1: return results # Compute per-category AP # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa precisions = coco_eval.eval["precision"] # precision has dims (iou, recall, cls, area range, max dets) assert len(class_names) == precisions.shape[2] results_per_category = [] for idx, name in enumerate(class_names): # area range index 0: all area ranges # max dets index -1: typically 100 per image precision = precisions[:, :, idx, 0, -1] precision = precision[precision > -1] ap = np.mean(precision) if precision.size else float("nan") results_per_category.append(("{}".format(name), float(ap * 100))) # tabulate it N_COLS = min(6, len(results_per_category) * 2) results_flatten = list(itertools.chain(*results_per_category)) results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)]) table = tabulate( results_2d, tablefmt="pipe", floatfmt=".3f", headers=["category", "AP"] * (N_COLS // 2), numalign="left", ) self._logger.info("Per-category {} AP: \n".format("segm") + table) results.update({"AP-" + name: ap for name, ap in results_per_category}) return results def instances_to_coco_json_video(inputs, outputs): """ Dump an "Instances" object to a COCO-format json that's used for evaluation. Args: instances (Instances): video_id (int): the image id Returns: list[dict]: list of json annotations in COCO format. """ assert len(inputs) == 1, "More than one inputs are loaded for inference!" 
video_id = inputs[0]["video_id"] video_length = inputs[0]["length"] scores = outputs["pred_scores"] labels = outputs["pred_labels"] masks = outputs["pred_masks"] ytvis_results = [] for instance_id, (s, l, m) in enumerate(zip(scores, labels, masks)): segms = [ mask_util.encode(np.array(_mask[:, :, None], order="F", dtype="uint8"))[0] for _mask in m ] for rle in segms: rle["counts"] = rle["counts"].decode("utf-8") res = { "video_id": video_id, "score": s, "category_id": l, "segmentations": segms, } ytvis_results.append(res) return ytvis_results def _evaluate_predictions_on_coco( coco_gt, coco_results, img_ids=None, ): """ Evaluate the coco results using COCOEval API. """ assert len(coco_results) > 0 coco_results = copy.deepcopy(coco_results) # When evaluating mask AP, if the results contain bbox, cocoapi will # use the box area as the area of the instance, instead of the mask area. # This leads to a different definition of small/medium/large. # We remove the bbox field to let mask AP use mask area. for c in coco_results: c.pop("bbox", None) coco_dt = coco_gt.loadRes(coco_results) coco_eval = YTVOSeval(coco_gt, coco_dt) # For COCO, the default max_dets_per_image is [1, 10, 100]. max_dets_per_image = [1, 10, 100] # Default from COCOEval coco_eval.params.maxDets = max_dets_per_image if img_ids is not None: coco_eval.params.imgIds = img_ids coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() return coco_eval ================================================ FILE: mask2former_video/modeling/__init__.py ================================================ from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder ================================================ FILE: mask2former_video/modeling/criterion.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py import logging import torch import torch.nn.functional as F from torch import nn from detectron2.utils.comm import get_world_size from detectron2.projects.point_rend.point_features import ( get_uncertain_point_coords_with_randomness, point_sample, ) from mask2former.utils.misc import is_dist_avail_and_initialized import random import cv2 import os def unfold_wo_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) # remove the center pixels size = kernel_size ** 2 unfolded_x = torch.cat(( unfolded_x[:, :, :size // 2], unfolded_x[:, :, size // 2 + 1:] ), dim=2) return unfolded_x def unfold_w_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) return unfolded_x def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation): assert mask_logits.dim() == 4 log_fg_prob = F.logsigmoid(mask_logits) log_bg_prob = F.logsigmoid(-mask_logits) log_fg_prob_unfold = unfold_wo_center( log_fg_prob, kernel_size=pairwise_size, dilation=pairwise_dilation ) log_bg_prob_unfold = unfold_wo_center( log_bg_prob, kernel_size=pairwise_size, 
dilation=pairwise_dilation ) # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j) # we compute the the probability in log space to avoid numerical instability log_same_fg_prob = log_fg_prob[:, :, None] + log_fg_prob_unfold log_same_bg_prob = log_bg_prob[:, :, None] + log_bg_prob_unfold max_ = torch.max(log_same_fg_prob, log_same_bg_prob) log_same_prob = torch.log( torch.exp(log_same_fg_prob - max_) + torch.exp(log_same_bg_prob - max_) ) + max_ # loss = -log(prob) return -log_same_prob[:, 0] def compute_pairwise_term_neighbor(mask_logits, mask_logits_neighbor, pairwise_size, pairwise_dilation): assert mask_logits.dim() == 4 log_fg_prob_neigh = F.logsigmoid(mask_logits_neighbor) log_bg_prob_neigh = F.logsigmoid(-mask_logits_neighbor) log_fg_prob = F.logsigmoid(mask_logits) log_bg_prob = F.logsigmoid(-mask_logits) log_fg_prob_unfold = unfold_w_center( log_fg_prob, kernel_size=pairwise_size, dilation=pairwise_dilation ) # print('log_fg_prob shape:', log_fg_prob.shape, 'log_fg_prob unfold:', log_fg_prob_unfold.shape) log_bg_prob_unfold = unfold_w_center( log_bg_prob, kernel_size=pairwise_size, dilation=pairwise_dilation ) # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j) # we compute the the probability in log space to avoid numerical instability log_same_fg_prob = log_fg_prob_neigh[:, :, None] + log_fg_prob_unfold log_same_bg_prob = log_bg_prob_neigh[:, :, None] + log_bg_prob_unfold max_ = torch.max(log_same_fg_prob, log_same_bg_prob) log_same_prob = torch.log( torch.exp(log_same_fg_prob - max_) + torch.exp(log_same_bg_prob - max_) ) + max_ # loss = -log(prob) return -log_same_prob[:, 0] def dice_coefficient(x, target): eps = 1e-5 n_inst = x.size(0) x = x.reshape(n_inst, -1) target = target.reshape(n_inst, -1) intersection = (x * target).sum(dim=1) union = (x ** 2.0).sum(dim=1) + (target ** 2.0).sum(dim=1) + eps loss = 1. - (2 * intersection / union) return loss def compute_project_term(mask_scores, gt_bitmasks): mask_losses_y = dice_coefficient( mask_scores.max(dim=2, keepdim=True)[0], gt_bitmasks.max(dim=2, keepdim=True)[0] ) mask_losses_x = dice_coefficient( mask_scores.max(dim=3, keepdim=True)[0], gt_bitmasks.max(dim=3, keepdim=True)[0] ) return (mask_losses_x + mask_losses_y).mean() def dice_loss( inputs: torch.Tensor, targets: torch.Tensor, num_masks: float, ): """ Compute the DICE loss, similar to generalized IOU for masks Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). """ inputs = inputs.sigmoid() inputs = inputs.flatten(1) numerator = 2 * (inputs * targets).sum(-1) denominator = inputs.sum(-1) + targets.sum(-1) loss = 1 - (numerator + 1) / (denominator + 1) return loss.sum() / num_masks dice_loss_jit = torch.jit.script( dice_loss ) # type: torch.jit.ScriptModule def sigmoid_ce_loss( inputs: torch.Tensor, targets: torch.Tensor, num_masks: float, ): """ Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). 
Returns: Loss tensor """ loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") return loss.mean(1).sum() / num_masks sigmoid_ce_loss_jit = torch.jit.script( sigmoid_ce_loss ) # type: torch.jit.ScriptModule def visualize_masks(masks, output_dir='masks'): """ Visualize binary mask tensor with shape (N, H, W) and save them as PNG images in the output directory. """ os.makedirs(output_dir, exist_ok=True) n, h, w = masks.shape masks = masks.cpu().numpy() for i in range(n): mask = (masks[i] * 255).astype('uint8') print('mask sum', mask.sum(), mask.max(), mask.min()) # mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR) # mask = mask * 255 # mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR) filename = os.path.join(output_dir, f'mask_{i}.jpg') cv2.imwrite(filename, mask) def calculate_uncertainty(logits): """ We estimate uncerainty as L1 distance between 0.0 and the logit prediction in 'logits' for the foreground class in `classes`. Args: logits (Tensor): A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is the number of foreground classes. The values are logits. Returns: scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most uncertain locations having the highest uncertainty score. """ assert logits.shape[1] == 1 gt_class_logits = logits.clone() return -(torch.abs(gt_class_logits)) class VideoSetCriterion(nn.Module): """This class computes the loss for DETR. The process happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) """ def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses, num_points, oversample_ratio, importance_sample_ratio): """Create the criterion. Parameters: num_classes: number of object categories, omitting the special no-object category matcher: module able to compute a matching between targets and proposals weight_dict: dict containing as key the names of the losses and as values their relative weight. eos_coef: relative classification weight applied to the no-object category losses: list of all the losses to be applied. See get_loss for list of available losses. 
""" super().__init__() self.num_classes = num_classes self.matcher = matcher self.weight_dict = weight_dict self.eos_coef = eos_coef self.losses = losses empty_weight = torch.ones(self.num_classes + 1) empty_weight[-1] = self.eos_coef self.register_buffer("empty_weight", empty_weight) # pointwise mask loss parameters self.num_points = num_points self.oversample_ratio = oversample_ratio self.importance_sample_ratio = importance_sample_ratio self._warmup_iters = 2000 self.register_buffer("_iter", torch.zeros([1])) def loss_labels(self, outputs, targets, indices, num_masks): """Classification loss (NLL) targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] """ assert "pred_logits" in outputs src_logits = outputs["pred_logits"].float() idx = self._get_src_permutation_idx(indices) target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) target_classes = torch.full( src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device ) target_classes[idx] = target_classes_o loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) losses = {"loss_ce": loss_ce} return losses def loss_masks(self, outputs, targets, indices, num_masks): """Compute the losses related to the masks: the focal loss and the dice loss. targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] """ assert "pred_masks" in outputs src_idx = self._get_src_permutation_idx(indices) src_masks = outputs["pred_masks"] src_masks = src_masks[src_idx] # Modified to handle video target_masks = torch.cat([t['masks'][i] for t, (_, i) in zip(targets, indices)]).to(src_masks) # No need to upsample predictions as we are using normalized coordinates :) # NT x 1 x H x W src_masks = src_masks.flatten(0, 1)[:, None] target_masks = target_masks.flatten(0, 1)[:, None] # print('src_masks shape:', src_masks.shape) # print('target_masks shape:', target_masks.shape) with torch.no_grad(): # sample point_coords point_coords = get_uncertain_point_coords_with_randomness( src_masks, lambda logits: calculate_uncertainty(logits), self.num_points, self.oversample_ratio, self.importance_sample_ratio, ) # get gt labels point_labels = point_sample( target_masks, point_coords, align_corners=False, ).squeeze(1) point_logits = point_sample( src_masks, point_coords, align_corners=False, ).squeeze(1) losses = { "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks), "loss_mask_proj": src_masks.sum() * 0., "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks), "loss_bound": src_masks.sum() * 0., "loss_bound_neighbor": src_masks.sum() * 0., } del src_masks del target_masks return losses def topk_mask(self, images_lab_sim): images_lab_sim_mask = torch.zeros_like(images_lab_sim) topk, indices = torch.topk(images_lab_sim, 5, dim =1) images_lab_sim_mask = images_lab_sim_mask.scatter(1, indices, topk) return images_lab_sim_mask def loss_masks_proj(self, outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2): """Compute the losses related to the masks: the focal loss and the dice loss. 
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] """ assert "pred_masks" in outputs self._iter += 1 # print('images_lab_sim is None:', (images_lab_sim is None)) if images_lab_sim is None: return self.loss_masks(outputs, targets, indices, num_masks) src_idx = self._get_src_permutation_idx(indices) src_masks = outputs["pred_masks"] src_masks = src_masks[src_idx] # Modified to handle video target_masks = torch.cat([t['masks'][i] for t, (_, i) in zip(targets, indices)]).to(src_masks) images_lab_sim = torch.cat(images_lab_sim, dim =0) images_lab_sim_nei = torch.cat(images_lab_sim_nei, dim=0) images_lab_sim_nei1 = torch.cat(images_lab_sim_nei1, dim=0) images_lab_sim_nei2 = torch.cat(images_lab_sim_nei2, dim=0) images_lab_sim = images_lab_sim.view(-1, target_masks.shape[1], images_lab_sim.shape[-3], images_lab_sim.shape[-2], images_lab_sim.shape[-1]) images_lab_sim_nei = images_lab_sim_nei.unsqueeze(1) images_lab_sim_nei1 = images_lab_sim_nei1.unsqueeze(1) images_lab_sim_nei2 = images_lab_sim_nei2.unsqueeze(1) if len(src_idx[0].tolist()) > 0: images_lab_sim = torch.cat([images_lab_sim[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1) images_lab_sim_nei = self.topk_mask(torch.cat([images_lab_sim_nei[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1)) images_lab_sim_nei1 = self.topk_mask(torch.cat([images_lab_sim_nei1[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1)) images_lab_sim_nei2 = self.topk_mask(torch.cat([images_lab_sim_nei2[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1)) k_size = 3 if src_masks.shape[0] > 0: pairwise_losses_neighbor = compute_pairwise_term_neighbor( src_masks[:,:1], src_masks[:,1:2], k_size, 3 ) pairwise_losses_neighbor1 = compute_pairwise_term_neighbor( src_masks[:,:1], src_masks[:,2:3], k_size, 3 ) pairwise_losses_neighbor2 = compute_pairwise_term_neighbor( src_masks[:,1:2], src_masks[:,2:3], k_size, 3 ) src_masks = src_masks.flatten(0, 1)[:, None] target_masks = target_masks.flatten(0, 1)[:, None] target_masks = F.interpolate(target_masks, (src_masks.shape[-2], src_masks.shape[-1]), mode='bilinear') if src_masks.shape[0] > 0: loss_prj_term = compute_project_term(src_masks.sigmoid(), target_masks) pairwise_losses = compute_pairwise_term( src_masks, 3, 2 ) weights = (images_lab_sim >= 0.3).float() * target_masks.float() target_masks_sum = target_masks.reshape(pairwise_losses_neighbor.shape[0], 3, target_masks.shape[-2], target_masks.shape[-1]).sum(dim=1, keepdim=True) target_masks_sum = (target_masks_sum >= 1.0).float() weights_neighbor = (images_lab_sim_nei >= 0.05).float() * target_masks_sum # ori is 0.5, 0.01, 0.001, 0.005, 0.0001, 0.02, 0.05, 0.075, 0.1 , dy 0.5 weights_neighbor1 = (images_lab_sim_nei1 >= 0.05).float() * target_masks_sum # ori is 0.5, 0.01, 0.001, 0.005, 0.0001, 0.02, 0.05, 0.075, 0.1, dy 0.5 weights_neighbor2 = (images_lab_sim_nei2 >= 0.05).float() * target_masks_sum # ori is 0.5, 0.01, 0.001, 0.005, 0.0001, 0.02, 0.05, 0.075, 0.1, dy 0.5 warmup_factor = min(self._iter.item() / float(self._warmup_iters), 1.0) #1.0 loss_pairwise = (pairwise_losses * weights).sum() / weights.sum().clamp(min=1.0) loss_pairwise_neighbor = (pairwise_losses_neighbor * weights_neighbor).sum() / weights_neighbor.sum().clamp(min=1.0) * warmup_factor loss_pairwise_neighbor1 = (pairwise_losses_neighbor1 * weights_neighbor1).sum() / weights_neighbor1.sum().clamp(min=1.0) * warmup_factor loss_pairwise_neighbor2 = (pairwise_losses_neighbor2 * weights_neighbor2).sum() / 
weights_neighbor2.sum().clamp(min=1.0) * warmup_factor else: loss_prj_term = src_masks.sum() * 0. loss_pairwise = src_masks.sum() * 0. loss_pairwise_neighbor = src_masks.sum() * 0. loss_pairwise_neighbor1 = src_masks.sum() * 0. loss_pairwise_neighbor2 = src_masks.sum() * 0. losses = { "loss_mask": src_masks.sum() * 0., "loss_mask_proj": loss_prj_term, "loss_dice": src_masks.sum() * 0., "loss_bound": loss_pairwise, "loss_bound_neighbor": (loss_pairwise_neighbor + loss_pairwise_neighbor1 + loss_pairwise_neighbor2) * 0.1, } del src_masks del target_masks return losses def _get_src_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) src_idx = torch.cat([src for (src, _) in indices]) return batch_idx, src_idx def _get_tgt_permutation_idx(self, indices): # permute targets following indices batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) tgt_idx = torch.cat([tgt for (_, tgt) in indices]) return batch_idx, tgt_idx def get_loss(self, loss, outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2): loss_map = { 'labels': self.loss_labels, 'masks': self.loss_masks_proj, } assert loss in loss_map, f"do you really want to compute {loss} loss?" if loss == 'masks': return loss_map[loss](outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2) else: return loss_map[loss](outputs, targets, indices, num_masks) def forward(self, outputs, targets, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2): """This performs the loss computation. Parameters: outputs: dict of tensors, see the output specification of the model for the format targets: list of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the losses applied, see each loss' doc """ outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) # Compute the average number of target boxes accross all nodes, for normalization purposes num_masks = sum(len(t["labels"]) for t in targets) num_masks = torch.as_tensor( [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device ) if is_dist_avail_and_initialized(): torch.distributed.all_reduce(num_masks) num_masks = torch.clamp(num_masks / get_world_size(), min=1).item() # Compute all the requested losses losses = {} for loss in self.losses: losses.update(self.get_loss(loss, outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2)) # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
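# For example, assuming the usual setting with 9 intermediate decoder layers, the dict
# returned by forward() ends up with the unsuffixed keys from the final layer plus one
# suffixed copy per intermediate layer, roughly:
#   {"loss_ce": ..., "loss_mask_proj": ..., "loss_bound": ..., "loss_bound_neighbor": ...,
#    "loss_ce_0": ..., "loss_mask_proj_0": ..., ..., "loss_ce_8": ..., "loss_mask_proj_8": ...}
# The caller later scales each entry by the identically suffixed value in weight_dict
# (the weights are duplicated with the same suffixes when the criterion is built).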
if "aux_outputs" in outputs: for i, aux_outputs in enumerate(outputs["aux_outputs"]): indices = self.matcher(aux_outputs, targets) for loss in self.losses: l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2) l_dict = {k + f"_{i}": v for k, v in l_dict.items()} losses.update(l_dict) return losses def __repr__(self): head = "Criterion " + self.__class__.__name__ body = [ "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)), "losses: {}".format(self.losses), "weight_dict: {}".format(self.weight_dict), "num_classes: {}".format(self.num_classes), "eos_coef: {}".format(self.eos_coef), "num_points: {}".format(self.num_points), "oversample_ratio: {}".format(self.oversample_ratio), "importance_sample_ratio: {}".format(self.importance_sample_ratio), ] _repr_indent = 4 lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: mask2former_video/modeling/matcher.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py """ Modules to compute the matching cost and solve the corresponding LSAP. """ import torch import torch.nn.functional as F from scipy.optimize import linear_sum_assignment from torch import nn from torch.cuda.amp import autocast from detectron2.projects.point_rend.point_features import point_sample import cv2 import os # def visualize_masks(masks, output_dir='masks_new'): # """ # Visualize binary mask tensor with shape (N, H, W) and save them as PNG images in the output directory. # """ # os.makedirs(output_dir, exist_ok=True) # masks = masks.flatten(0, 1) # print('masks shape:', masks.shape) # n, h, w = masks.shape # for i in range(n): # mask = masks[i].cpu().numpy() # mask = (mask * 255).astype('uint8') # # mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR) # filename = os.path.join(output_dir, f'mask_{i}.png') # cv2.imwrite(filename, mask) def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: """ Compute the bounding boxes around the provided masks. Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. Args: masks (Tensor[N, H, W]): masks to transform where N is the number of masks and (H, W) are the spatial dimensions. Returns: Tensor[N, 4]: bounding boxes """ if masks.numel() == 0: return masks n = masks.shape[0] masks = masks.flatten(0, 1) for index, mask in enumerate(masks): y, x = torch.where(mask != 0) if len(x) * len(y) == 0: continue masks[index, torch.min(y):torch.max(y)+1, torch.min(x):torch.max(x)+1] = 1.0 masks = masks.view(n, -1, masks.shape[-2], masks.shape[-1]) return masks def masks_to_boxes_new(masks: torch.Tensor) -> torch.Tensor: """ Compute the bounding boxes around the provided masks. Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. Args: masks (Tensor[N, H, W]): masks to transform where N is the number of masks and (H, W) are the spatial dimensions. 
Returns: Tensor[N, T, H, W]: box-region masks (1 inside each instance's bounding box, 0 outside) """ if masks.numel() == 0: return masks n, _, h, w = masks.shape masks = masks.flatten(0, 1) y = torch.arange(0, h, dtype=torch.float).to(masks.device) x = torch.arange(0, w, dtype=torch.float).to(masks.device) y, x = torch.meshgrid(y, x) x_mask = (masks * x.unsqueeze(0)) x_max = x_mask.flatten(1).max(-1)[0] + 1 x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] y_mask = (masks * y.unsqueeze(0)) y_max = y_mask.flatten(1).max(-1)[0] + 1 y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] boxes = torch.stack([x_min, y_min, x_max, y_max], 1) # print('boxes shape:', boxes.shape) mem_mask = torch.zeros_like(masks) hMask = torch.logical_or(torch.arange(h).unsqueeze(0).to(boxes) < boxes[:, 1, None], torch.arange(h).unsqueeze(0).to(boxes) >= boxes[:, 3, None]) wMask = torch.logical_or(torch.arange(w).unsqueeze(0).to(boxes) < boxes[:, 0, None], torch.arange(w).unsqueeze(0).to(boxes) >= boxes[:, 2, None]) mem_mask = torch.logical_or(hMask.unsqueeze(2), wMask.unsqueeze(1)).float() # print('mem mask shape:', mem_mask.shape) mem_mask = 1.0 - mem_mask.view(n, -1, masks.shape[-2], masks.shape[-1]) return mem_mask def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): """ Compute the DICE loss, similar to generalized IOU for masks Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). """ inputs = inputs.sigmoid() inputs = inputs.flatten(1) numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] loss = 1 - (numerator + 1) / (denominator + 1) return loss def batch_dice_loss_nosig(inputs: torch.Tensor, targets: torch.Tensor): """ Compute the DICE loss, similar to generalized IOU for masks Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). """ # inputs = inputs.sigmoid() inputs = inputs.flatten(1) numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] loss = 1 - (numerator + 1) / (denominator + 1) return loss batch_dice_loss_jit = torch.jit.script( batch_dice_loss ) # type: torch.jit.ScriptModule batch_dice_loss_jit_nosig = torch.jit.script( batch_dice_loss_nosig ) # type: torch.jit.ScriptModule def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): """ Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). Returns: Loss tensor """ hw = inputs.shape[1] pos = F.binary_cross_entropy_with_logits( inputs, torch.ones_like(inputs), reduction="none" ) neg = F.binary_cross_entropy_with_logits( inputs, torch.zeros_like(inputs), reduction="none" ) loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( "nc,mc->nm", neg, (1 - targets) ) return loss / hw def batch_sigmoid_ce_loss_nosig(inputs: torch.Tensor, targets: torch.Tensor): """ Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs.
Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). Returns: Loss tensor """ hw = inputs.shape[1] pos = F.binary_cross_entropy( inputs, torch.ones_like(inputs), reduction="none" ) neg = F.binary_cross_entropy( inputs, torch.zeros_like(inputs), reduction="none" ) loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( "nc,mc->nm", neg, (1 - targets) ) #print('loss max no sig:', loss.max()) return loss / hw batch_sigmoid_ce_loss_jit = torch.jit.script( batch_sigmoid_ce_loss ) # type: torch.jit.ScriptModule batch_sigmoid_ce_loss_jit_nosig = torch.jit.script( batch_sigmoid_ce_loss_nosig ) # type: torch.jit.ScriptModule class VideoHungarianMatcher(nn.Module): """This class computes an assignment between the targets and the predictions of the network For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are un-matched (and thus treated as non-objects). """ def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0): """Creates the matcher Params: cost_class: This is the relative weight of the classification error in the matching cost cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost """ super().__init__() self.cost_class = cost_class self.cost_mask = cost_mask self.cost_dice = cost_dice assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0" self.num_points = num_points @torch.no_grad() def memory_efficient_forward(self, outputs, targets): """More memory-friendly matching""" bs, num_queries = outputs["pred_logits"].shape[:2] indices = [] # Iterate through batch size for b in range(bs): out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] tgt_ids = targets[b]["labels"] # Compute the classification cost. Contrary to the loss, we don't use the NLL, # but approximate it in 1 - proba[target class]. # The 1 is a constant that doesn't change the matching, it can be ommitted. cost_class = -out_prob[:, tgt_ids] out_mask = outputs["pred_masks"][b] # [num_queries, T, H_pred, W_pred] is_ytvis = (out_mask.shape[1] == 3) # change here if is_ytvis: # out_mask_c = masks_to_boxes((out_mask.sigmoid() > 0.5).clone().float()).float() out_mask = masks_to_boxes_new((out_mask.sigmoid() > 0.5).float()).float() # ori match # visualize_masks(out_mask, 'box_mask_convert') # gt masks are already padded when preparing target tgt_mask = targets[b]["masks"].to(out_mask) # [num_gts, T, H_pred, W_pred] if is_ytvis: tgt_mask = masks_to_boxes(tgt_mask).float() # ori match, change here will also influnce criterion # all masks share the same set of points for efficient matching! 
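# Shape sketch for the shared point sampling below (writing P for self.num_points):
# point_coords is [1, P, 2] with normalized coordinates in [0, 1); after point_sample
# and flatten(1), tgt_mask becomes [num_gts, T * P] and out_mask becomes
# [num_queries, T * P], so every cost term compares predictions and targets at
# exactly the same sampled locations.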
point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device) # get gt labels tgt_mask = point_sample( tgt_mask, point_coords.repeat(tgt_mask.shape[0], 1, 1), align_corners=False, ).flatten(1) out_mask = point_sample( out_mask, point_coords.repeat(out_mask.shape[0], 1, 1), align_corners=False, ).flatten(1) with autocast(enabled=False): out_mask = out_mask.float() tgt_mask = tgt_mask.float() # Compute the dice loss betwen masks if not is_ytvis: cost_dice = batch_dice_loss_jit(out_mask, tgt_mask) cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask) else: cost_dice_nosig = batch_dice_loss_jit_nosig(out_mask, tgt_mask) # Final cost matrix if not is_ytvis: C = ( self.cost_mask * cost_mask + self.cost_class * cost_class + self.cost_dice * cost_dice ) else: C = ( self.cost_class * cost_class + self.cost_dice * cost_dice_nosig ) C = C.reshape(num_queries, -1).cpu() indices.append(linear_sum_assignment(C)) return [ (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices ] @torch.no_grad() def forward(self, outputs, targets): """Performs the matching Params: outputs: This is a dict that contains at least these entries: "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth objects in the target) containing the class labels "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks Returns: A list of size batch_size, containing tuples of (index_i, index_j) where: - index_i is the indices of the selected predictions (in order) - index_j is the indices of the corresponding selected targets (in order) For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) """ return self.memory_efficient_forward(outputs, targets) def __repr__(self, _repr_indent=4): head = "Matcher " + self.__class__.__name__ body = [ "cost_class: {}".format(self.cost_class), "cost_mask: {}".format(self.cost_mask), "cost_dice: {}".format(self.cost_dice), ] lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: mask2former_video/modeling/transformer_decoder/__init__.py ================================================ from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder ================================================ FILE: mask2former_video/modeling/transformer_decoder/position_encoding.py ================================================ # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py """ Various positional encodings for the transformer. """ import math import torch from torch import nn class PositionEmbeddingSine3D(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. 
""" def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): super().__init__() self.num_pos_feats = num_pos_feats self.temperature = temperature self.normalize = normalize if scale is not None and normalize is False: raise ValueError("normalize should be True if scale is passed") if scale is None: scale = 2 * math.pi self.scale = scale def forward(self, x, mask=None): # b, t, c, h, w assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead" if mask is None: mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool) not_mask = ~mask z_embed = not_mask.cumsum(1, dtype=torch.float32) y_embed = not_mask.cumsum(2, dtype=torch.float32) x_embed = not_mask.cumsum(3, dtype=torch.float32) if self.normalize: eps = 1e-6 z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device) dim_t_z = self.temperature ** (2 * (dim_t_z // 2) / (self.num_pos_feats * 2)) pos_x = x_embed[:, :, :, :, None] / dim_t pos_y = y_embed[:, :, :, :, None] / dim_t pos_z = z_embed[:, :, :, :, None] / dim_t_z pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) pos_z = torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) # b, t, c, h, w return pos ================================================ FILE: mask2former_video/modeling/transformer_decoder/video_mask2former_transformer_decoder.py ================================================ # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py import logging import fvcore.nn.weight_init as weight_init from typing import Optional import torch from torch import nn, Tensor from torch.nn import functional as F from detectron2.config import configurable from detectron2.layers import Conv2d from mask2former.modeling.transformer_decoder.maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY from .position_encoding import PositionEmbeddingSine3D class SelfAttentionLayer(nn.Module): def __init__(self, d_model, nhead, dropout=0.0, activation="relu", normalize_before=False): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.norm = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): q = k = self.with_pos_embed(tgt, query_pos) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def 
forward_pre(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.norm(tgt) q = k = self.with_pos_embed(tgt2, query_pos) tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): if self.normalize_before: return self.forward_pre(tgt, tgt_mask, tgt_key_padding_mask, query_pos) return self.forward_post(tgt, tgt_mask, tgt_key_padding_mask, query_pos) class CrossAttentionLayer(nn.Module): def __init__(self, d_model, nhead, dropout=0.0, activation="relu", normalize_before=False): super().__init__() self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.norm = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.norm(tgt) tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): if self.normalize_before: return self.forward_pre(tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos) return self.forward_post(tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos) class FFNLayer(nn.Module): def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, activation="relu", normalize_before=False): super().__init__() # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm = nn.LayerNorm(d_model) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt): tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt): tgt2 = self.norm(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) tgt = tgt + 
self.dropout(tgt2) return tgt def forward(self, tgt): if self.normalize_before: return self.forward_pre(tgt) return self.forward_post(tgt) def _get_activation_fn(activation): """Return an activation function given a string""" if activation == "relu": return F.relu if activation == "gelu": return F.gelu if activation == "glu": return F.glu raise RuntimeError(F"activation should be relu/gelu, not {activation}.") class MLP(nn.Module): """ Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) def forward(self, x): for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x @TRANSFORMER_DECODER_REGISTRY.register() class VideoMultiScaleMaskedTransformerDecoder(nn.Module): _version = 2 def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k if "static_query" in k: newk = k.replace("static_query", "query_feat") if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, in_channels, mask_classification=True, *, num_classes: int, hidden_dim: int, num_queries: int, nheads: int, dim_feedforward: int, dec_layers: int, pre_norm: bool, mask_dim: int, enforce_input_project: bool, # video related num_frames, ): """ NOTE: this interface is experimental. 
Args: in_channels: channels of the input features mask_classification: whether to add mask classifier or not num_classes: number of classes hidden_dim: Transformer feature dimension num_queries: number of queries nheads: number of heads dim_feedforward: feature dimension in feedforward network enc_layers: number of Transformer encoder layers dec_layers: number of Transformer decoder layers pre_norm: whether to use pre-LayerNorm or not mask_dim: mask feature dimension enforce_input_project: add input project 1x1 conv even if input channels and hidden dim is identical """ super().__init__() assert mask_classification, "Only support mask classification model" self.mask_classification = mask_classification self.num_frames = num_frames # positional encoding N_steps = hidden_dim // 2 self.pe_layer = PositionEmbeddingSine3D(N_steps, normalize=True) # define Transformer decoder here self.num_heads = nheads self.num_layers = dec_layers self.transformer_self_attention_layers = nn.ModuleList() self.transformer_cross_attention_layers = nn.ModuleList() self.transformer_ffn_layers = nn.ModuleList() for _ in range(self.num_layers): self.transformer_self_attention_layers.append( SelfAttentionLayer( d_model=hidden_dim, nhead=nheads, dropout=0.0, normalize_before=pre_norm, ) ) self.transformer_cross_attention_layers.append( CrossAttentionLayer( d_model=hidden_dim, nhead=nheads, dropout=0.0, normalize_before=pre_norm, ) ) self.transformer_ffn_layers.append( FFNLayer( d_model=hidden_dim, dim_feedforward=dim_feedforward, dropout=0.0, normalize_before=pre_norm, ) ) self.decoder_norm = nn.LayerNorm(hidden_dim) self.num_queries = num_queries # learnable query features self.query_feat = nn.Embedding(num_queries, hidden_dim) # learnable query p.e. self.query_embed = nn.Embedding(num_queries, hidden_dim) # level embedding (we always use 3 scales) self.num_feature_levels = 3 self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim) self.input_proj = nn.ModuleList() for _ in range(self.num_feature_levels): if in_channels != hidden_dim or enforce_input_project: self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1)) weight_init.c2_xavier_fill(self.input_proj[-1]) else: self.input_proj.append(nn.Sequential()) # output FFNs if self.mask_classification: self.class_embed = nn.Linear(hidden_dim, num_classes + 1) self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) @classmethod def from_config(cls, cfg, in_channels, mask_classification): ret = {} ret["in_channels"] = in_channels ret["mask_classification"] = mask_classification ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES # Transformer parameters: ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD # NOTE: because we add learnable query features which requires supervision, # we add minus 1 to decoder layers to be consistent with our loss # implementation: that is, number of auxiliary losses is always # equal to number of decoder layers. With learnable query features, the number of # auxiliary losses equals number of decoders plus 1. 
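# Concretely, assuming the common Mask2Former setting DEC_LAYERS = 10: the decoder is
# built with 9 layers, and together with the extra prediction made directly on the
# learnable query features this gives 10 sets of predictions per forward pass,
# i.e. 9 auxiliary losses plus the final loss.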
assert cfg.MODEL.MASK_FORMER.DEC_LAYERS >= 1 ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS - 1 ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["num_frames"] = cfg.INPUT.SAMPLING_FRAME_NUM return ret def forward(self, x, mask_features, mask = None): bt, c_m, h_m, w_m = mask_features.shape if bt == 6 or bt == 3: # 3 is for swinl which cannot afford batch size 2 bs = bt // self.num_frames if self.training else 1 else: bs = bt // 4 if self.training else 1 # change here t = bt // bs mask_features = mask_features.view(bs, t, c_m, h_m, w_m) # x is a list of multi-scale feature assert len(x) == self.num_feature_levels src = [] pos = [] size_list = [] # disable mask, it does not affect performance del mask for i in range(self.num_feature_levels): size_list.append(x[i].shape[-2:]) pos.append(self.pe_layer(x[i].view(bs, t, -1, size_list[-1][0], size_list[-1][1]), None).flatten(3)) src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None]) # NTxCxHW => NxTxCxHW => (TxHW)xNxC _, c, hw = src[-1].shape pos[-1] = pos[-1].view(bs, t, c, hw).permute(1, 3, 0, 2).flatten(0, 1) src[-1] = src[-1].view(bs, t, c, hw).permute(1, 3, 0, 2).flatten(0, 1) # QxNxC query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1) predictions_class = [] predictions_mask = [] # prediction heads on learnable query features outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0]) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) for i in range(self.num_layers): level_index = i % self.num_feature_levels attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False # attention: cross-attention first output = self.transformer_cross_attention_layers[i]( output, src[level_index], memory_mask=attn_mask, memory_key_padding_mask=None, # here we do not apply masking on padded region pos=pos[level_index], query_pos=query_embed ) output = self.transformer_self_attention_layers[i]( output, tgt_mask=None, tgt_key_padding_mask=None, query_pos=query_embed ) # FFN output = self.transformer_ffn_layers[i]( output ) outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels]) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) assert len(predictions_class) == self.num_layers + 1 out = { 'pred_logits': predictions_class[-1], 'pred_masks': predictions_mask[-1], 'aux_outputs': self._set_aux_loss( predictions_class if self.mask_classification else None, predictions_mask ) } return out def forward_prediction_heads(self, output, mask_features, attn_mask_target_size): decoder_output = self.decoder_norm(output) decoder_output = decoder_output.transpose(0, 1) outputs_class = self.class_embed(decoder_output) mask_embed = self.mask_embed(decoder_output) outputs_mask = torch.einsum("bqc,btchw->bqthw", mask_embed, mask_features) b, q, t, _, _ = outputs_mask.shape # NOTE: prediction is of higher-resolution # [B, Q, T, H, W] -> [B, Q, T*H*W] -> [B, h, Q, T*H*W] -> [B*h, Q, T*HW] attn_mask = F.interpolate(outputs_mask.flatten(0, 1), size=attn_mask_target_size, mode="bilinear", align_corners=False).view( b, q, t, attn_mask_target_size[0], attn_mask_target_size[1]) # must use bool type # If a 
BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged. attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool() attn_mask = attn_mask.detach() return outputs_class, outputs_mask, attn_mask @torch.jit.unused def _set_aux_loss(self, outputs_class, outputs_seg_masks): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. if self.mask_classification: return [ {"pred_logits": a, "pred_masks": b} for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) ] else: return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] ================================================ FILE: mask2former_video/utils/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: mask2former_video/utils/__init__.py.new ================================================ ================================================ FILE: mask2former_video/utils/memory.py ================================================ import logging from contextlib import contextmanager from functools import wraps import torch from torch.cuda.amp import autocast __all__ = ["retry_if_cuda_oom"] @contextmanager def _ignore_torch_cuda_oom(): """ A context which ignores CUDA OOM exception from pytorch. """ try: yield except RuntimeError as e: # NOTE: the string may change? if "CUDA out of memory. " in str(e): pass else: raise def retry_if_cuda_oom(func): """ Makes a function retry itself after encountering pytorch's CUDA OOM error. It will first retry after calling `torch.cuda.empty_cache()`. If that still fails, it will then retry by trying to convert inputs to CPUs. In this case, it expects the function to dispatch to CPU implementation. The return values may become CPU tensors as well and it's user's responsibility to convert it back to CUDA tensor if needed. Args: func: a stateless callable that takes tensor-like objects as arguments Returns: a callable which retries `func` if OOM is encountered. Examples: :: output = retry_if_cuda_oom(some_torch_function)(input1, input2) # output may be on CPU even if inputs are on GPU Note: 1. When converting inputs to CPU, it will only look at each argument and check if it has `.device` and `.to` for conversion. Nested structures of tensors are not supported. 2. Since the function might be called more than once, it has to be stateless. """ def maybe_to_cpu(x): try: like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") except AttributeError: like_gpu_tensor = False if like_gpu_tensor: return x.to(device="cpu").to(torch.float32) else: return x @wraps(func) def wrapped(*args, **kwargs): with _ignore_torch_cuda_oom(): return func(*args, **kwargs) # Clear cache and retry torch.cuda.empty_cache() with _ignore_torch_cuda_oom(): return func(*args, **kwargs) # Try on CPU. This slows down the code significantly, therefore print a notice. 
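# Reaching this point means both GPU attempts (the plain call and the call after
# torch.cuda.empty_cache()) hit CUDA OOM, so the arguments are copied to CPU and cast
# to float32 below, and autocast is disabled for the CPU retry. A typical
# (hypothetical) call site, mirroring the docstring example:
#   out = retry_if_cuda_oom(torch.nn.functional.interpolate)(pred_masks, size=(h, w), mode="bilinear")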
logger = logging.getLogger(__name__) logger.info("Attempting to copy inputs to CPU due to CUDA OOM") new_args = (maybe_to_cpu(x) for x in args) new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} with autocast(enabled=False): return func(*new_args, **new_kwargs) return wrapped ================================================ FILE: mask2former_video/video_maskformer_model.py ================================================ import logging import math from typing import Tuple import torch from torch import nn from torch.nn import functional as F from detectron2.config import configurable from detectron2.data import MetadataCatalog from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head from detectron2.modeling.backbone import Backbone from detectron2.modeling.postprocessing import sem_seg_postprocess from detectron2.structures import Boxes, ImageList, Instances, BitMasks from .modeling.criterion import VideoSetCriterion from .modeling.matcher import VideoHungarianMatcher from .utils.memory import retry_if_cuda_oom from skimage import color import cv2 import numpy as np def unfold_wo_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) # remove the center pixels size = kernel_size ** 2 unfolded_x = torch.cat(( unfolded_x[:, :, :size // 2], unfolded_x[:, :, size // 2 + 1:] ), dim=2) return unfolded_x def unfold_w_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) return unfolded_x def get_images_color_similarity(images, kernel_size, dilation): assert images.dim() == 4 assert images.size(0) == 1 unfolded_images = unfold_wo_center( images, kernel_size=kernel_size, dilation=dilation ) diff = images[:, :, None] - unfolded_images similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5) return similarity def get_neighbor_images_color_similarity(images, images_neighbor, kernel_size, dilation): assert images.dim() == 4 assert images.size(0) == 1 unfolded_images = unfold_w_center( images, kernel_size=kernel_size, dilation=dilation ) diff = images_neighbor[:, :, None] - unfolded_images similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5) return similarity def get_neighbor_images_patch_color_similarity(images, images_neighbor, kernel_size, dilation): assert images.dim() == 4 assert images.size(0) == 1 unfolded_images = unfold_w_center( images, kernel_size=kernel_size, dilation= 1 #dilation ) unfolded_images_neighbor = unfold_w_center( images_neighbor, kernel_size=kernel_size, dilation= 1 #dilation ) unfolded_images = unfolded_images.flatten(1,2) unfolded_images_neighbor = unfolded_images_neighbor.flatten(1,2) similarity = get_neighbor_images_color_similarity(unfolded_images, unfolded_images_neighbor, 3, 3) return similarity logger = logging.getLogger(__name__) @META_ARCH_REGISTRY.register() class VideoMaskFormer(nn.Module): """ Main class for mask classification semantic segmentation architectures. 
""" @configurable def __init__( self, *, backbone: Backbone, sem_seg_head: nn.Module, criterion: nn.Module, num_queries: int, object_mask_threshold: float, overlap_threshold: float, metadata, size_divisibility: int, sem_seg_postprocess_before_inference: bool, pixel_mean: Tuple[float], pixel_std: Tuple[float], # video num_frames, ): """ Args: backbone: a backbone module, must follow detectron2's backbone interface sem_seg_head: a module that predicts semantic segmentation from backbone features criterion: a module that defines the loss num_queries: int, number of queries object_mask_threshold: float, threshold to filter query based on classification score for panoptic segmentation inference overlap_threshold: overlap threshold used in general inference for panoptic segmentation metadata: dataset meta, get `thing` and `stuff` category names for panoptic segmentation inference size_divisibility: Some backbones require the input height and width to be divisible by a specific integer. We can use this to override such requirement. sem_seg_postprocess_before_inference: whether to resize the prediction back to original input size before semantic segmentation inference or after. For high-resolution dataset like Mapillary, resizing predictions before inference will cause OOM error. pixel_mean, pixel_std: list or tuple with #channels element, representing the per-channel mean and std to be used to normalize the input image semantic_on: bool, whether to output semantic segmentation prediction instance_on: bool, whether to output instance segmentation prediction panoptic_on: bool, whether to output panoptic segmentation prediction test_topk_per_image: int, instance segmentation parameter, keep topk instances per image """ super().__init__() self.backbone = backbone self.sem_seg_head = sem_seg_head self.criterion = criterion self.num_queries = num_queries self.overlap_threshold = overlap_threshold self.object_mask_threshold = object_mask_threshold self.metadata = metadata if size_divisibility < 0: # use backbone size_divisibility if not set size_divisibility = self.backbone.size_divisibility self.size_divisibility = size_divisibility self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) self.num_frames = num_frames #self.structure_fc = nn.Conv2d(27, 256, 1) @classmethod def from_config(cls, cfg): backbone = build_backbone(cfg) sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape()) # Loss parameters: deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT # loss weights class_weight = cfg.MODEL.MASK_FORMER.CLASS_WEIGHT dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT # building criterion matcher = VideoHungarianMatcher( cost_class=class_weight, cost_mask=mask_weight, cost_dice=dice_weight, num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS, ) weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, "loss_mask_proj": mask_weight, "loss_dice": dice_weight, "loss_bound": mask_weight, "loss_bound_neighbor": mask_weight, "loss_out_box": mask_weight} if deep_supervision: dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS aux_weight_dict = {} for i in range(dec_layers - 1): aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) losses = ["labels", 
"masks"] criterion = VideoSetCriterion( sem_seg_head.num_classes, matcher=matcher, weight_dict=weight_dict, eos_coef=no_object_weight, losses=losses, num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS, oversample_ratio=cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO, importance_sample_ratio=cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO, ) return { "backbone": backbone, "sem_seg_head": sem_seg_head, "criterion": criterion, "num_queries": cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES, "object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD, "overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD, "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY, "sem_seg_postprocess_before_inference": True, "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, # video "num_frames": cfg.INPUT.SAMPLING_FRAME_NUM, } @property def device(self): return self.pixel_mean.device def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper`. Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * "image": Tensor, image in (C, H, W) format. * "instances": per-region ground truth * Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model (may be different from input resolution), used in inference. Returns: list[dict]: each dict has the results for one image. The dict contains the following keys: * "sem_seg": A Tensor that represents the per-pixel segmentation prediced by the head. The prediction has shape KxHxW that represents the logits of each class for each pixel. * "panoptic_seg": A tuple that represent panoptic output panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. segments_info (list[dict]): Describe each segment in `panoptic_seg`. Each dict contains keys "id", "category_id", "isthing". 
""" images = [] for video in batched_inputs: for frame in video["image"]: images.append(frame.to(self.device)) is_coco = (len(images) == 8) or (len(images) == 4)# change here, 4 is for swinl with bs 1 which cannot afford batch size 2 if self.training and not is_coco: k_size = 3 rs_images = ImageList.from_tensors(images, self.size_divisibility) downsampled_images = F.avg_pool2d(rs_images.tensor.float(), kernel_size=4, stride=4, padding=0) #for img in images] images_lab = [torch.as_tensor(color.rgb2lab(ds_image[[2, 1, 0]].byte().permute(1, 2, 0).cpu().numpy()), device=ds_image.device, dtype=torch.float32).permute(2, 0, 1) for ds_image in downsampled_images] images_lab_sim = [get_images_color_similarity(img_lab.unsqueeze(0), k_size, 2) for img_lab in images_lab] # ori is 0.3, 0.5, 0.7 images_lab_sim_nei = [get_neighbor_images_patch_color_similarity(images_lab[ii].unsqueeze(0), images_lab[ii+1].unsqueeze(0), 3, 3) for ii in range(0, len(images_lab), 3)] # change k form 3 to 5, ori is 3, ori dilation is 3 images_lab_sim_nei1 = [get_neighbor_images_patch_color_similarity(images_lab[ii].unsqueeze(0), images_lab[ii+2].unsqueeze(0), 3, 3) for ii in range(0, len(images_lab), 3)] images_lab_sim_nei2 = [get_neighbor_images_patch_color_similarity(images_lab[ii+1].unsqueeze(0), images_lab[ii+2].unsqueeze(0), 3, 3) for ii in range(0, len(images_lab), 3)] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.size_divisibility) features = self.backbone(images.tensor) outputs = self.sem_seg_head(features) if self.training: # mask classification target targets = self.prepare_targets(batched_inputs, images, is_coco) if not is_coco: # bipartite matching-based loss losses = self.criterion(outputs, targets, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2) else: losses = self.criterion(outputs, targets, None, None, None, None) for k in list(losses.keys()): if k in self.criterion.weight_dict: losses[k] *= self.criterion.weight_dict[k] else: # remove this loss if not specified in `weight_dict` losses.pop(k) return losses else: mask_cls_results = outputs["pred_logits"] mask_pred_results = outputs["pred_masks"] mask_cls_result = mask_cls_results[0] # upsample masks mask_pred_result = retry_if_cuda_oom(F.interpolate)( mask_pred_results[0], size=(images.tensor.shape[-2], images.tensor.shape[-1]), mode="bilinear", align_corners=False, ) del outputs input_per_image = batched_inputs[0] image_size = images.image_sizes[0] # image size without padding after data augmentation height = input_per_image.get("height", image_size[0]) # raw image size before data augmentation width = input_per_image.get("width", image_size[1]) return retry_if_cuda_oom(self.inference_video)(mask_cls_result, mask_pred_result, image_size, height, width) def prepare_targets(self, targets, images, is_coco): h_pad, w_pad = images.tensor.shape[-2:] gt_instances = [] for targets_per_video in targets: _num_instance = len(targets_per_video["instances"][0]) if is_coco: mask_shape = [_num_instance, 4, h_pad, w_pad] #change here else: mask_shape = [_num_instance, self.num_frames, h_pad, w_pad] gt_masks_per_video = torch.zeros(mask_shape, dtype=torch.bool, device=self.device) gt_classes_per_video = targets_per_video["instances"][0].gt_classes.to(self.device) gt_ids_per_video = [] for f_i, targets_per_frame in enumerate(targets_per_video["instances"]): targets_per_frame = targets_per_frame.to(self.device) h, w = targets_per_frame.image_size _update_cls = gt_classes_per_video == 
-1 gt_classes_per_video[_update_cls] = targets_per_frame.gt_classes[_update_cls] gt_ids_per_video.append(targets_per_frame.gt_ids[:, None]) if isinstance(targets_per_frame.gt_masks, BitMasks): gt_masks_per_video[:, f_i, :h, :w] = targets_per_frame.gt_masks.tensor else: #polygon gt_masks_per_video[:, f_i, :h, :w] = targets_per_frame.gt_masks gt_ids_per_video = torch.cat(gt_ids_per_video, dim=1) gt_ids_per_video[gt_masks_per_video.sum(dim=(2,3)) == 0] = -1 valid_bool_frame = (gt_ids_per_video != -1) valid_bool_clip = valid_bool_frame.any(dim=-1) # valid_idx = (gt_ids_per_video != -1).any(dim=-1) gt_classes_per_video = gt_classes_per_video[valid_bool_clip].long() #targets_per_frame.gt_classes[valid_idx] # N, gt_ids_per_video = gt_ids_per_video[valid_bool_clip].long() # N, num_frames valid_bool_frame = valid_bool_frame[valid_bool_clip] if len(gt_ids_per_video) > 0: min_id = max(gt_ids_per_video[valid_bool_frame].min(), 0) gt_ids_per_video[valid_bool_frame] -= min_id gt_instances.append({"labels": gt_classes_per_video, "ids": gt_ids_per_video}) gt_masks_per_video = gt_masks_per_video[valid_bool_clip].float() # N, num_frames, H, W gt_instances[-1].update({"masks": gt_masks_per_video}) return gt_instances def inference_video(self, pred_cls, pred_masks, img_size, output_height, output_width): if len(pred_cls) > 0: scores = F.softmax(pred_cls, dim=-1)[:, :-1] labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1) # keep top-10 predictions scores_per_image, topk_indices = scores.flatten(0, 1).topk(10, sorted=False) labels_per_image = labels[topk_indices] topk_indices = topk_indices // self.sem_seg_head.num_classes pred_masks = pred_masks[topk_indices] pred_masks = pred_masks[:, :, : img_size[0], : img_size[1]] pred_masks = F.interpolate( pred_masks, size=(output_height, output_width), mode="bilinear", align_corners=False ) masks = pred_masks > 0. out_scores = scores_per_image.tolist() out_labels = labels_per_image.tolist() out_masks = [m for m in masks.cpu()] else: out_scores = [] out_labels = [] out_masks = [] video_output = { "image_size": (output_height, output_width), "pred_scores": out_scores, "pred_labels": out_labels, "pred_masks": out_masks, } return video_output ================================================ FILE: mfvis_nococo/__init__.py ================================================ from . 
import modeling # config from .config import add_maskformer2_video_config # models from .video_maskformer_model import VideoMaskFormer # video from .data_video import ( YTVISDatasetMapper, YTVISEvaluator, build_detection_train_loader, build_detection_test_loader, get_detection_dataset_dicts, ) ================================================ FILE: mfvis_nococo/configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml ================================================ MODEL: BACKBONE: FREEZE_AT: 0 NAME: "build_resnet_backbone" WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] MASK_ON: True RESNETS: DEPTH: 50 STEM_TYPE: "basic" # not used STEM_OUT_CHANNELS: 64 STRIDE_IN_1X1: False OUT_FEATURES: ["res2", "res3", "res4", "res5"] # NORM: "SyncBN" RES5_MULTI_GRID: [1, 1, 1] # not used DATASETS: TRAIN: ("ytvis_2019_train",) TEST: ("ytvis_2019_val",) SOLVER: IMS_PER_BATCH: 16 BASE_LR: 0.0001 STEPS: (4000,) MAX_ITER: 6000 WARMUP_FACTOR: 1.0 WARMUP_ITERS: 10 WEIGHT_DECAY: 0.05 OPTIMIZER: "ADAMW" BACKBONE_MULTIPLIER: 0.1 CLIP_GRADIENTS: ENABLED: True CLIP_TYPE: "full_model" CLIP_VALUE: 0.01 NORM_TYPE: 2.0 AMP: ENABLED: True INPUT: MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" RANDOM_FLIP: "flip_by_clip" AUGMENTATIONS: [] MIN_SIZE_TRAIN: (360, 480) MIN_SIZE_TEST: 360 CROP: ENABLED: False TYPE: "absolute_range" SIZE: (600, 720) FORMAT: "RGB" TEST: EVAL_PERIOD: 0 DATALOADER: FILTER_EMPTY_ANNOTATIONS: False NUM_WORKERS: 4 VERSION: 2 ================================================ FILE: mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep_coco.yaml ================================================ _BASE_: video_maskformer2_R50_bs16_8ep.yaml OUTPUT_DIR: 'box_patch_newknn_t5s5_spretrained1_r101_correct' MODEL: WEIGHTS: "./pretrained_model/model_final_eba159.pkl" RESNETS: DEPTH: 101 STEM_TYPE: "basic" # not used STEM_OUT_CHANNELS: 64 STRIDE_IN_1X1: False OUT_FEATURES: ["res2", "res3", "res4", "res5"] # NORM: "SyncBN" RES5_MULTI_GRID: [1, 1, 1] # not used ================================================ FILE: mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml ================================================ _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml OUTPUT_DIR: 'box_patch_newknn_t5s5_spretrained3_correct1' SEED: 29118357 MODEL: WEIGHTS: "./model_final_proj.pth" META_ARCHITECTURE: "VideoMaskFormer" SEM_SEG_HEAD: NAME: "MaskFormerHead" IGNORE_VALUE: 255 NUM_CLASSES: 40 LOSS_WEIGHT: 1.0 CONVS_DIM: 256 MASK_DIM: 256 NORM: "GN" # pixel decoder PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" IN_FEATURES: ["res2", "res3", "res4", "res5"] DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] COMMON_STRIDE: 4 TRANSFORMER_ENC_LAYERS: 6 MASK_FORMER: TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" DEEP_SUPERVISION: True NO_OBJECT_WEIGHT: 0.1 CLASS_WEIGHT: 2.0 MASK_WEIGHT: 5.0 DICE_WEIGHT: 5.0 HIDDEN_DIM: 256 NUM_OBJECT_QUERIES: 100 NHEADS: 8 DROPOUT: 0.0 DIM_FEEDFORWARD: 2048 ENC_LAYERS: 0 PRE_NORM: False ENFORCE_INPUT_PROJ: False SIZE_DIVISIBILITY: 32 DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query TRAIN_NUM_POINTS: 20000 #20000 #12544 OVERSAMPLE_RATIO: 3.0 IMPORTANCE_SAMPLE_RATIO: 0.75 TEST: SEMANTIC_ON: False INSTANCE_ON: True PANOPTIC_ON: False OVERLAP_THRESHOLD: 0.8 OBJECT_MASK_THRESHOLD: 0.8 ================================================ FILE: 
mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep_coco.yaml ================================================ _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml OUTPUT_DIR: 'box_patch_newknn_t5s5_spretrained3_coco_correct1' SEED: 29118357 MODEL: WEIGHTS: "./pretrained_model/model_final_3c8ec9.pkl" META_ARCHITECTURE: "VideoMaskFormer" SEM_SEG_HEAD: NAME: "MaskFormerHead" IGNORE_VALUE: 255 NUM_CLASSES: 40 LOSS_WEIGHT: 1.0 CONVS_DIM: 256 MASK_DIM: 256 NORM: "GN" # pixel decoder PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" IN_FEATURES: ["res2", "res3", "res4", "res5"] DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] COMMON_STRIDE: 4 TRANSFORMER_ENC_LAYERS: 6 MASK_FORMER: TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" DEEP_SUPERVISION: True NO_OBJECT_WEIGHT: 0.1 CLASS_WEIGHT: 2.0 MASK_WEIGHT: 5.0 DICE_WEIGHT: 5.0 HIDDEN_DIM: 256 NUM_OBJECT_QUERIES: 100 NHEADS: 8 DROPOUT: 0.0 DIM_FEEDFORWARD: 2048 ENC_LAYERS: 0 PRE_NORM: False ENFORCE_INPUT_PROJ: False SIZE_DIVISIBILITY: 32 DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query TRAIN_NUM_POINTS: 20000 #20000 #12544 OVERSAMPLE_RATIO: 3.0 IMPORTANCE_SAMPLE_RATIO: 0.75 TEST: SEMANTIC_ON: False INSTANCE_ON: True PANOPTIC_ON: False OVERLAP_THRESHOLD: 0.8 OBJECT_MASK_THRESHOLD: 0.8 ================================================ FILE: mfvis_nococo/mask2former/__init__.py ================================================ from . import data # register all new datasets from . import modeling # config from .config import add_maskformer2_config # dataset loading from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( MaskFormerInstanceDatasetMapper, ) from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( MaskFormerPanopticDatasetMapper, ) from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( MaskFormerSemanticDatasetMapper, ) # models from .maskformer_model import MaskFormer from .test_time_augmentation import SemanticSegmentorWithTTA # evaluation from .evaluation.instance_evaluation import InstanceSegEvaluator ================================================ FILE: mfvis_nococo/mask2former/config.py ================================================ # -*- coding: utf-8 -*- from detectron2.config import CfgNode as CN def add_maskformer2_config(cfg): """ Add config for MASK_FORMER. """ # NOTE: configs from original maskformer # data config # select the dataset mapper cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" # Color augmentation cfg.INPUT.COLOR_AUG_SSD = False # We retry random cropping until no single category in semantic segmentation GT occupies more # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 # Pad image and segmentation GT in dataset mapper. 
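# (A value of -1 disables padding inside the dataset mappers themselves, which only pad when
#  this value is > 0; final padding then happens in the model via ImageList batching,
#  governed by MODEL.MASK_FORMER.SIZE_DIVISIBILITY set further below.  Illustrative usage of
#  this helper, assuming detectron2 is installed; the config path is hypothetical:
#
#      from detectron2.config import get_cfg
#      cfg = get_cfg()
#      add_maskformer2_config(cfg)                     # register the keys before merging
#      cfg.merge_from_file("path/to/maskformer2_R50.yaml")
#  )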
cfg.INPUT.SIZE_DIVISIBILITY = -1 # solver config # weight decay on embedding cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 # optimizer cfg.SOLVER.OPTIMIZER = "ADAMW" cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 # mask_former model config cfg.MODEL.MASK_FORMER = CN() # loss cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 # transformer config cfg.MODEL.MASK_FORMER.NHEADS = 8 cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 cfg.MODEL.MASK_FORMER.PRE_NORM = False cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False # mask_former inference config cfg.MODEL.MASK_FORMER.TEST = CN() cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) # you can use this config to override cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 # pixel decoder config cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 # adding transformer in pixel decoder cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 # pixel decoder cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" # swin transformer backbone cfg.MODEL.SWIN = CN() cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 cfg.MODEL.SWIN.PATCH_SIZE = 4 cfg.MODEL.SWIN.EMBED_DIM = 96 cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] cfg.MODEL.SWIN.WINDOW_SIZE = 7 cfg.MODEL.SWIN.MLP_RATIO = 4.0 cfg.MODEL.SWIN.QKV_BIAS = True cfg.MODEL.SWIN.QK_SCALE = None cfg.MODEL.SWIN.DROP_RATE = 0.0 cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 cfg.MODEL.SWIN.APE = False cfg.MODEL.SWIN.PATCH_NORM = True cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] cfg.MODEL.SWIN.USE_CHECKPOINT = False # NOTE: maskformer2 extra configs # transformer module cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" # LSJ aug cfg.INPUT.IMAGE_SIZE = 1024 cfg.INPUT.MIN_SCALE = 0.1 cfg.INPUT.MAX_SCALE = 2.0 # MSDeformAttn encoder configs cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 # point loss configs # Number of points sampled during training for a mask point head. cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the # original paper. cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in # the original paper. cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 ================================================ FILE: mfvis_nococo/mask2former/data/__init__.py ================================================ from . 
import datasets ================================================ FILE: mfvis_nococo/mask2former/data/dataset_mappers/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: mfvis_nococo/mask2former/data/dataset_mappers/__init__.py.new ================================================ ================================================ FILE: mfvis_nococo/mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py import copy import logging import numpy as np import torch from detectron2.config import configurable from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from detectron2.data.transforms import TransformGen from detectron2.structures import BitMasks, Instances from pycocotools import mask as coco_mask __all__ = ["COCOInstanceNewBaselineDatasetMapper"] def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: """ Compute the bounding boxes around the provided masks. Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. Args: masks (Tensor[N, H, W]): masks to transform where N is the number of masks and (H, W) are the spatial dimensions. Returns: Tensor[N, 4]: bounding boxes """ if masks.numel() == 0: return masks n = masks.shape[0] for index, mask in enumerate(masks): y, x = torch.where(mask != 0) if len(x) * len(y) == 0: continue h = torch.max(y) - torch.min(y) w = torch.max(x) - torch.min(x) masks[index, torch.min(y):torch.max(y), torch.min(x):torch.max(x)] = 1.0 return masks def convert_coco_poly_to_mask(segmentations, height, width): masks = [] for polygons in segmentations: rles = coco_mask.frPyObjects(polygons, height, width) mask = coco_mask.decode(rles) if len(mask.shape) < 3: mask = mask[..., None] mask = torch.as_tensor(mask, dtype=torch.uint8) mask = mask.any(dim=2) masks.append(mask) if masks: masks = torch.stack(masks, dim=0) masks = masks_to_boxes(masks) else: masks = torch.zeros((0, height, width), dtype=torch.uint8) return masks def build_transform_gen(cfg, is_train): """ Create a list of default :class:`Augmentation` from config. Now it includes resizing and flipping. Returns: list[Augmentation] """ assert is_train, "Only support training augmentation" image_size = cfg.INPUT.IMAGE_SIZE min_scale = cfg.INPUT.MIN_SCALE max_scale = cfg.INPUT.MAX_SCALE augmentation = [] if cfg.INPUT.RANDOM_FLIP != "none": augmentation.append( T.RandomFlip( horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", vertical=cfg.INPUT.RANDOM_FLIP == "vertical", ) ) augmentation.extend([ T.ResizeScale( min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size ), T.FixedSizeCrop(crop_size=(image_size, image_size)), ]) return augmentation # This is specifically designed for the COCO dataset. class COCOInstanceNewBaselineDatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by MaskFormer. This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. 
Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, tfm_gens, image_format, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply tfm_gens: data augmentation image_format: an image format supported by :func:`detection_utils.read_image`. """ self.tfm_gens = tfm_gens logging.getLogger(__name__).info( "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens)) ) self.img_format = image_format self.is_train = is_train @classmethod def from_config(cls, cfg, is_train=True): # Build augmentation tfm_gens = build_transform_gen(cfg, is_train) ret = { "is_train": is_train, "tfm_gens": tfm_gens, "image_format": cfg.INPUT.FORMAT, } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) # TODO: get padding mask # by feeding a "segmentation mask" to the same transforms padding_mask = np.ones(image.shape[:2]) image, transforms = T.apply_transform_gens(self.tfm_gens, image) # the crop transformation has default padding value 0 for segmentation padding_mask = transforms.apply_segmentation(padding_mask) padding_mask = ~ padding_mask.astype(bool) image_shape = image.shape[:2] # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask)) if not self.is_train: # USER: Modify this if you want to keep them for some reason. dataset_dict.pop("annotations", None) return dataset_dict if "annotations" in dataset_dict: # USER: Modify this if you want to keep them for some reason. for anno in dataset_dict["annotations"]: # Let's always keep mask # if not self.mask_on: # anno.pop("segmentation", None) anno.pop("keypoints", None) # USER: Implement additional transformations if you have other types of data annos = [ utils.transform_instance_annotations(obj, transforms, image_shape) for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0 ] # NOTE: does not support BitMask due to augmentation # Current BitMask cannot handle empty objects instances = utils.annotations_to_instances(annos, image_shape) # After transforms such as cropping are applied, the bounding box may no longer # tightly bound the object. As an example, imagine a triangle object # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to # the intersection of original bounding box and the cropping box. 
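# Mask-free note (a reading of the helpers defined at the top of this file): unlike the stock
# Mask2Former mapper, convert_coco_poly_to_mask() here also routes the decoded masks through
# masks_to_boxes(), which overwrites each instance mask with its filled bounding box (the
# slice upper bounds are exclusive, so the last row/column keeps its original values).  The
# gt_masks attached below are therefore box masks, i.e. only box-level supervision reaches
# the model.  Tiny illustration (values made up):
#
#     m = torch.zeros(1, 5, 5)
#     m[0, 1, 1] = 1; m[0, 3, 4] = 1
#     m = masks_to_boxes(m)   # rows 1..2, cols 1..3 of instance 0 become 1, plus the two original pixels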
instances.gt_boxes = instances.gt_masks.get_bounding_boxes() # Need to filter empty instances first (due to augmentation) instances = utils.filter_empty_instances(instances) # Generate masks from polygon h, w = instances.image_size # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float) if hasattr(instances, 'gt_masks'): gt_masks = instances.gt_masks gt_masks_box = convert_coco_poly_to_mask(gt_masks.polygons, h, w) instances.gt_masks = gt_masks_box dataset_dict["instances"] = instances return dataset_dict ================================================ FILE: mfvis_nococo/mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py import copy import logging import numpy as np import torch from detectron2.config import configurable from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from detectron2.data.transforms import TransformGen from detectron2.structures import BitMasks, Boxes, Instances __all__ = ["COCOPanopticNewBaselineDatasetMapper"] def build_transform_gen(cfg, is_train): """ Create a list of default :class:`Augmentation` from config. Now it includes resizing and flipping. Returns: list[Augmentation] """ assert is_train, "Only support training augmentation" image_size = cfg.INPUT.IMAGE_SIZE min_scale = cfg.INPUT.MIN_SCALE max_scale = cfg.INPUT.MAX_SCALE augmentation = [] if cfg.INPUT.RANDOM_FLIP != "none": augmentation.append( T.RandomFlip( horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", vertical=cfg.INPUT.RANDOM_FLIP == "vertical", ) ) augmentation.extend([ T.ResizeScale( min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size ), T.FixedSizeCrop(crop_size=(image_size, image_size)), ]) return augmentation # This is specifically designed for the COCO dataset. class COCOPanopticNewBaselineDatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by MaskFormer. This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, tfm_gens, image_format, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply crop_gen: crop augmentation tfm_gens: data augmentation image_format: an image format supported by :func:`detection_utils.read_image`. """ self.tfm_gens = tfm_gens logging.getLogger(__name__).info( "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( str(self.tfm_gens) ) ) self.img_format = image_format self.is_train = is_train @classmethod def from_config(cls, cfg, is_train=True): # Build augmentation tfm_gens = build_transform_gen(cfg, is_train) ret = { "is_train": is_train, "tfm_gens": tfm_gens, "image_format": cfg.INPUT.FORMAT, } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
Returns: dict: a format that builtin models in detectron2 accept """ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) image, transforms = T.apply_transform_gens(self.tfm_gens, image) image_shape = image.shape[:2] # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if not self.is_train: # USER: Modify this if you want to keep them for some reason. dataset_dict.pop("annotations", None) return dataset_dict if "pan_seg_file_name" in dataset_dict: pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") segments_info = dataset_dict["segments_info"] # apply the same transformation to panoptic segmentation pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) from panopticapi.utils import rgb2id pan_seg_gt = rgb2id(pan_seg_gt) instances = Instances(image_shape) classes = [] masks = [] for segment_info in segments_info: class_id = segment_info["category_id"] if not segment_info["iscrowd"]: classes.append(class_id) masks.append(pan_seg_gt == segment_info["id"]) classes = np.array(classes) instances.gt_classes = torch.tensor(classes, dtype=torch.int64) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) instances.gt_boxes = Boxes(torch.zeros((0, 4))) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor instances.gt_boxes = masks.get_bounding_boxes() dataset_dict["instances"] = instances return dataset_dict ================================================ FILE: mfvis_nococo/mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py ================================================ import copy import logging import numpy as np import pycocotools.mask as mask_util import torch from torch.nn import functional as F from detectron2.config import configurable from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from detectron2.projects.point_rend import ColorAugSSDTransform from detectron2.structures import BitMasks, Instances, polygons_to_bitmask __all__ = ["MaskFormerInstanceDatasetMapper"] class MaskFormerInstanceDatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by MaskFormer for instance segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, augmentations, image_format, size_divisibility, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. 
size_divisibility: pad image size to be divisible by this value """ self.is_train = is_train self.tfm_gens = augmentations self.img_format = image_format self.size_divisibility = size_divisibility logger = logging.getLogger(__name__) mode = "training" if is_train else "inference" logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") @classmethod def from_config(cls, cfg, is_train=True): # Build augmentation augs = [ T.ResizeShortestEdge( cfg.INPUT.MIN_SIZE_TRAIN, cfg.INPUT.MAX_SIZE_TRAIN, cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, ) ] if cfg.INPUT.CROP.ENABLED: augs.append( T.RandomCrop( cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE, ) ) if cfg.INPUT.COLOR_AUG_SSD: augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) augs.append(T.RandomFlip()) ret = { "is_train": is_train, "augmentations": augs, "image_format": cfg.INPUT.FORMAT, "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) aug_input = T.AugInput(image) aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) image = aug_input.image # transform instnace masks assert "annotations" in dataset_dict for anno in dataset_dict["annotations"]: anno.pop("keypoints", None) annos = [ utils.transform_instance_annotations(obj, transforms, image.shape[:2]) for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0 ] if len(annos): assert "segmentation" in annos[0] segms = [obj["segmentation"] for obj in annos] masks = [] for segm in segms: if isinstance(segm, list): # polygon masks.append(polygons_to_bitmask(segm, *image.shape[:2])) elif isinstance(segm, dict): # COCO RLE masks.append(mask_util.decode(segm)) elif isinstance(segm, np.ndarray): assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( segm.ndim ) # mask array masks.append(segm) else: raise ValueError( "Cannot convert segmentation of type '{}' to BitMasks!" "Supported types are: polygons as list[list[float] or ndarray]," " COCO-style RLE as a dict, or a binary segmentation mask " " in a 2D numpy array of shape HxW.".format(type(segm)) ) # Pad image and segmentation label here! image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks] classes = [int(obj["category_id"]) for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) if self.size_divisibility > 0: image_size = (image.shape[-2], image.shape[-1]) padding_size = [ 0, self.size_divisibility - image_size[1], 0, self.size_divisibility - image_size[0], ] # pad image image = F.pad(image, padding_size, value=128).contiguous() # pad mask masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks] image_shape = (image.shape[-2], image.shape[-1]) # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. 
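# (Note on the padding block above: F.pad receives [0, div - W, 0, div - H], i.e. the image
#  and masks are padded on the right and bottom up to exactly `self.size_divisibility` pixels
#  in height and width, not to the next multiple of it, so in practice INPUT.SIZE_DIVISIBILITY
#  acts as a fixed target size rather than a true divisor.)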
dataset_dict["image"] = image # Prepare per-category binary masks instances = Instances(image_shape) instances.gt_classes = classes if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1])) else: masks = BitMasks(torch.stack(masks)) instances.gt_masks = masks.tensor dataset_dict["instances"] = instances return dataset_dict ================================================ FILE: mfvis_nococo/mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py ================================================ import copy import logging import numpy as np import torch from torch.nn import functional as F from detectron2.config import configurable from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from detectron2.structures import BitMasks, Instances from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper __all__ = ["MaskFormerPanopticDatasetMapper"] class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by MaskFormer for panoptic segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, augmentations, image_format, ignore_label, size_divisibility, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. ignore_label: the label that is ignored to evaluation size_divisibility: pad image size to be divisible by this value """ super().__init__( is_train, augmentations=augmentations, image_format=image_format, ignore_label=ignore_label, size_divisibility=size_divisibility, ) def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
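# The panoptic ground truth read below is an RGB PNG in the COCO panoptic format; the
# panopticapi rgb2id() call further down decodes each pixel to a segment id as
# id = R + 256 * G + 256**2 * B, which is then compared against each segment_info["id"]
# to build one binary mask per (non-crowd) segment.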
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) # semantic segmentation if "sem_seg_file_name" in dataset_dict: # PyTorch transformation not implemented for uint16, so converting it to double first sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") else: sem_seg_gt = None # panoptic segmentation if "pan_seg_file_name" in dataset_dict: pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") segments_info = dataset_dict["segments_info"] else: pan_seg_gt = None segments_info = None if pan_seg_gt is None: raise ValueError( "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( dataset_dict["file_name"] ) ) aug_input = T.AugInput(image, sem_seg=sem_seg_gt) aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) image = aug_input.image if sem_seg_gt is not None: sem_seg_gt = aug_input.sem_seg # apply the same transformation to panoptic segmentation pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) from panopticapi.utils import rgb2id pan_seg_gt = rgb2id(pan_seg_gt) # Pad image and segmentation label here! image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if sem_seg_gt is not None: sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) if self.size_divisibility > 0: image_size = (image.shape[-2], image.shape[-1]) padding_size = [ 0, self.size_divisibility - image_size[1], 0, self.size_divisibility - image_size[0], ] image = F.pad(image, padding_size, value=128).contiguous() if sem_seg_gt is not None: sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() pan_seg_gt = F.pad( pan_seg_gt, padding_size, value=0 ).contiguous() # 0 is the VOID panoptic label image_shape = (image.shape[-2], image.shape[-1]) # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. 
dataset_dict["image"] = image if sem_seg_gt is not None: dataset_dict["sem_seg"] = sem_seg_gt.long() if "annotations" in dataset_dict: raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") # Prepare per-category binary masks pan_seg_gt = pan_seg_gt.numpy() instances = Instances(image_shape) classes = [] masks = [] for segment_info in segments_info: class_id = segment_info["category_id"] if not segment_info["iscrowd"]: classes.append(class_id) masks.append(pan_seg_gt == segment_info["id"]) classes = np.array(classes) instances.gt_classes = torch.tensor(classes, dtype=torch.int64) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor dataset_dict["instances"] = instances return dataset_dict ================================================ FILE: mfvis_nococo/mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py ================================================ import copy import logging import numpy as np import torch from torch.nn import functional as F from detectron2.config import configurable from detectron2.data import MetadataCatalog from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from detectron2.projects.point_rend import ColorAugSSDTransform from detectron2.structures import BitMasks, Instances __all__ = ["MaskFormerSemanticDatasetMapper"] class MaskFormerSemanticDatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by MaskFormer for semantic segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, augmentations, image_format, ignore_label, size_divisibility, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. ignore_label: the label that is ignored to evaluation size_divisibility: pad image size to be divisible by this value """ self.is_train = is_train self.tfm_gens = augmentations self.img_format = image_format self.ignore_label = ignore_label self.size_divisibility = size_divisibility logger = logging.getLogger(__name__) mode = "training" if is_train else "inference" logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") @classmethod def from_config(cls, cfg, is_train=True): # Build augmentation augs = [ T.ResizeShortestEdge( cfg.INPUT.MIN_SIZE_TRAIN, cfg.INPUT.MAX_SIZE_TRAIN, cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, ) ] if cfg.INPUT.CROP.ENABLED: augs.append( T.RandomCrop_CategoryAreaConstraint( cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE, cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, ) ) if cfg.INPUT.COLOR_AUG_SSD: augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) augs.append(T.RandomFlip()) # Assume always applies to the training set. 
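# Because __init__ is wrapped with @configurable, the mapper can also be built directly from
# a cfg, with this from_config() supplying the keyword arguments.  Hedged usage sketch
# (assumes cfg.DATASETS.TRAIN points at a registered dataset whose metadata defines
# `ignore_label`):
#
#     mapper = MaskFormerSemanticDatasetMapper(cfg, is_train=True)
#     loader = detectron2.data.build_detection_train_loader(cfg, mapper=mapper)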
dataset_names = cfg.DATASETS.TRAIN meta = MetadataCatalog.get(dataset_names[0]) ignore_label = meta.ignore_label ret = { "is_train": is_train, "augmentations": augs, "image_format": cfg.INPUT.FORMAT, "ignore_label": ignore_label, "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) if "sem_seg_file_name" in dataset_dict: # PyTorch transformation not implemented for uint16, so converting it to double first sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") else: sem_seg_gt = None if sem_seg_gt is None: raise ValueError( "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( dataset_dict["file_name"] ) ) aug_input = T.AugInput(image, sem_seg=sem_seg_gt) aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) image = aug_input.image sem_seg_gt = aug_input.sem_seg # Pad image and segmentation label here! image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if sem_seg_gt is not None: sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) if self.size_divisibility > 0: image_size = (image.shape[-2], image.shape[-1]) padding_size = [ 0, self.size_divisibility - image_size[1], 0, self.size_divisibility - image_size[0], ] image = F.pad(image, padding_size, value=128).contiguous() if sem_seg_gt is not None: sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() image_shape = (image.shape[-2], image.shape[-1]) # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"] = image if sem_seg_gt is not None: dataset_dict["sem_seg"] = sem_seg_gt.long() if "annotations" in dataset_dict: raise ValueError("Semantic segmentation dataset should not have 'annotations'.") # Prepare per-category binary masks if sem_seg_gt is not None: sem_seg_gt = sem_seg_gt.numpy() instances = Instances(image_shape) classes = np.unique(sem_seg_gt) # remove ignored region classes = classes[classes != self.ignore_label] instances.gt_classes = torch.tensor(classes, dtype=torch.int64) masks = [] for class_id in classes: masks.append(sem_seg_gt == class_id) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor dataset_dict["instances"] = instances return dataset_dict ================================================ FILE: mfvis_nococo/mask2former/data/datasets/__init__.py ================================================ from . 
import ( register_ade20k_full, register_ade20k_panoptic, register_coco_stuff_10k, register_mapillary_vistas, register_coco_panoptic_annos_semseg, register_ade20k_instance, register_mapillary_vistas_panoptic, ) ================================================ FILE: mfvis_nococo/mask2former/data/datasets/register_ade20k_full.py ================================================ import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data.datasets import load_sem_seg ADE20K_SEM_SEG_FULL_CATEGORIES = [ {"name": "wall", "id": 2978, "trainId": 0}, {"name": "building, edifice", "id": 312, "trainId": 1}, {"name": "sky", "id": 2420, "trainId": 2}, {"name": "tree", "id": 2855, "trainId": 3}, {"name": "road, route", "id": 2131, "trainId": 4}, {"name": "floor, flooring", "id": 976, "trainId": 5}, {"name": "ceiling", "id": 447, "trainId": 6}, {"name": "bed", "id": 165, "trainId": 7}, {"name": "sidewalk, pavement", "id": 2377, "trainId": 8}, {"name": "earth, ground", "id": 838, "trainId": 9}, {"name": "cabinet", "id": 350, "trainId": 10}, {"name": "person, individual, someone, somebody, mortal, soul", "id": 1831, "trainId": 11}, {"name": "grass", "id": 1125, "trainId": 12}, {"name": "windowpane, window", "id": 3055, "trainId": 13}, {"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14}, {"name": "mountain, mount", "id": 1610, "trainId": 15}, {"name": "plant, flora, plant life", "id": 1910, "trainId": 16}, {"name": "table", "id": 2684, "trainId": 17}, {"name": "chair", "id": 471, "trainId": 18}, {"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19}, {"name": "door", "id": 774, "trainId": 20}, {"name": "sofa, couch, lounge", "id": 2473, "trainId": 21}, {"name": "sea", "id": 2264, "trainId": 22}, {"name": "painting, picture", "id": 1735, "trainId": 23}, {"name": "water", "id": 2994, "trainId": 24}, {"name": "mirror", "id": 1564, "trainId": 25}, {"name": "house", "id": 1276, "trainId": 26}, {"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27}, {"name": "shelf", "id": 2329, "trainId": 28}, {"name": "armchair", "id": 57, "trainId": 29}, {"name": "fence, fencing", "id": 907, "trainId": 30}, {"name": "field", "id": 913, "trainId": 31}, {"name": "lamp", "id": 1395, "trainId": 32}, {"name": "rock, stone", "id": 2138, "trainId": 33}, {"name": "seat", "id": 2272, "trainId": 34}, {"name": "river", "id": 2128, "trainId": 35}, {"name": "desk", "id": 724, "trainId": 36}, {"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37}, {"name": "railing, rail", "id": 2053, "trainId": 38}, {"name": "signboard, sign", "id": 2380, "trainId": 39}, {"name": "cushion", "id": 689, "trainId": 40}, {"name": "path", "id": 1788, "trainId": 41}, {"name": "work surface", "id": 3087, "trainId": 42}, {"name": "stairs, steps", "id": 2530, "trainId": 43}, {"name": "column, pillar", "id": 581, "trainId": 44}, {"name": "sink", "id": 2388, "trainId": 45}, {"name": "wardrobe, closet, press", "id": 2985, "trainId": 46}, {"name": "snow", "id": 2454, "trainId": 47}, {"name": "refrigerator, icebox", "id": 2096, "trainId": 48}, {"name": "base, pedestal, stand", "id": 137, "trainId": 49}, {"name": "bridge, span", "id": 294, "trainId": 50}, {"name": "blind, screen", "id": 212, "trainId": 51}, {"name": "runway", "id": 2185, "trainId": 52}, {"name": "cliff, drop, drop-off", "id": 524, "trainId": 53}, {"name": "sand", "id": 2212, "trainId": 54}, {"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55}, {"name": "pillow", "id": 1869, "trainId": 56}, 
{"name": "screen door, screen", "id": 2251, "trainId": 57}, {"name": "toilet, can, commode, crapper, pot, potty, stool, throne", "id": 2793, "trainId": 58}, {"name": "skyscraper", "id": 2423, "trainId": 59}, {"name": "grandstand, covered stand", "id": 1121, "trainId": 60}, {"name": "box", "id": 266, "trainId": 61}, {"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62}, {"name": "palm, palm tree", "id": 1744, "trainId": 63}, {"name": "double door", "id": 783, "trainId": 64}, {"name": "coffee table, cocktail table", "id": 571, "trainId": 65}, {"name": "counter", "id": 627, "trainId": 66}, {"name": "countertop", "id": 629, "trainId": 67}, {"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68}, {"name": "kitchen island", "id": 1374, "trainId": 69}, {"name": "boat", "id": 223, "trainId": 70}, {"name": "waterfall, falls", "id": 3016, "trainId": 71}, { "name": "stove, kitchen stove, range, kitchen range, cooking stove", "id": 2598, "trainId": 72, }, {"name": "flower", "id": 978, "trainId": 73}, {"name": "bookcase", "id": 239, "trainId": 74}, {"name": "controls", "id": 608, "trainId": 75}, {"name": "book", "id": 236, "trainId": 76}, {"name": "stairway, staircase", "id": 2531, "trainId": 77}, {"name": "streetlight, street lamp", "id": 2616, "trainId": 78}, { "name": "computer, computing machine, computing device, data processor, electronic computer, information processing system", "id": 591, "trainId": 79, }, { "name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle", "id": 327, "trainId": 80, }, {"name": "swivel chair", "id": 2679, "trainId": 81}, {"name": "light, light source", "id": 1451, "trainId": 82}, {"name": "bench", "id": 181, "trainId": 83}, {"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84}, {"name": "towel", "id": 2821, "trainId": 85}, {"name": "fountain", "id": 1023, "trainId": 86}, {"name": "embankment", "id": 855, "trainId": 87}, { "name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box", "id": 2733, "trainId": 88, }, {"name": "van", "id": 2928, "trainId": 89}, {"name": "hill", "id": 1240, "trainId": 90}, {"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91}, {"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92}, {"name": "truck, motortruck", "id": 2880, "trainId": 93}, {"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94}, {"name": "pole", "id": 1936, "trainId": 95}, {"name": "tower", "id": 2828, "trainId": 96}, {"name": "court", "id": 631, "trainId": 97}, {"name": "ball", "id": 103, "trainId": 98}, { "name": "aircraft carrier, carrier, flattop, attack aircraft carrier", "id": 3144, "trainId": 99, }, {"name": "buffet, counter, sideboard", "id": 308, "trainId": 100}, {"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101}, {"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102}, {"name": "minibike, motorbike", "id": 1563, "trainId": 103}, {"name": "animal, animate being, beast, brute, creature, fauna", "id": 29, "trainId": 104}, {"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105}, {"name": "step, stair", "id": 2569, "trainId": 106}, {"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107}, {"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108}, {"name": "doorframe, doorcase", "id": 778, "trainId": 109}, {"name": "sconce", "id": 2243, "trainId": 110}, {"name": "pond", "id": 
1941, "trainId": 111}, {"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112}, {"name": "bannister, banister, balustrade, balusters, handrail", "id": 120, "trainId": 113}, {"name": "bag", "id": 95, "trainId": 114}, {"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115}, {"name": "gazebo", "id": 1087, "trainId": 116}, {"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117}, {"name": "land, ground, soil", "id": 1401, "trainId": 118}, {"name": "board, plank", "id": 220, "trainId": 119}, {"name": "arcade machine", "id": 47, "trainId": 120}, {"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121}, {"name": "bar", "id": 123, "trainId": 122}, {"name": "stall, stand, sales booth", "id": 2537, "trainId": 123}, {"name": "playground", "id": 1927, "trainId": 124}, {"name": "ship", "id": 2337, "trainId": 125}, {"name": "ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126}, { "name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", "id": 64, "trainId": 127, }, {"name": "bottle", "id": 249, "trainId": 128}, {"name": "cradle", "id": 642, "trainId": 129}, {"name": "pot, flowerpot", "id": 1981, "trainId": 130}, { "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", "id": 609, "trainId": 131, }, {"name": "train, railroad train", "id": 2840, "trainId": 132}, {"name": "stool", "id": 2586, "trainId": 133}, {"name": "lake", "id": 1393, "trainId": 134}, {"name": "tank, storage tank", "id": 2704, "trainId": 135}, {"name": "ice, water ice", "id": 1304, "trainId": 136}, {"name": "basket, handbasket", "id": 146, "trainId": 137}, {"name": "manhole", "id": 1494, "trainId": 138}, {"name": "tent, collapsible shelter", "id": 2739, "trainId": 139}, {"name": "canopy", "id": 389, "trainId": 140}, {"name": "microwave, microwave oven", "id": 1551, "trainId": 141}, {"name": "barrel, cask", "id": 131, "trainId": 142}, {"name": "dirt track", "id": 738, "trainId": 143}, {"name": "beam", "id": 161, "trainId": 144}, {"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145}, {"name": "plate", "id": 1919, "trainId": 146}, {"name": "screen, crt screen", "id": 3109, "trainId": 147}, {"name": "ruins", "id": 2179, "trainId": 148}, {"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149}, {"name": "blanket, cover", "id": 206, "trainId": 150}, {"name": "plaything, toy", "id": 1930, "trainId": 151}, {"name": "food, solid food", "id": 1002, "trainId": 152}, {"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153}, {"name": "oven", "id": 1708, "trainId": 154}, {"name": "stage", "id": 2526, "trainId": 155}, {"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156}, {"name": "umbrella", "id": 2901, "trainId": 157}, {"name": "sculpture", "id": 2262, "trainId": 158}, {"name": "aqueduct", "id": 44, "trainId": 159}, {"name": "container", "id": 597, "trainId": 160}, {"name": "scaffolding, staging", "id": 2235, "trainId": 161}, {"name": "hood, exhaust hood", "id": 1260, "trainId": 162}, {"name": "curb, curbing, kerb", "id": 682, "trainId": 163}, {"name": "roller coaster", "id": 2151, "trainId": 164}, {"name": "horse, equus caballus", "id": 3107, "trainId": 165}, {"name": "catwalk", "id": 432, "trainId": 166}, {"name": "glass, drinking glass", "id": 1098, "trainId": 167}, {"name": "vase", "id": 2932, "trainId": 168}, {"name": "central reservation", "id": 461, "trainId": 169}, 
{"name": "carousel", "id": 410, "trainId": 170}, {"name": "radiator", "id": 2046, "trainId": 171}, {"name": "closet", "id": 533, "trainId": 172}, {"name": "machine", "id": 1481, "trainId": 173}, {"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174}, {"name": "fan", "id": 894, "trainId": 175}, {"name": "inflatable bounce game", "id": 1322, "trainId": 176}, {"name": "pitch", "id": 1891, "trainId": 177}, {"name": "paper", "id": 1756, "trainId": 178}, {"name": "arcade, colonnade", "id": 49, "trainId": 179}, {"name": "hot tub", "id": 1272, "trainId": 180}, {"name": "helicopter", "id": 1229, "trainId": 181}, {"name": "tray", "id": 2850, "trainId": 182}, {"name": "partition, divider", "id": 1784, "trainId": 183}, {"name": "vineyard", "id": 2962, "trainId": 184}, {"name": "bowl", "id": 259, "trainId": 185}, {"name": "bullring", "id": 319, "trainId": 186}, {"name": "flag", "id": 954, "trainId": 187}, {"name": "pot", "id": 1974, "trainId": 188}, {"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189}, {"name": "shower", "id": 2356, "trainId": 190}, {"name": "bag, traveling bag, travelling bag, grip, suitcase", "id": 97, "trainId": 191}, {"name": "bulletin board, notice board", "id": 318, "trainId": 192}, {"name": "confessional booth", "id": 592, "trainId": 193}, {"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194}, {"name": "forest", "id": 1017, "trainId": 195}, {"name": "elevator door", "id": 851, "trainId": 196}, {"name": "laptop, laptop computer", "id": 1407, "trainId": 197}, {"name": "instrument panel", "id": 1332, "trainId": 198}, {"name": "bucket, pail", "id": 303, "trainId": 199}, {"name": "tapestry, tapis", "id": 2714, "trainId": 200}, {"name": "platform", "id": 1924, "trainId": 201}, {"name": "jacket", "id": 1346, "trainId": 202}, {"name": "gate", "id": 1081, "trainId": 203}, {"name": "monitor, monitoring device", "id": 1583, "trainId": 204}, { "name": "telephone booth, phone booth, call box, telephone box, telephone kiosk", "id": 2727, "trainId": 205, }, {"name": "spotlight, spot", "id": 2509, "trainId": 206}, {"name": "ring", "id": 2123, "trainId": 207}, {"name": "control panel", "id": 602, "trainId": 208}, {"name": "blackboard, chalkboard", "id": 202, "trainId": 209}, {"name": "air conditioner, air conditioning", "id": 10, "trainId": 210}, {"name": "chest", "id": 490, "trainId": 211}, {"name": "clock", "id": 530, "trainId": 212}, {"name": "sand dune", "id": 2213, "trainId": 213}, {"name": "pipe, pipage, piping", "id": 1884, "trainId": 214}, {"name": "vault", "id": 2934, "trainId": 215}, {"name": "table football", "id": 2687, "trainId": 216}, {"name": "cannon", "id": 387, "trainId": 217}, {"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218}, {"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219}, {"name": "statue", "id": 2547, "trainId": 220}, { "name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", "id": 1474, "trainId": 221, }, {"name": "exhibitor", "id": 877, "trainId": 222}, {"name": "ladder", "id": 1391, "trainId": 223}, {"name": "carport", "id": 414, "trainId": 224}, {"name": "dam", "id": 698, "trainId": 225}, {"name": "pulpit", "id": 2019, "trainId": 226}, {"name": "skylight, fanlight", "id": 2422, "trainId": 227}, {"name": "water tower", "id": 3010, "trainId": 228}, {"name": "grill, grille, grillwork", "id": 1139, "trainId": 229}, {"name": "display board", "id": 753, "trainId": 230}, {"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231}, 
{"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232}, {"name": "ice rink", "id": 1301, "trainId": 233}, {"name": "fruit", "id": 1033, "trainId": 234}, {"name": "patio", "id": 1789, "trainId": 235}, {"name": "vending machine", "id": 2939, "trainId": 236}, {"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237}, {"name": "net", "id": 1652, "trainId": 238}, { "name": "backpack, back pack, knapsack, packsack, rucksack, haversack", "id": 90, "trainId": 239, }, {"name": "jar", "id": 1349, "trainId": 240}, {"name": "track", "id": 2830, "trainId": 241}, {"name": "magazine", "id": 1485, "trainId": 242}, {"name": "shutter", "id": 2370, "trainId": 243}, {"name": "roof", "id": 2155, "trainId": 244}, {"name": "banner, streamer", "id": 118, "trainId": 245}, {"name": "landfill", "id": 1402, "trainId": 246}, {"name": "post", "id": 1957, "trainId": 247}, {"name": "altarpiece, reredos", "id": 3130, "trainId": 248}, {"name": "hat, chapeau, lid", "id": 1197, "trainId": 249}, {"name": "arch, archway", "id": 52, "trainId": 250}, {"name": "table game", "id": 2688, "trainId": 251}, {"name": "bag, handbag, pocketbook, purse", "id": 96, "trainId": 252}, {"name": "document, written document, papers", "id": 762, "trainId": 253}, {"name": "dome", "id": 772, "trainId": 254}, {"name": "pier", "id": 1857, "trainId": 255}, {"name": "shanties", "id": 2315, "trainId": 256}, {"name": "forecourt", "id": 1016, "trainId": 257}, {"name": "crane", "id": 643, "trainId": 258}, {"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259}, {"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260}, {"name": "drawing", "id": 791, "trainId": 261}, {"name": "cabin", "id": 349, "trainId": 262}, { "name": "ad, advertisement, advertizement, advertising, advertizing, advert", "id": 6, "trainId": 263, }, {"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264}, {"name": "monument", "id": 1587, "trainId": 265}, {"name": "henhouse", "id": 1233, "trainId": 266}, {"name": "cockpit", "id": 559, "trainId": 267}, {"name": "heater, warmer", "id": 1223, "trainId": 268}, {"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269}, {"name": "pool", "id": 1943, "trainId": 270}, {"name": "elevator, lift", "id": 853, "trainId": 271}, {"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272}, {"name": "labyrinth", "id": 1390, "trainId": 273}, {"name": "text, textual matter", "id": 2748, "trainId": 274}, {"name": "printer", "id": 2007, "trainId": 275}, {"name": "mezzanine, first balcony", "id": 1546, "trainId": 276}, {"name": "mattress", "id": 1513, "trainId": 277}, {"name": "straw", "id": 2600, "trainId": 278}, {"name": "stalls", "id": 2538, "trainId": 279}, {"name": "patio, terrace", "id": 1790, "trainId": 280}, {"name": "billboard, hoarding", "id": 194, "trainId": 281}, {"name": "bus stop", "id": 326, "trainId": 282}, {"name": "trouser, pant", "id": 2877, "trainId": 283}, {"name": "console table, console", "id": 594, "trainId": 284}, {"name": "rack", "id": 2036, "trainId": 285}, {"name": "notebook", "id": 1662, "trainId": 286}, {"name": "shrine", "id": 2366, "trainId": 287}, {"name": "pantry", "id": 1754, "trainId": 288}, {"name": "cart", "id": 418, "trainId": 289}, {"name": "steam shovel", "id": 2553, "trainId": 290}, {"name": "porch", "id": 1951, "trainId": 291}, {"name": "postbox, mailbox, letter box", "id": 1963, "trainId": 292}, {"name": "figurine, statuette", "id": 918, "trainId": 293}, {"name": "recycling bin", "id": 2086, "trainId": 294}, 
{"name": "folding screen", "id": 997, "trainId": 295}, {"name": "telescope", "id": 2731, "trainId": 296}, {"name": "deck chair, beach chair", "id": 704, "trainId": 297}, {"name": "kennel", "id": 1365, "trainId": 298}, {"name": "coffee maker", "id": 569, "trainId": 299}, {"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300}, {"name": "fish", "id": 948, "trainId": 301}, {"name": "easel", "id": 839, "trainId": 302}, {"name": "artificial golf green", "id": 63, "trainId": 303}, {"name": "iceberg", "id": 1305, "trainId": 304}, {"name": "candlestick, candle holder", "id": 378, "trainId": 305}, {"name": "shower stall, shower bath", "id": 2362, "trainId": 306}, {"name": "television stand", "id": 2734, "trainId": 307}, { "name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle", "id": 2982, "trainId": 308, }, {"name": "skeleton", "id": 2398, "trainId": 309}, {"name": "grand piano, grand", "id": 1119, "trainId": 310}, {"name": "candy, confect", "id": 382, "trainId": 311}, {"name": "grille door", "id": 1141, "trainId": 312}, {"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313}, {"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314}, {"name": "shoe", "id": 2341, "trainId": 315}, {"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316}, {"name": "shanty", "id": 2316, "trainId": 317}, {"name": "structure", "id": 2626, "trainId": 318}, {"name": "rocking chair, rocker", "id": 3104, "trainId": 319}, {"name": "bird", "id": 198, "trainId": 320}, {"name": "place mat", "id": 1896, "trainId": 321}, {"name": "tomb", "id": 2800, "trainId": 322}, {"name": "big top", "id": 190, "trainId": 323}, {"name": "gas pump, gasoline pump, petrol pump, island dispenser", "id": 3131, "trainId": 324}, {"name": "lockers", "id": 1463, "trainId": 325}, {"name": "cage", "id": 357, "trainId": 326}, {"name": "finger", "id": 929, "trainId": 327}, {"name": "bleachers", "id": 209, "trainId": 328}, {"name": "ferris wheel", "id": 912, "trainId": 329}, {"name": "hairdresser chair", "id": 1164, "trainId": 330}, {"name": "mat", "id": 1509, "trainId": 331}, {"name": "stands", "id": 2539, "trainId": 332}, {"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333}, {"name": "streetcar, tram, tramcar, trolley, trolley car", "id": 2615, "trainId": 334}, {"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335}, {"name": "dummy", "id": 818, "trainId": 336}, {"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337}, {"name": "sand trap", "id": 2217, "trainId": 338}, {"name": "shop, store", "id": 2347, "trainId": 339}, {"name": "table cloth", "id": 2686, "trainId": 340}, {"name": "service station", "id": 2300, "trainId": 341}, {"name": "coffin", "id": 572, "trainId": 342}, {"name": "drawer", "id": 789, "trainId": 343}, {"name": "cages", "id": 358, "trainId": 344}, {"name": "slot machine, coin machine", "id": 2443, "trainId": 345}, {"name": "balcony", "id": 101, "trainId": 346}, {"name": "volleyball court", "id": 2969, "trainId": 347}, {"name": "table tennis", "id": 2692, "trainId": 348}, {"name": "control table", "id": 606, "trainId": 349}, {"name": "shirt", "id": 2339, "trainId": 350}, {"name": "merchandise, ware, product", "id": 1533, "trainId": 351}, {"name": "railway", "id": 2060, "trainId": 352}, {"name": "parterre", "id": 1782, "trainId": 353}, {"name": "chimney", "id": 495, "trainId": 354}, {"name": "can, tin, tin can", "id": 371, "trainId": 355}, {"name": "tanks", "id": 2707, 
"trainId": 356}, {"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357}, {"name": "alga, algae", "id": 3156, "trainId": 358}, {"name": "system", "id": 2683, "trainId": 359}, {"name": "map", "id": 1499, "trainId": 360}, {"name": "greenhouse", "id": 1135, "trainId": 361}, {"name": "mug", "id": 1619, "trainId": 362}, {"name": "barbecue", "id": 125, "trainId": 363}, {"name": "trailer", "id": 2838, "trainId": 364}, {"name": "toilet tissue, toilet paper, bathroom tissue", "id": 2792, "trainId": 365}, {"name": "organ", "id": 1695, "trainId": 366}, {"name": "dishrag, dishcloth", "id": 746, "trainId": 367}, {"name": "island", "id": 1343, "trainId": 368}, {"name": "keyboard", "id": 1370, "trainId": 369}, {"name": "trench", "id": 2858, "trainId": 370}, {"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371}, {"name": "steering wheel, wheel", "id": 2565, "trainId": 372}, {"name": "pitcher, ewer", "id": 1892, "trainId": 373}, {"name": "goal", "id": 1103, "trainId": 374}, {"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375}, {"name": "beds", "id": 170, "trainId": 376}, {"name": "wood", "id": 3073, "trainId": 377}, {"name": "file cabinet", "id": 922, "trainId": 378}, {"name": "newspaper, paper", "id": 1655, "trainId": 379}, {"name": "motorboat", "id": 1602, "trainId": 380}, {"name": "rope", "id": 2160, "trainId": 381}, {"name": "guitar", "id": 1151, "trainId": 382}, {"name": "rubble", "id": 2176, "trainId": 383}, {"name": "scarf", "id": 2239, "trainId": 384}, {"name": "barrels", "id": 132, "trainId": 385}, {"name": "cap", "id": 394, "trainId": 386}, {"name": "leaves", "id": 1424, "trainId": 387}, {"name": "control tower", "id": 607, "trainId": 388}, {"name": "dashboard", "id": 700, "trainId": 389}, {"name": "bandstand", "id": 116, "trainId": 390}, {"name": "lectern", "id": 1425, "trainId": 391}, {"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392}, {"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393}, {"name": "shower room", "id": 2360, "trainId": 394}, {"name": "smoke", "id": 2449, "trainId": 395}, {"name": "faucet, spigot", "id": 897, "trainId": 396}, {"name": "bulldozer", "id": 317, "trainId": 397}, {"name": "saucepan", "id": 2228, "trainId": 398}, {"name": "shops", "id": 2351, "trainId": 399}, {"name": "meter", "id": 1543, "trainId": 400}, {"name": "crevasse", "id": 656, "trainId": 401}, {"name": "gear", "id": 1088, "trainId": 402}, {"name": "candelabrum, candelabra", "id": 373, "trainId": 403}, {"name": "sofa bed", "id": 2472, "trainId": 404}, {"name": "tunnel", "id": 2892, "trainId": 405}, {"name": "pallet", "id": 1740, "trainId": 406}, {"name": "wire, conducting wire", "id": 3067, "trainId": 407}, {"name": "kettle, boiler", "id": 1367, "trainId": 408}, {"name": "bidet", "id": 188, "trainId": 409}, { "name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher", "id": 79, "trainId": 410, }, {"name": "music stand", "id": 1633, "trainId": 411}, {"name": "pipe, tube", "id": 1885, "trainId": 412}, {"name": "cup", "id": 677, "trainId": 413}, {"name": "parking meter", "id": 1779, "trainId": 414}, {"name": "ice hockey rink", "id": 1297, "trainId": 415}, {"name": "shelter", "id": 2334, "trainId": 416}, {"name": "weeds", "id": 3027, "trainId": 417}, {"name": "temple", "id": 2735, "trainId": 418}, {"name": "patty, cake", "id": 1791, "trainId": 419}, {"name": "ski slope", "id": 2405, "trainId": 420}, {"name": "panel", "id": 1748, "trainId": 421}, {"name": "wallet", 
"id": 2983, "trainId": 422}, {"name": "wheel", "id": 3035, "trainId": 423}, {"name": "towel rack, towel horse", "id": 2824, "trainId": 424}, {"name": "roundabout", "id": 2168, "trainId": 425}, {"name": "canister, cannister, tin", "id": 385, "trainId": 426}, {"name": "rod", "id": 2148, "trainId": 427}, {"name": "soap dispenser", "id": 2465, "trainId": 428}, {"name": "bell", "id": 175, "trainId": 429}, {"name": "canvas", "id": 390, "trainId": 430}, {"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431}, {"name": "teacup", "id": 2722, "trainId": 432}, {"name": "trellis", "id": 2857, "trainId": 433}, {"name": "workbench", "id": 3088, "trainId": 434}, {"name": "valley, vale", "id": 2926, "trainId": 435}, {"name": "toaster", "id": 2782, "trainId": 436}, {"name": "knife", "id": 1378, "trainId": 437}, {"name": "podium", "id": 1934, "trainId": 438}, {"name": "ramp", "id": 2072, "trainId": 439}, {"name": "tumble dryer", "id": 2889, "trainId": 440}, {"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441}, {"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442}, {"name": "lab bench", "id": 1383, "trainId": 443}, {"name": "equipment", "id": 867, "trainId": 444}, {"name": "rocky formation", "id": 2145, "trainId": 445}, {"name": "plastic", "id": 1915, "trainId": 446}, {"name": "calendar", "id": 361, "trainId": 447}, {"name": "caravan", "id": 402, "trainId": 448}, {"name": "check-in-desk", "id": 482, "trainId": 449}, {"name": "ticket counter", "id": 2761, "trainId": 450}, {"name": "brush", "id": 300, "trainId": 451}, {"name": "mill", "id": 1554, "trainId": 452}, {"name": "covered bridge", "id": 636, "trainId": 453}, {"name": "bowling alley", "id": 260, "trainId": 454}, {"name": "hanger", "id": 1186, "trainId": 455}, {"name": "excavator", "id": 871, "trainId": 456}, {"name": "trestle", "id": 2859, "trainId": 457}, {"name": "revolving door", "id": 2103, "trainId": 458}, {"name": "blast furnace", "id": 208, "trainId": 459}, {"name": "scale, weighing machine", "id": 2236, "trainId": 460}, {"name": "projector", "id": 2012, "trainId": 461}, {"name": "soap", "id": 2462, "trainId": 462}, {"name": "locker", "id": 1462, "trainId": 463}, {"name": "tractor", "id": 2832, "trainId": 464}, {"name": "stretcher", "id": 2617, "trainId": 465}, {"name": "frame", "id": 1024, "trainId": 466}, {"name": "grating", "id": 1129, "trainId": 467}, {"name": "alembic", "id": 18, "trainId": 468}, {"name": "candle, taper, wax light", "id": 376, "trainId": 469}, {"name": "barrier", "id": 134, "trainId": 470}, {"name": "cardboard", "id": 407, "trainId": 471}, {"name": "cave", "id": 434, "trainId": 472}, {"name": "puddle", "id": 2017, "trainId": 473}, {"name": "tarp", "id": 2717, "trainId": 474}, {"name": "price tag", "id": 2005, "trainId": 475}, {"name": "watchtower", "id": 2993, "trainId": 476}, {"name": "meters", "id": 1545, "trainId": 477}, { "name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb", "id": 1445, "trainId": 478, }, {"name": "tracks", "id": 2831, "trainId": 479}, {"name": "hair dryer", "id": 1161, "trainId": 480}, {"name": "skirt", "id": 2411, "trainId": 481}, {"name": "viaduct", "id": 2949, "trainId": 482}, {"name": "paper towel", "id": 1769, "trainId": 483}, {"name": "coat", "id": 552, "trainId": 484}, {"name": "sheet", "id": 2327, "trainId": 485}, {"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486}, {"name": "water wheel", "id": 3013, "trainId": 487}, {"name": "pottery, clayware", "id": 1986, 
"trainId": 488}, {"name": "magazine rack", "id": 1486, "trainId": 489}, {"name": "teapot", "id": 2723, "trainId": 490}, {"name": "microphone, mike", "id": 1549, "trainId": 491}, {"name": "support", "id": 2649, "trainId": 492}, {"name": "forklift", "id": 1020, "trainId": 493}, {"name": "canyon", "id": 392, "trainId": 494}, {"name": "cash register, register", "id": 422, "trainId": 495}, {"name": "leaf, leafage, foliage", "id": 1419, "trainId": 496}, {"name": "remote control, remote", "id": 2099, "trainId": 497}, {"name": "soap dish", "id": 2464, "trainId": 498}, {"name": "windshield, windscreen", "id": 3058, "trainId": 499}, {"name": "cat", "id": 430, "trainId": 500}, {"name": "cue, cue stick, pool cue, pool stick", "id": 675, "trainId": 501}, {"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502}, {"name": "videos", "id": 2955, "trainId": 503}, {"name": "shovel", "id": 2355, "trainId": 504}, {"name": "eaves", "id": 840, "trainId": 505}, {"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506}, {"name": "shipyard", "id": 2338, "trainId": 507}, {"name": "hen, biddy", "id": 1232, "trainId": 508}, {"name": "traffic cone", "id": 2834, "trainId": 509}, {"name": "washing machines", "id": 2991, "trainId": 510}, {"name": "truck crane", "id": 2879, "trainId": 511}, {"name": "cds", "id": 444, "trainId": 512}, {"name": "niche", "id": 1657, "trainId": 513}, {"name": "scoreboard", "id": 2246, "trainId": 514}, {"name": "briefcase", "id": 296, "trainId": 515}, {"name": "boot", "id": 245, "trainId": 516}, {"name": "sweater, jumper", "id": 2661, "trainId": 517}, {"name": "hay", "id": 1202, "trainId": 518}, {"name": "pack", "id": 1714, "trainId": 519}, {"name": "bottle rack", "id": 251, "trainId": 520}, {"name": "glacier", "id": 1095, "trainId": 521}, {"name": "pergola", "id": 1828, "trainId": 522}, {"name": "building materials", "id": 311, "trainId": 523}, {"name": "television camera", "id": 2732, "trainId": 524}, {"name": "first floor", "id": 947, "trainId": 525}, {"name": "rifle", "id": 2115, "trainId": 526}, {"name": "tennis table", "id": 2738, "trainId": 527}, {"name": "stadium", "id": 2525, "trainId": 528}, {"name": "safety belt", "id": 2194, "trainId": 529}, {"name": "cover", "id": 634, "trainId": 530}, {"name": "dish rack", "id": 740, "trainId": 531}, {"name": "synthesizer", "id": 2682, "trainId": 532}, {"name": "pumpkin", "id": 2020, "trainId": 533}, {"name": "gutter", "id": 1156, "trainId": 534}, {"name": "fruit stand", "id": 1036, "trainId": 535}, {"name": "ice floe, floe", "id": 1295, "trainId": 536}, {"name": "handle, grip, handgrip, hold", "id": 1181, "trainId": 537}, {"name": "wheelchair", "id": 3037, "trainId": 538}, {"name": "mousepad, mouse mat", "id": 1614, "trainId": 539}, {"name": "diploma", "id": 736, "trainId": 540}, {"name": "fairground ride", "id": 893, "trainId": 541}, {"name": "radio", "id": 2047, "trainId": 542}, {"name": "hotplate", "id": 1274, "trainId": 543}, {"name": "junk", "id": 1361, "trainId": 544}, {"name": "wheelbarrow", "id": 3036, "trainId": 545}, {"name": "stream", "id": 2606, "trainId": 546}, {"name": "toll plaza", "id": 2797, "trainId": 547}, {"name": "punching bag", "id": 2022, "trainId": 548}, {"name": "trough", "id": 2876, "trainId": 549}, {"name": "throne", "id": 2758, "trainId": 550}, {"name": "chair desk", "id": 472, "trainId": 551}, {"name": "weighbridge", "id": 3028, "trainId": 552}, {"name": "extractor fan", "id": 882, "trainId": 553}, {"name": "hanging clothes", "id": 1189, "trainId": 554}, {"name": "dish, dish aerial, 
dish antenna, saucer", "id": 743, "trainId": 555}, {"name": "alarm clock, alarm", "id": 3122, "trainId": 556}, {"name": "ski lift", "id": 2401, "trainId": 557}, {"name": "chain", "id": 468, "trainId": 558}, {"name": "garage", "id": 1061, "trainId": 559}, {"name": "mechanical shovel", "id": 1523, "trainId": 560}, {"name": "wine rack", "id": 3059, "trainId": 561}, {"name": "tramway", "id": 2843, "trainId": 562}, {"name": "treadmill", "id": 2853, "trainId": 563}, {"name": "menu", "id": 1529, "trainId": 564}, {"name": "block", "id": 214, "trainId": 565}, {"name": "well", "id": 3032, "trainId": 566}, {"name": "witness stand", "id": 3071, "trainId": 567}, {"name": "branch", "id": 277, "trainId": 568}, {"name": "duck", "id": 813, "trainId": 569}, {"name": "casserole", "id": 426, "trainId": 570}, {"name": "frying pan", "id": 1039, "trainId": 571}, {"name": "desk organizer", "id": 727, "trainId": 572}, {"name": "mast", "id": 1508, "trainId": 573}, {"name": "spectacles, specs, eyeglasses, glasses", "id": 2490, "trainId": 574}, {"name": "service elevator", "id": 2299, "trainId": 575}, {"name": "dollhouse", "id": 768, "trainId": 576}, {"name": "hammock", "id": 1172, "trainId": 577}, {"name": "clothes hanging", "id": 537, "trainId": 578}, {"name": "photocopier", "id": 1847, "trainId": 579}, {"name": "notepad", "id": 1664, "trainId": 580}, {"name": "golf cart", "id": 1110, "trainId": 581}, {"name": "footpath", "id": 1014, "trainId": 582}, {"name": "cross", "id": 662, "trainId": 583}, {"name": "baptismal font", "id": 121, "trainId": 584}, {"name": "boiler", "id": 227, "trainId": 585}, {"name": "skip", "id": 2410, "trainId": 586}, {"name": "rotisserie", "id": 2165, "trainId": 587}, {"name": "tables", "id": 2696, "trainId": 588}, {"name": "water mill", "id": 3005, "trainId": 589}, {"name": "helmet", "id": 1231, "trainId": 590}, {"name": "cover curtain", "id": 635, "trainId": 591}, {"name": "brick", "id": 292, "trainId": 592}, {"name": "table runner", "id": 2690, "trainId": 593}, {"name": "ashtray", "id": 65, "trainId": 594}, {"name": "street box", "id": 2607, "trainId": 595}, {"name": "stick", "id": 2574, "trainId": 596}, {"name": "hangers", "id": 1188, "trainId": 597}, {"name": "cells", "id": 456, "trainId": 598}, {"name": "urinal", "id": 2913, "trainId": 599}, {"name": "centerpiece", "id": 459, "trainId": 600}, {"name": "portable fridge", "id": 1955, "trainId": 601}, {"name": "dvds", "id": 827, "trainId": 602}, {"name": "golf club", "id": 1111, "trainId": 603}, {"name": "skirting board", "id": 2412, "trainId": 604}, {"name": "water cooler", "id": 2997, "trainId": 605}, {"name": "clipboard", "id": 528, "trainId": 606}, {"name": "camera, photographic camera", "id": 366, "trainId": 607}, {"name": "pigeonhole", "id": 1863, "trainId": 608}, {"name": "chips", "id": 500, "trainId": 609}, {"name": "food processor", "id": 1001, "trainId": 610}, {"name": "post box", "id": 1958, "trainId": 611}, {"name": "lid", "id": 1441, "trainId": 612}, {"name": "drum", "id": 809, "trainId": 613}, {"name": "blender", "id": 210, "trainId": 614}, {"name": "cave entrance", "id": 435, "trainId": 615}, {"name": "dental chair", "id": 718, "trainId": 616}, {"name": "obelisk", "id": 1674, "trainId": 617}, {"name": "canoe", "id": 388, "trainId": 618}, {"name": "mobile", "id": 1572, "trainId": 619}, {"name": "monitors", "id": 1584, "trainId": 620}, {"name": "pool ball", "id": 1944, "trainId": 621}, {"name": "cue rack", "id": 674, "trainId": 622}, {"name": "baggage carts", "id": 99, "trainId": 623}, {"name": "shore", "id": 2352, "trainId": 
624}, {"name": "fork", "id": 1019, "trainId": 625}, {"name": "paper filer", "id": 1763, "trainId": 626}, {"name": "bicycle rack", "id": 185, "trainId": 627}, {"name": "coat rack", "id": 554, "trainId": 628}, {"name": "garland", "id": 1066, "trainId": 629}, {"name": "sports bag", "id": 2508, "trainId": 630}, {"name": "fish tank", "id": 951, "trainId": 631}, {"name": "towel dispenser", "id": 2822, "trainId": 632}, {"name": "carriage", "id": 415, "trainId": 633}, {"name": "brochure", "id": 297, "trainId": 634}, {"name": "plaque", "id": 1914, "trainId": 635}, {"name": "stringer", "id": 2619, "trainId": 636}, {"name": "iron", "id": 1338, "trainId": 637}, {"name": "spoon", "id": 2505, "trainId": 638}, {"name": "flag pole", "id": 955, "trainId": 639}, {"name": "toilet brush", "id": 2786, "trainId": 640}, {"name": "book stand", "id": 238, "trainId": 641}, {"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642}, {"name": "ticket office", "id": 2763, "trainId": 643}, {"name": "broom", "id": 299, "trainId": 644}, {"name": "dvd", "id": 822, "trainId": 645}, {"name": "ice bucket", "id": 1288, "trainId": 646}, {"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647}, {"name": "tureen", "id": 2894, "trainId": 648}, {"name": "folders", "id": 992, "trainId": 649}, {"name": "chess", "id": 489, "trainId": 650}, {"name": "root", "id": 2157, "trainId": 651}, {"name": "sewing machine", "id": 2309, "trainId": 652}, {"name": "model", "id": 1576, "trainId": 653}, {"name": "pen", "id": 1810, "trainId": 654}, {"name": "violin", "id": 2964, "trainId": 655}, {"name": "sweatshirt", "id": 2662, "trainId": 656}, {"name": "recycling materials", "id": 2087, "trainId": 657}, {"name": "mitten", "id": 1569, "trainId": 658}, {"name": "chopping board, cutting board", "id": 503, "trainId": 659}, {"name": "mask", "id": 1505, "trainId": 660}, {"name": "log", "id": 1468, "trainId": 661}, {"name": "mouse, computer mouse", "id": 1613, "trainId": 662}, {"name": "grill", "id": 1138, "trainId": 663}, {"name": "hole", "id": 1256, "trainId": 664}, {"name": "target", "id": 2715, "trainId": 665}, {"name": "trash bag", "id": 2846, "trainId": 666}, {"name": "chalk", "id": 477, "trainId": 667}, {"name": "sticks", "id": 2576, "trainId": 668}, {"name": "balloon", "id": 108, "trainId": 669}, {"name": "score", "id": 2245, "trainId": 670}, {"name": "hair spray", "id": 1162, "trainId": 671}, {"name": "roll", "id": 2149, "trainId": 672}, {"name": "runner", "id": 2183, "trainId": 673}, {"name": "engine", "id": 858, "trainId": 674}, {"name": "inflatable glove", "id": 1324, "trainId": 675}, {"name": "games", "id": 1055, "trainId": 676}, {"name": "pallets", "id": 1741, "trainId": 677}, {"name": "baskets", "id": 149, "trainId": 678}, {"name": "coop", "id": 615, "trainId": 679}, {"name": "dvd player", "id": 825, "trainId": 680}, {"name": "rocking horse", "id": 2143, "trainId": 681}, {"name": "buckets", "id": 304, "trainId": 682}, {"name": "bread rolls", "id": 283, "trainId": 683}, {"name": "shawl", "id": 2322, "trainId": 684}, {"name": "watering can", "id": 3017, "trainId": 685}, {"name": "spotlights", "id": 2510, "trainId": 686}, {"name": "post-it", "id": 1960, "trainId": 687}, {"name": "bowls", "id": 265, "trainId": 688}, {"name": "security camera", "id": 2282, "trainId": 689}, {"name": "runner cloth", "id": 2184, "trainId": 690}, {"name": "lock", "id": 1461, "trainId": 691}, {"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692}, {"name": "side", "id": 2372, "trainId": 693}, {"name": "roulette", 
"id": 2166, "trainId": 694}, {"name": "bone", "id": 232, "trainId": 695}, {"name": "cutlery", "id": 693, "trainId": 696}, {"name": "pool balls", "id": 1945, "trainId": 697}, {"name": "wheels", "id": 3039, "trainId": 698}, {"name": "spice rack", "id": 2494, "trainId": 699}, {"name": "plant pots", "id": 1908, "trainId": 700}, {"name": "towel ring", "id": 2827, "trainId": 701}, {"name": "bread box", "id": 280, "trainId": 702}, {"name": "video", "id": 2950, "trainId": 703}, {"name": "funfair", "id": 1044, "trainId": 704}, {"name": "breads", "id": 288, "trainId": 705}, {"name": "tripod", "id": 2863, "trainId": 706}, {"name": "ironing board", "id": 1342, "trainId": 707}, {"name": "skimmer", "id": 2409, "trainId": 708}, {"name": "hollow", "id": 1258, "trainId": 709}, {"name": "scratching post", "id": 2249, "trainId": 710}, {"name": "tricycle", "id": 2862, "trainId": 711}, {"name": "file box", "id": 920, "trainId": 712}, {"name": "mountain pass", "id": 1607, "trainId": 713}, {"name": "tombstones", "id": 2802, "trainId": 714}, {"name": "cooker", "id": 610, "trainId": 715}, {"name": "card game, cards", "id": 3129, "trainId": 716}, {"name": "golf bag", "id": 1108, "trainId": 717}, {"name": "towel paper", "id": 2823, "trainId": 718}, {"name": "chaise lounge", "id": 476, "trainId": 719}, {"name": "sun", "id": 2641, "trainId": 720}, {"name": "toilet paper holder", "id": 2788, "trainId": 721}, {"name": "rake", "id": 2070, "trainId": 722}, {"name": "key", "id": 1368, "trainId": 723}, {"name": "umbrella stand", "id": 2903, "trainId": 724}, {"name": "dartboard", "id": 699, "trainId": 725}, {"name": "transformer", "id": 2844, "trainId": 726}, {"name": "fireplace utensils", "id": 942, "trainId": 727}, {"name": "sweatshirts", "id": 2663, "trainId": 728}, { "name": "cellular telephone, cellular phone, cellphone, cell, mobile phone", "id": 457, "trainId": 729, }, {"name": "tallboy", "id": 2701, "trainId": 730}, {"name": "stapler", "id": 2540, "trainId": 731}, {"name": "sauna", "id": 2231, "trainId": 732}, {"name": "test tube", "id": 2746, "trainId": 733}, {"name": "palette", "id": 1738, "trainId": 734}, {"name": "shopping carts", "id": 2350, "trainId": 735}, {"name": "tools", "id": 2808, "trainId": 736}, {"name": "push button, push, button", "id": 2025, "trainId": 737}, {"name": "star", "id": 2541, "trainId": 738}, {"name": "roof rack", "id": 2156, "trainId": 739}, {"name": "barbed wire", "id": 126, "trainId": 740}, {"name": "spray", "id": 2512, "trainId": 741}, {"name": "ear", "id": 831, "trainId": 742}, {"name": "sponge", "id": 2503, "trainId": 743}, {"name": "racket", "id": 2039, "trainId": 744}, {"name": "tins", "id": 2774, "trainId": 745}, {"name": "eyeglasses", "id": 886, "trainId": 746}, {"name": "file", "id": 919, "trainId": 747}, {"name": "scarfs", "id": 2240, "trainId": 748}, {"name": "sugar bowl", "id": 2636, "trainId": 749}, {"name": "flip flop", "id": 963, "trainId": 750}, {"name": "headstones", "id": 1218, "trainId": 751}, {"name": "laptop bag", "id": 1406, "trainId": 752}, {"name": "leash", "id": 1420, "trainId": 753}, {"name": "climbing frame", "id": 526, "trainId": 754}, {"name": "suit hanger", "id": 2639, "trainId": 755}, {"name": "floor spotlight", "id": 975, "trainId": 756}, {"name": "plate rack", "id": 1921, "trainId": 757}, {"name": "sewer", "id": 2305, "trainId": 758}, {"name": "hard drive", "id": 1193, "trainId": 759}, {"name": "sprinkler", "id": 2517, "trainId": 760}, {"name": "tools box", "id": 2809, "trainId": 761}, {"name": "necklace", "id": 1647, "trainId": 762}, {"name": "bulbs", 
"id": 314, "trainId": 763}, {"name": "steel industry", "id": 2560, "trainId": 764}, {"name": "club", "id": 545, "trainId": 765}, {"name": "jack", "id": 1345, "trainId": 766}, {"name": "door bars", "id": 775, "trainId": 767}, { "name": "control panel, instrument panel, control board, board, panel", "id": 603, "trainId": 768, }, {"name": "hairbrush", "id": 1163, "trainId": 769}, {"name": "napkin holder", "id": 1641, "trainId": 770}, {"name": "office", "id": 1678, "trainId": 771}, {"name": "smoke detector", "id": 2450, "trainId": 772}, {"name": "utensils", "id": 2915, "trainId": 773}, {"name": "apron", "id": 42, "trainId": 774}, {"name": "scissors", "id": 2242, "trainId": 775}, {"name": "terminal", "id": 2741, "trainId": 776}, {"name": "grinder", "id": 1143, "trainId": 777}, {"name": "entry phone", "id": 862, "trainId": 778}, {"name": "newspaper stand", "id": 1654, "trainId": 779}, {"name": "pepper shaker", "id": 1826, "trainId": 780}, {"name": "onions", "id": 1689, "trainId": 781}, { "name": "central processing unit, cpu, c p u , central processor, processor, mainframe", "id": 3124, "trainId": 782, }, {"name": "tape", "id": 2710, "trainId": 783}, {"name": "bat", "id": 152, "trainId": 784}, {"name": "coaster", "id": 549, "trainId": 785}, {"name": "calculator", "id": 360, "trainId": 786}, {"name": "potatoes", "id": 1982, "trainId": 787}, {"name": "luggage rack", "id": 1478, "trainId": 788}, {"name": "salt", "id": 2203, "trainId": 789}, {"name": "street number", "id": 2612, "trainId": 790}, {"name": "viewpoint", "id": 2956, "trainId": 791}, {"name": "sword", "id": 2681, "trainId": 792}, {"name": "cd", "id": 437, "trainId": 793}, {"name": "rowing machine", "id": 2171, "trainId": 794}, {"name": "plug", "id": 1933, "trainId": 795}, {"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796}, {"name": "pepper", "id": 1824, "trainId": 797}, {"name": "tongs", "id": 2803, "trainId": 798}, {"name": "bonfire", "id": 234, "trainId": 799}, {"name": "dog dish", "id": 764, "trainId": 800}, {"name": "belt", "id": 177, "trainId": 801}, {"name": "dumbbells", "id": 817, "trainId": 802}, {"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803}, {"name": "hook", "id": 1262, "trainId": 804}, {"name": "envelopes", "id": 864, "trainId": 805}, {"name": "shower faucet", "id": 2359, "trainId": 806}, {"name": "watch", "id": 2992, "trainId": 807}, {"name": "padlock", "id": 1725, "trainId": 808}, {"name": "swimming pool ladder", "id": 2667, "trainId": 809}, {"name": "spanners", "id": 2484, "trainId": 810}, {"name": "gravy boat", "id": 1133, "trainId": 811}, {"name": "notice board", "id": 1667, "trainId": 812}, {"name": "trash bags", "id": 2847, "trainId": 813}, {"name": "fire alarm", "id": 932, "trainId": 814}, {"name": "ladle", "id": 1392, "trainId": 815}, {"name": "stethoscope", "id": 2573, "trainId": 816}, {"name": "rocket", "id": 2140, "trainId": 817}, {"name": "funnel", "id": 1046, "trainId": 818}, {"name": "bowling pins", "id": 264, "trainId": 819}, {"name": "valve", "id": 2927, "trainId": 820}, {"name": "thermometer", "id": 2752, "trainId": 821}, {"name": "cups", "id": 679, "trainId": 822}, {"name": "spice jar", "id": 2493, "trainId": 823}, {"name": "night light", "id": 1658, "trainId": 824}, {"name": "soaps", "id": 2466, "trainId": 825}, {"name": "games table", "id": 1057, "trainId": 826}, {"name": "slotted spoon", "id": 2444, "trainId": 827}, {"name": "reel", "id": 2093, "trainId": 828}, {"name": "scourer", "id": 2248, "trainId": 829}, {"name": "sleeping robe", "id": 2432, "trainId": 830}, 
{"name": "desk mat", "id": 726, "trainId": 831}, {"name": "dumbbell", "id": 816, "trainId": 832}, {"name": "hammer", "id": 1171, "trainId": 833}, {"name": "tie", "id": 2766, "trainId": 834}, {"name": "typewriter", "id": 2900, "trainId": 835}, {"name": "shaker", "id": 2313, "trainId": 836}, {"name": "cheese dish", "id": 488, "trainId": 837}, {"name": "sea star", "id": 2265, "trainId": 838}, {"name": "racquet", "id": 2043, "trainId": 839}, {"name": "butane gas cylinder", "id": 332, "trainId": 840}, {"name": "paper weight", "id": 1771, "trainId": 841}, {"name": "shaving brush", "id": 2320, "trainId": 842}, {"name": "sunglasses", "id": 2646, "trainId": 843}, {"name": "gear shift", "id": 1089, "trainId": 844}, {"name": "towel rail", "id": 2826, "trainId": 845}, {"name": "adding machine, totalizer, totaliser", "id": 3148, "trainId": 846}, ] def _get_ade20k_full_meta(): # Id 0 is reserved for ignore_label, we change ignore_label for 0 # to 255 in our pre-processing, so all ids are shifted by 1. stuff_ids = [k["id"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES] assert len(stuff_ids) == 847, len(stuff_ids) # For semantic segmentation, this mapping maps from contiguous stuff id # (in [0, 91], used in models) to ids in the dataset (used for processing results) stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} stuff_classes = [k["name"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES] ret = { "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, "stuff_classes": stuff_classes, } return ret def register_all_ade20k_full(root): root = os.path.join(root, "ADE20K_2021_17_01") meta = _get_ade20k_full_meta() for name, dirname in [("train", "training"), ("val", "validation")]: image_dir = os.path.join(root, "images_detectron2", dirname) gt_dir = os.path.join(root, "annotations_detectron2", dirname) name = f"ade20k_full_sem_seg_{name}" DatasetCatalog.register( name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="tif", image_ext="jpg") ) MetadataCatalog.get(name).set( stuff_classes=meta["stuff_classes"][:], image_root=image_dir, sem_seg_root=gt_dir, evaluator_type="sem_seg", ignore_label=65535, # NOTE: gt is saved in 16-bit TIFF images ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_ade20k_full(_root) ================================================ FILE: mfvis_nococo/mask2former/data/datasets/register_ade20k_instance.py ================================================ import json import logging import numpy as np import os from PIL import Image from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data.datasets.coco import load_coco_json, register_coco_instances from detectron2.utils.file_io import PathManager ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, 
{'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] _PREDEFINED_SPLITS = { # point annotations without masks "ade20k_instance_train": ( "ADEChallengeData2016/images/training", "ADEChallengeData2016/ade20k_instance_train.json", ), "ade20k_instance_val": ( "ADEChallengeData2016/images/validation", "ADEChallengeData2016/ade20k_instance_val.json", ), } def _get_ade_instances_meta(): thing_ids = [k["id"] for k in ADE_CATEGORIES] assert len(thing_ids) == 100, len(thing_ids) # Mapping from the incontiguous ADE category id to an id in [0, 99] thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} thing_classes = [k["name"] for k in ADE_CATEGORIES] ret = { "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes, } return ret def register_all_ade20k_instance(root): for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): # Assume pre-defined datasets live in `./datasets`. 
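# register_coco_instances (from detectron2.data.datasets) wires up both catalogs in one call:
# it registers a DatasetCatalog loader built on load_coco_json for this split, and it fills
# MetadataCatalog.get(key) with the metadata dict above plus json_file, image_root and
# evaluator_type="coco", so these ADE20K instance splits can be consumed like the built-in COCO splits.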
register_coco_instances( key, _get_ade_instances_meta(), os.path.join(root, json_file) if "://" not in json_file else json_file, os.path.join(root, image_root), ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_ade20k_instance(_root) ================================================ FILE: mfvis_nococo/mask2former/data/datasets/register_ade20k_panoptic.py ================================================ import json import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.utils.file_io import PathManager ADE20K_150_CATEGORIES = [ {"color": [120, 120, 120], "id": 0, "isthing": 0, "name": "wall"}, {"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"}, {"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"}, {"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"}, {"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"}, {"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"}, {"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"}, {"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"}, {"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "}, {"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"}, {"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"}, {"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"}, {"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"}, {"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"}, {"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"}, {"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"}, {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"}, {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"}, {"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"}, {"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"}, {"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"}, {"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"}, {"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"}, {"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"}, {"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"}, {"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"}, {"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"}, {"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"}, {"color": [255, 9, 92], "id": 28, "isthing": 0, "name": "rug"}, {"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"}, {"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"}, {"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"}, {"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"}, {"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"}, {"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"}, {"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"}, {"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"}, {"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"}, {"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"}, {"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"}, {"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"}, {"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"}, {"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"}, {"color": 
[255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"}, { "color": [6, 51, 255], "id": 44, "isthing": 1, "name": "chest of drawers, chest, bureau, dresser", }, {"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"}, {"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"}, {"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"}, {"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"}, {"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"}, {"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"}, {"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"}, {"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"}, {"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"}, {"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"}, {"color": [0, 0, 255], "id": 55, "isthing": 1, "name": "case, display case, showcase, vitrine"}, { "color": [255, 71, 0], "id": 56, "isthing": 1, "name": "pool table, billiard table, snooker table", }, {"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"}, {"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"}, {"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"}, {"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"}, {"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"}, {"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"}, {"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"}, {"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"}, { "color": [0, 255, 133], "id": 65, "isthing": 1, "name": "toilet, can, commode, crapper, pot, potty, stool, throne", }, {"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"}, {"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"}, {"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"}, {"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"}, {"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"}, {"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"}, {"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"}, {"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"}, {"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"}, {"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"}, {"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"}, {"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"}, {"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"}, {"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"}, {"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"}, {"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"}, {"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"}, {"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"}, {"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"}, {"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"}, {"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"}, {"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"}, {"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"}, {"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"}, {"color": [0, 255, 82], "id": 90, "isthing": 1, "name": 
"plane"}, {"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"}, {"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"}, {"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"}, {"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"}, { "color": [0, 122, 255], "id": 95, "isthing": 1, "name": "bannister, banister, balustrade, balusters, handrail", }, { "color": [0, 255, 163], "id": 96, "isthing": 0, "name": "escalator, moving staircase, moving stairway", }, { "color": [255, 153, 0], "id": 97, "isthing": 1, "name": "ottoman, pouf, pouffe, puff, hassock", }, {"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"}, {"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"}, { "color": [143, 255, 0], "id": 100, "isthing": 0, "name": "poster, posting, placard, notice, bill, card", }, {"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"}, {"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"}, {"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"}, {"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"}, { "color": [133, 0, 255], "id": 105, "isthing": 0, "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", }, {"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"}, { "color": [184, 0, 255], "id": 107, "isthing": 1, "name": "washer, automatic washer, washing machine", }, {"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"}, {"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"}, {"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"}, {"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"}, {"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"}, {"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"}, {"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"}, {"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"}, {"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"}, {"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"}, {"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"}, {"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"}, {"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"}, {"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"}, {"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"}, {"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"}, {"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"}, {"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"}, {"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"}, {"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"}, {"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"}, {"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"}, {"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"}, {"color": [20, 0, 255], "id": 131, "isthing": 0, "name": "blanket, cover"}, {"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"}, {"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"}, {"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"}, {"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"}, {"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"}, 
{"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"}, {"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"}, {"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"}, {"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"}, {"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"}, {"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"}, {"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"}, {"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"}, {"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"}, {"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"}, {"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"}, {"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"}, {"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"}, ] ADE20k_COLORS = [k["color"] for k in ADE20K_150_CATEGORIES] MetadataCatalog.get("ade20k_sem_seg_train").set( stuff_colors=ADE20k_COLORS[:], ) MetadataCatalog.get("ade20k_sem_seg_val").set( stuff_colors=ADE20k_COLORS[:], ) def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): """ Args: image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". Returns: list[dict]: a list of dicts in Detectron2 standard format. (See `Using Custom Datasets `_ ) """ def _convert_category_id(segment_info, meta): if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = True else: segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = False return segment_info with PathManager.open(json_file) as f: json_info = json.load(f) ret = [] for ann in json_info["annotations"]: image_id = ann["image_id"] # TODO: currently we assume image and label has the same filename but # different extension, and images have extension ".jpg" for COCO. Need # to make image extension a user-provided argument if we extend this # function to support other COCO-like datasets. image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") label_file = os.path.join(gt_dir, ann["file_name"]) sem_label_file = os.path.join(semseg_dir, ann["file_name"]) segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] ret.append( { "file_name": image_file, "image_id": image_id, "pan_seg_file_name": label_file, "sem_seg_file_name": sem_label_file, "segments_info": segments_info, } ) assert len(ret), f"No images found in {image_dir}!" assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] return ret def register_ade20k_panoptic( name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None ): """ Register a "standard" version of ADE20k panoptic segmentation dataset named `name`. The dictionaries in this registered dataset follows detectron2's standard format. Hence it's called "standard". Args: name (str): the name that identifies a dataset, e.g. 
"ade20k_panoptic_train" metadata (dict): extra metadata associated with this dataset. image_root (str): directory which contains all the images panoptic_root (str): directory which contains panoptic annotation images in COCO format panoptic_json (str): path to the json panoptic annotation file in COCO format sem_seg_root (none): not used, to be consistent with `register_coco_panoptic_separated`. instances_json (str): path to the json instance annotation file """ panoptic_name = name DatasetCatalog.register( panoptic_name, lambda: load_ade20k_panoptic_json( panoptic_json, image_root, panoptic_root, semantic_root, metadata ), ) MetadataCatalog.get(panoptic_name).set( panoptic_root=panoptic_root, image_root=image_root, panoptic_json=panoptic_json, json_file=instances_json, evaluator_type="ade20k_panoptic_seg", ignore_label=255, label_divisor=1000, **metadata, ) _PREDEFINED_SPLITS_ADE20K_PANOPTIC = { "ade20k_panoptic_train": ( "ADEChallengeData2016/images/training", "ADEChallengeData2016/ade20k_panoptic_train", "ADEChallengeData2016/ade20k_panoptic_train.json", "ADEChallengeData2016/annotations_detectron2/training", "ADEChallengeData2016/ade20k_instance_train.json", ), "ade20k_panoptic_val": ( "ADEChallengeData2016/images/validation", "ADEChallengeData2016/ade20k_panoptic_val", "ADEChallengeData2016/ade20k_panoptic_val.json", "ADEChallengeData2016/annotations_detectron2/validation", "ADEChallengeData2016/ade20k_instance_val.json", ), } def get_metadata(): meta = {} # The following metadata maps contiguous id from [0, #thing categories + # #stuff categories) to their names and colors. We have to replica of the # same name and color under "thing_*" and "stuff_*" because the current # visualization function in D2 handles thing and class classes differently # due to some heuristic used in Panoptic FPN. We keep the same naming to # enable reusing existing visualization functions. thing_classes = [k["name"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] thing_colors = [k["color"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] stuff_classes = [k["name"] for k in ADE20K_150_CATEGORIES] stuff_colors = [k["color"] for k in ADE20K_150_CATEGORIES] meta["thing_classes"] = thing_classes meta["thing_colors"] = thing_colors meta["stuff_classes"] = stuff_classes meta["stuff_colors"] = stuff_colors # Convert category id for training: # category id: like semantic segmentation, it is the class id for each # pixel. Since there are some classes not used in evaluation, the category # id is not always contiguous and thus we have two set of category ids: # - original category id: category id in the original dataset, mainly # used for evaluation. # - contiguous category id: [0, #classes), in order to train the linear # softmax classifier. thing_dataset_id_to_contiguous_id = {} stuff_dataset_id_to_contiguous_id = {} for i, cat in enumerate(ADE20K_150_CATEGORIES): if cat["isthing"]: thing_dataset_id_to_contiguous_id[cat["id"]] = i # else: # stuff_dataset_id_to_contiguous_id[cat["id"]] = i # in order to use sem_seg evaluator stuff_dataset_id_to_contiguous_id[cat["id"]] = i meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id return meta def register_all_ade20k_panoptic(root): metadata = get_metadata() for ( prefix, (image_root, panoptic_root, panoptic_json, semantic_root, instance_json), ) in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items(): # The "standard" version of COCO panoptic segmentation dataset, # e.g. 
used by Panoptic-DeepLab register_ade20k_panoptic( prefix, metadata, os.path.join(root, image_root), os.path.join(root, panoptic_root), os.path.join(root, semantic_root), os.path.join(root, panoptic_json), os.path.join(root, instance_json), ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_ade20k_panoptic(_root) ================================================ FILE: mfvis_nococo/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py ================================================ import json import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data.datasets import load_sem_seg from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES from detectron2.utils.file_io import PathManager _PREDEFINED_SPLITS_COCO_PANOPTIC = { "coco_2017_train_panoptic": ( # This is the original panoptic annotation directory "coco/panoptic_train2017", "coco/annotations/panoptic_train2017.json", # This directory contains semantic annotations that are # converted from panoptic annotations. # It is used by PanopticFPN. # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py # to create these directories. "coco/panoptic_semseg_train2017", ), "coco_2017_val_panoptic": ( "coco/panoptic_val2017", "coco/annotations/panoptic_val2017.json", "coco/panoptic_semseg_val2017", ), } def get_metadata(): meta = {} # The following metadata maps contiguous id from [0, #thing categories + # #stuff categories) to their names and colors. We have to replica of the # same name and color under "thing_*" and "stuff_*" because the current # visualization function in D2 handles thing and class classes differently # due to some heuristic used in Panoptic FPN. We keep the same naming to # enable reusing existing visualization functions. thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] stuff_classes = [k["name"] for k in COCO_CATEGORIES] stuff_colors = [k["color"] for k in COCO_CATEGORIES] meta["thing_classes"] = thing_classes meta["thing_colors"] = thing_colors meta["stuff_classes"] = stuff_classes meta["stuff_colors"] = stuff_colors # Convert category id for training: # category id: like semantic segmentation, it is the class id for each # pixel. Since there are some classes not used in evaluation, the category # id is not always contiguous and thus we have two set of category ids: # - original category id: category id in the original dataset, mainly # used for evaluation. # - contiguous category id: [0, #classes), in order to train the linear # softmax classifier. thing_dataset_id_to_contiguous_id = {} stuff_dataset_id_to_contiguous_id = {} for i, cat in enumerate(COCO_CATEGORIES): if cat["isthing"]: thing_dataset_id_to_contiguous_id[cat["id"]] = i # else: # stuff_dataset_id_to_contiguous_id[cat["id"]] = i # in order to use sem_seg evaluator stuff_dataset_id_to_contiguous_id[cat["id"]] = i meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id return meta def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): """ Args: image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". Returns: list[dict]: a list of dicts in Detectron2 standard format. 
(See `Using Custom Datasets `_ ) """ def _convert_category_id(segment_info, meta): if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = True else: segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = False return segment_info with PathManager.open(json_file) as f: json_info = json.load(f) ret = [] for ann in json_info["annotations"]: image_id = int(ann["image_id"]) # TODO: currently we assume image and label has the same filename but # different extension, and images have extension ".jpg" for COCO. Need # to make image extension a user-provided argument if we extend this # function to support other COCO-like datasets. image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") label_file = os.path.join(gt_dir, ann["file_name"]) sem_label_file = os.path.join(semseg_dir, ann["file_name"]) segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] ret.append( { "file_name": image_file, "image_id": image_id, "pan_seg_file_name": label_file, "sem_seg_file_name": sem_label_file, "segments_info": segments_info, } ) assert len(ret), f"No images found in {image_dir}!" assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] return ret def register_coco_panoptic_annos_sem_seg( name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json ): panoptic_name = name delattr(MetadataCatalog.get(panoptic_name), "thing_classes") delattr(MetadataCatalog.get(panoptic_name), "thing_colors") MetadataCatalog.get(panoptic_name).set( thing_classes=metadata["thing_classes"], thing_colors=metadata["thing_colors"], # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"], ) # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg" semantic_name = name + "_with_sem_seg" DatasetCatalog.register( semantic_name, lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata), ) MetadataCatalog.get(semantic_name).set( sem_seg_root=sem_seg_root, panoptic_root=panoptic_root, image_root=image_root, panoptic_json=panoptic_json, json_file=instances_json, evaluator_type="coco_panoptic_seg", ignore_label=255, label_divisor=1000, **metadata, ) def register_all_coco_panoptic_annos_sem_seg(root): for ( prefix, (panoptic_root, panoptic_json, semantic_root), ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): prefix_instances = prefix[: -len("_panoptic")] instances_meta = MetadataCatalog.get(prefix_instances) image_root, instances_json = instances_meta.image_root, instances_meta.json_file register_coco_panoptic_annos_sem_seg( prefix, get_metadata(), image_root, os.path.join(root, panoptic_root), os.path.join(root, panoptic_json), os.path.join(root, semantic_root), instances_json, ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_coco_panoptic_annos_sem_seg(_root) ================================================ FILE: mfvis_nococo/mask2former/data/datasets/register_coco_stuff_10k.py ================================================ import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data.datasets import load_sem_seg 
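# --- Illustrative usage sketch (not part of the original sources) ------------
# The preceding register_coco_panoptic_annos_semseg module registers panoptic
# splits with extra semantic ground truth under a "_with_sem_seg" suffix.
# Assuming the DETECTRON2_DATASETS directories have been prepared, the splits
# could be queried roughly like this:
#
#   from detectron2.data import DatasetCatalog, MetadataCatalog
#   dicts = DatasetCatalog.get("coco_2017_val_panoptic_with_sem_seg")
#   meta = MetadataCatalog.get("coco_2017_val_panoptic_with_sem_seg")
#   print(meta.evaluator_type, meta.ignore_label, meta.label_divisor)
#   # -> coco_panoptic_seg 255 1000
# ------------------------------------------------------------------------------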
COCO_CATEGORIES = [ {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, {"color": [179, 0, 194], 
"isthing": 1, "id": 57, "name": "carrot"}, {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, {"id": 92, "name": "banner", "supercategory": "textile"}, {"id": 93, "name": "blanket", "supercategory": "textile"}, {"id": 94, "name": "branch", "supercategory": "plant"}, {"id": 95, "name": "bridge", "supercategory": "building"}, {"id": 96, "name": "building-other", "supercategory": "building"}, {"id": 97, "name": "bush", "supercategory": "plant"}, {"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"}, {"id": 99, "name": "cage", "supercategory": "structural"}, {"id": 100, "name": "cardboard", "supercategory": "raw-material"}, {"id": 101, "name": "carpet", "supercategory": "floor"}, {"id": 102, "name": "ceiling-other", "supercategory": "ceiling"}, {"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"}, {"id": 104, "name": "cloth", "supercategory": "textile"}, {"id": 105, "name": "clothes", "supercategory": "textile"}, {"id": 106, "name": "clouds", "supercategory": "sky"}, {"id": 107, "name": "counter", "supercategory": "furniture-stuff"}, {"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"}, {"id": 109, "name": "curtain", "supercategory": "textile"}, {"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"}, {"id": 111, "name": "dirt", "supercategory": "ground"}, {"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"}, {"id": 113, "name": "fence", "supercategory": "structural"}, {"id": 114, "name": "floor-marble", "supercategory": "floor"}, {"id": 115, "name": "floor-other", "supercategory": "floor"}, {"id": 116, "name": "floor-stone", "supercategory": "floor"}, {"id": 117, "name": 
"floor-tile", "supercategory": "floor"}, {"id": 118, "name": "floor-wood", "supercategory": "floor"}, {"id": 119, "name": "flower", "supercategory": "plant"}, {"id": 120, "name": "fog", "supercategory": "water"}, {"id": 121, "name": "food-other", "supercategory": "food-stuff"}, {"id": 122, "name": "fruit", "supercategory": "food-stuff"}, {"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"}, {"id": 124, "name": "grass", "supercategory": "plant"}, {"id": 125, "name": "gravel", "supercategory": "ground"}, {"id": 126, "name": "ground-other", "supercategory": "ground"}, {"id": 127, "name": "hill", "supercategory": "solid"}, {"id": 128, "name": "house", "supercategory": "building"}, {"id": 129, "name": "leaves", "supercategory": "plant"}, {"id": 130, "name": "light", "supercategory": "furniture-stuff"}, {"id": 131, "name": "mat", "supercategory": "textile"}, {"id": 132, "name": "metal", "supercategory": "raw-material"}, {"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"}, {"id": 134, "name": "moss", "supercategory": "plant"}, {"id": 135, "name": "mountain", "supercategory": "solid"}, {"id": 136, "name": "mud", "supercategory": "ground"}, {"id": 137, "name": "napkin", "supercategory": "textile"}, {"id": 138, "name": "net", "supercategory": "structural"}, {"id": 139, "name": "paper", "supercategory": "raw-material"}, {"id": 140, "name": "pavement", "supercategory": "ground"}, {"id": 141, "name": "pillow", "supercategory": "textile"}, {"id": 142, "name": "plant-other", "supercategory": "plant"}, {"id": 143, "name": "plastic", "supercategory": "raw-material"}, {"id": 144, "name": "platform", "supercategory": "ground"}, {"id": 145, "name": "playingfield", "supercategory": "ground"}, {"id": 146, "name": "railing", "supercategory": "structural"}, {"id": 147, "name": "railroad", "supercategory": "ground"}, {"id": 148, "name": "river", "supercategory": "water"}, {"id": 149, "name": "road", "supercategory": "ground"}, {"id": 150, "name": "rock", "supercategory": "solid"}, {"id": 151, "name": "roof", "supercategory": "building"}, {"id": 152, "name": "rug", "supercategory": "textile"}, {"id": 153, "name": "salad", "supercategory": "food-stuff"}, {"id": 154, "name": "sand", "supercategory": "ground"}, {"id": 155, "name": "sea", "supercategory": "water"}, {"id": 156, "name": "shelf", "supercategory": "furniture-stuff"}, {"id": 157, "name": "sky-other", "supercategory": "sky"}, {"id": 158, "name": "skyscraper", "supercategory": "building"}, {"id": 159, "name": "snow", "supercategory": "ground"}, {"id": 160, "name": "solid-other", "supercategory": "solid"}, {"id": 161, "name": "stairs", "supercategory": "furniture-stuff"}, {"id": 162, "name": "stone", "supercategory": "solid"}, {"id": 163, "name": "straw", "supercategory": "plant"}, {"id": 164, "name": "structural-other", "supercategory": "structural"}, {"id": 165, "name": "table", "supercategory": "furniture-stuff"}, {"id": 166, "name": "tent", "supercategory": "building"}, {"id": 167, "name": "textile-other", "supercategory": "textile"}, {"id": 168, "name": "towel", "supercategory": "textile"}, {"id": 169, "name": "tree", "supercategory": "plant"}, {"id": 170, "name": "vegetable", "supercategory": "food-stuff"}, {"id": 171, "name": "wall-brick", "supercategory": "wall"}, {"id": 172, "name": "wall-concrete", "supercategory": "wall"}, {"id": 173, "name": "wall-other", "supercategory": "wall"}, {"id": 174, "name": "wall-panel", "supercategory": "wall"}, {"id": 175, "name": "wall-stone", "supercategory": "wall"}, {"id": 
176, "name": "wall-tile", "supercategory": "wall"}, {"id": 177, "name": "wall-wood", "supercategory": "wall"}, {"id": 178, "name": "water-other", "supercategory": "water"}, {"id": 179, "name": "waterdrops", "supercategory": "water"}, {"id": 180, "name": "window-blind", "supercategory": "window"}, {"id": 181, "name": "window-other", "supercategory": "window"}, {"id": 182, "name": "wood", "supercategory": "solid"}, ] def _get_coco_stuff_meta(): # Id 0 is reserved for ignore_label, we change ignore_label for 0 # to 255 in our pre-processing. stuff_ids = [k["id"] for k in COCO_CATEGORIES] assert len(stuff_ids) == 171, len(stuff_ids) # For semantic segmentation, this mapping maps from contiguous stuff id # (in [0, 91], used in models) to ids in the dataset (used for processing results) stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} stuff_classes = [k["name"] for k in COCO_CATEGORIES] ret = { "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, "stuff_classes": stuff_classes, } return ret def register_all_coco_stuff_10k(root): root = os.path.join(root, "coco", "coco_stuff_10k") meta = _get_coco_stuff_meta() for name, image_dirname, sem_seg_dirname in [ ("train", "images_detectron2/train", "annotations_detectron2/train"), ("test", "images_detectron2/test", "annotations_detectron2/test"), ]: image_dir = os.path.join(root, image_dirname) gt_dir = os.path.join(root, sem_seg_dirname) name = f"coco_2017_{name}_stuff_10k_sem_seg" DatasetCatalog.register( name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") ) MetadataCatalog.get(name).set( image_root=image_dir, sem_seg_root=gt_dir, evaluator_type="sem_seg", ignore_label=255, **meta, ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_coco_stuff_10k(_root) ================================================ FILE: mfvis_nococo/mask2former/data/datasets/register_mapillary_vistas.py ================================================ import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data.datasets import load_sem_seg MAPILLARY_VISTAS_SEM_SEG_CATEGORIES = [ { "color": [165, 42, 42], "instances": True, "readable": "Bird", "name": "animal--bird", "evaluate": True, }, { "color": [0, 192, 0], "instances": True, "readable": "Ground Animal", "name": "animal--ground-animal", "evaluate": True, }, { "color": [196, 196, 196], "instances": False, "readable": "Curb", "name": "construction--barrier--curb", "evaluate": True, }, { "color": [190, 153, 153], "instances": False, "readable": "Fence", "name": "construction--barrier--fence", "evaluate": True, }, { "color": [180, 165, 180], "instances": False, "readable": "Guard Rail", "name": "construction--barrier--guard-rail", "evaluate": True, }, { "color": [90, 120, 150], "instances": False, "readable": "Barrier", "name": "construction--barrier--other-barrier", "evaluate": True, }, { "color": [102, 102, 156], "instances": False, "readable": "Wall", "name": "construction--barrier--wall", "evaluate": True, }, { "color": [128, 64, 255], "instances": False, "readable": "Bike Lane", "name": "construction--flat--bike-lane", "evaluate": True, }, { "color": [140, 140, 200], "instances": True, "readable": "Crosswalk - Plain", "name": "construction--flat--crosswalk-plain", "evaluate": True, }, { "color": [170, 170, 170], "instances": False, "readable": "Curb Cut", "name": "construction--flat--curb-cut", "evaluate": True, }, { "color": [250, 170, 160], "instances": False, "readable": "Parking", "name": 
"construction--flat--parking", "evaluate": True, }, { "color": [96, 96, 96], "instances": False, "readable": "Pedestrian Area", "name": "construction--flat--pedestrian-area", "evaluate": True, }, { "color": [230, 150, 140], "instances": False, "readable": "Rail Track", "name": "construction--flat--rail-track", "evaluate": True, }, { "color": [128, 64, 128], "instances": False, "readable": "Road", "name": "construction--flat--road", "evaluate": True, }, { "color": [110, 110, 110], "instances": False, "readable": "Service Lane", "name": "construction--flat--service-lane", "evaluate": True, }, { "color": [244, 35, 232], "instances": False, "readable": "Sidewalk", "name": "construction--flat--sidewalk", "evaluate": True, }, { "color": [150, 100, 100], "instances": False, "readable": "Bridge", "name": "construction--structure--bridge", "evaluate": True, }, { "color": [70, 70, 70], "instances": False, "readable": "Building", "name": "construction--structure--building", "evaluate": True, }, { "color": [150, 120, 90], "instances": False, "readable": "Tunnel", "name": "construction--structure--tunnel", "evaluate": True, }, { "color": [220, 20, 60], "instances": True, "readable": "Person", "name": "human--person", "evaluate": True, }, { "color": [255, 0, 0], "instances": True, "readable": "Bicyclist", "name": "human--rider--bicyclist", "evaluate": True, }, { "color": [255, 0, 100], "instances": True, "readable": "Motorcyclist", "name": "human--rider--motorcyclist", "evaluate": True, }, { "color": [255, 0, 200], "instances": True, "readable": "Other Rider", "name": "human--rider--other-rider", "evaluate": True, }, { "color": [200, 128, 128], "instances": True, "readable": "Lane Marking - Crosswalk", "name": "marking--crosswalk-zebra", "evaluate": True, }, { "color": [255, 255, 255], "instances": False, "readable": "Lane Marking - General", "name": "marking--general", "evaluate": True, }, { "color": [64, 170, 64], "instances": False, "readable": "Mountain", "name": "nature--mountain", "evaluate": True, }, { "color": [230, 160, 50], "instances": False, "readable": "Sand", "name": "nature--sand", "evaluate": True, }, { "color": [70, 130, 180], "instances": False, "readable": "Sky", "name": "nature--sky", "evaluate": True, }, { "color": [190, 255, 255], "instances": False, "readable": "Snow", "name": "nature--snow", "evaluate": True, }, { "color": [152, 251, 152], "instances": False, "readable": "Terrain", "name": "nature--terrain", "evaluate": True, }, { "color": [107, 142, 35], "instances": False, "readable": "Vegetation", "name": "nature--vegetation", "evaluate": True, }, { "color": [0, 170, 30], "instances": False, "readable": "Water", "name": "nature--water", "evaluate": True, }, { "color": [255, 255, 128], "instances": True, "readable": "Banner", "name": "object--banner", "evaluate": True, }, { "color": [250, 0, 30], "instances": True, "readable": "Bench", "name": "object--bench", "evaluate": True, }, { "color": [100, 140, 180], "instances": True, "readable": "Bike Rack", "name": "object--bike-rack", "evaluate": True, }, { "color": [220, 220, 220], "instances": True, "readable": "Billboard", "name": "object--billboard", "evaluate": True, }, { "color": [220, 128, 128], "instances": True, "readable": "Catch Basin", "name": "object--catch-basin", "evaluate": True, }, { "color": [222, 40, 40], "instances": True, "readable": "CCTV Camera", "name": "object--cctv-camera", "evaluate": True, }, { "color": [100, 170, 30], "instances": True, "readable": "Fire Hydrant", "name": "object--fire-hydrant", 
"evaluate": True, }, { "color": [40, 40, 40], "instances": True, "readable": "Junction Box", "name": "object--junction-box", "evaluate": True, }, { "color": [33, 33, 33], "instances": True, "readable": "Mailbox", "name": "object--mailbox", "evaluate": True, }, { "color": [100, 128, 160], "instances": True, "readable": "Manhole", "name": "object--manhole", "evaluate": True, }, { "color": [142, 0, 0], "instances": True, "readable": "Phone Booth", "name": "object--phone-booth", "evaluate": True, }, { "color": [70, 100, 150], "instances": False, "readable": "Pothole", "name": "object--pothole", "evaluate": True, }, { "color": [210, 170, 100], "instances": True, "readable": "Street Light", "name": "object--street-light", "evaluate": True, }, { "color": [153, 153, 153], "instances": True, "readable": "Pole", "name": "object--support--pole", "evaluate": True, }, { "color": [128, 128, 128], "instances": True, "readable": "Traffic Sign Frame", "name": "object--support--traffic-sign-frame", "evaluate": True, }, { "color": [0, 0, 80], "instances": True, "readable": "Utility Pole", "name": "object--support--utility-pole", "evaluate": True, }, { "color": [250, 170, 30], "instances": True, "readable": "Traffic Light", "name": "object--traffic-light", "evaluate": True, }, { "color": [192, 192, 192], "instances": True, "readable": "Traffic Sign (Back)", "name": "object--traffic-sign--back", "evaluate": True, }, { "color": [220, 220, 0], "instances": True, "readable": "Traffic Sign (Front)", "name": "object--traffic-sign--front", "evaluate": True, }, { "color": [140, 140, 20], "instances": True, "readable": "Trash Can", "name": "object--trash-can", "evaluate": True, }, { "color": [119, 11, 32], "instances": True, "readable": "Bicycle", "name": "object--vehicle--bicycle", "evaluate": True, }, { "color": [150, 0, 255], "instances": True, "readable": "Boat", "name": "object--vehicle--boat", "evaluate": True, }, { "color": [0, 60, 100], "instances": True, "readable": "Bus", "name": "object--vehicle--bus", "evaluate": True, }, { "color": [0, 0, 142], "instances": True, "readable": "Car", "name": "object--vehicle--car", "evaluate": True, }, { "color": [0, 0, 90], "instances": True, "readable": "Caravan", "name": "object--vehicle--caravan", "evaluate": True, }, { "color": [0, 0, 230], "instances": True, "readable": "Motorcycle", "name": "object--vehicle--motorcycle", "evaluate": True, }, { "color": [0, 80, 100], "instances": False, "readable": "On Rails", "name": "object--vehicle--on-rails", "evaluate": True, }, { "color": [128, 64, 64], "instances": True, "readable": "Other Vehicle", "name": "object--vehicle--other-vehicle", "evaluate": True, }, { "color": [0, 0, 110], "instances": True, "readable": "Trailer", "name": "object--vehicle--trailer", "evaluate": True, }, { "color": [0, 0, 70], "instances": True, "readable": "Truck", "name": "object--vehicle--truck", "evaluate": True, }, { "color": [0, 0, 192], "instances": True, "readable": "Wheeled Slow", "name": "object--vehicle--wheeled-slow", "evaluate": True, }, { "color": [32, 32, 32], "instances": False, "readable": "Car Mount", "name": "void--car-mount", "evaluate": True, }, { "color": [120, 10, 10], "instances": False, "readable": "Ego Vehicle", "name": "void--ego-vehicle", "evaluate": True, }, { "color": [0, 0, 0], "instances": False, "readable": "Unlabeled", "name": "void--unlabeled", "evaluate": False, }, ] def _get_mapillary_vistas_meta(): stuff_classes = [k["readable"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES if k["evaluate"]] assert 
len(stuff_classes) == 65 stuff_colors = [k["color"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES if k["evaluate"]] assert len(stuff_colors) == 65 ret = { "stuff_classes": stuff_classes, "stuff_colors": stuff_colors, } return ret def register_all_mapillary_vistas(root): root = os.path.join(root, "mapillary_vistas") meta = _get_mapillary_vistas_meta() for name, dirname in [("train", "training"), ("val", "validation")]: image_dir = os.path.join(root, dirname, "images") gt_dir = os.path.join(root, dirname, "labels") name = f"mapillary_vistas_sem_seg_{name}" DatasetCatalog.register( name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") ) MetadataCatalog.get(name).set( image_root=image_dir, sem_seg_root=gt_dir, evaluator_type="sem_seg", ignore_label=65, # different from other datasets, Mapillary Vistas sets ignore_label to 65 **meta, ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_mapillary_vistas(_root) ================================================ FILE: mfvis_nococo/mask2former/data/datasets/register_mapillary_vistas_panoptic.py ================================================ import json import os from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.utils.file_io import PathManager MAPILLARY_VISTAS_SEM_SEG_CATEGORIES = [ {'color': [165, 42, 42], 'id': 1, 'isthing': 1, 'name': 'Bird', 'supercategory': 'animal--bird'}, {'color': [0, 192, 0], 'id': 2, 'isthing': 1, 'name': 'Ground Animal', 'supercategory': 'animal--ground-animal'}, {'color': [196, 196, 196], 'id': 3, 'isthing': 0, 'name': 'Curb', 'supercategory': 'construction--barrier--curb'}, {'color': [190, 153, 153], 'id': 4, 'isthing': 0, 'name': 'Fence', 'supercategory': 'construction--barrier--fence'}, {'color': [180, 165, 180], 'id': 5, 'isthing': 0, 'name': 'Guard Rail', 'supercategory': 'construction--barrier--guard-rail'}, {'color': [90, 120, 150], 'id': 6, 'isthing': 0, 'name': 'Barrier', 'supercategory': 'construction--barrier--other-barrier'}, {'color': [102, 102, 156], 'id': 7, 'isthing': 0, 'name': 'Wall', 'supercategory': 'construction--barrier--wall'}, {'color': [128, 64, 255], 'id': 8, 'isthing': 0, 'name': 'Bike Lane', 'supercategory': 'construction--flat--bike-lane'}, {'color': [140, 140, 200], 'id': 9, 'isthing': 1, 'name': 'Crosswalk - Plain', 'supercategory': 'construction--flat--crosswalk-plain'}, {'color': [170, 170, 170], 'id': 10, 'isthing': 0, 'name': 'Curb Cut', 'supercategory': 'construction--flat--curb-cut'}, {'color': [250, 170, 160], 'id': 11, 'isthing': 0, 'name': 'Parking', 'supercategory': 'construction--flat--parking'}, {'color': [96, 96, 96], 'id': 12, 'isthing': 0, 'name': 'Pedestrian Area', 'supercategory': 'construction--flat--pedestrian-area'}, {'color': [230, 150, 140], 'id': 13, 'isthing': 0, 'name': 'Rail Track', 'supercategory': 'construction--flat--rail-track'}, {'color': [128, 64, 128], 'id': 14, 'isthing': 0, 'name': 'Road', 'supercategory': 'construction--flat--road'}, {'color': [110, 110, 110], 'id': 15, 'isthing': 0, 'name': 'Service Lane', 'supercategory': 'construction--flat--service-lane'}, {'color': [244, 35, 232], 'id': 16, 'isthing': 0, 'name': 'Sidewalk', 'supercategory': 'construction--flat--sidewalk'}, {'color': [150, 100, 100], 'id': 17, 'isthing': 0, 'name': 'Bridge', 'supercategory': 'construction--structure--bridge'}, {'color': [70, 70, 70], 'id': 18, 'isthing': 0, 'name': 'Building', 'supercategory': 'construction--structure--building'}, {'color': [150, 120, 90], 'id': 19, 'isthing': 0, 'name': 
'Tunnel', 'supercategory': 'construction--structure--tunnel'}, {'color': [220, 20, 60], 'id': 20, 'isthing': 1, 'name': 'Person', 'supercategory': 'human--person'}, {'color': [255, 0, 0], 'id': 21, 'isthing': 1, 'name': 'Bicyclist', 'supercategory': 'human--rider--bicyclist'}, {'color': [255, 0, 100], 'id': 22, 'isthing': 1, 'name': 'Motorcyclist', 'supercategory': 'human--rider--motorcyclist'}, {'color': [255, 0, 200], 'id': 23, 'isthing': 1, 'name': 'Other Rider', 'supercategory': 'human--rider--other-rider'}, {'color': [200, 128, 128], 'id': 24, 'isthing': 1, 'name': 'Lane Marking - Crosswalk', 'supercategory': 'marking--crosswalk-zebra'}, {'color': [255, 255, 255], 'id': 25, 'isthing': 0, 'name': 'Lane Marking - General', 'supercategory': 'marking--general'}, {'color': [64, 170, 64], 'id': 26, 'isthing': 0, 'name': 'Mountain', 'supercategory': 'nature--mountain'}, {'color': [230, 160, 50], 'id': 27, 'isthing': 0, 'name': 'Sand', 'supercategory': 'nature--sand'}, {'color': [70, 130, 180], 'id': 28, 'isthing': 0, 'name': 'Sky', 'supercategory': 'nature--sky'}, {'color': [190, 255, 255], 'id': 29, 'isthing': 0, 'name': 'Snow', 'supercategory': 'nature--snow'}, {'color': [152, 251, 152], 'id': 30, 'isthing': 0, 'name': 'Terrain', 'supercategory': 'nature--terrain'}, {'color': [107, 142, 35], 'id': 31, 'isthing': 0, 'name': 'Vegetation', 'supercategory': 'nature--vegetation'}, {'color': [0, 170, 30], 'id': 32, 'isthing': 0, 'name': 'Water', 'supercategory': 'nature--water'}, {'color': [255, 255, 128], 'id': 33, 'isthing': 1, 'name': 'Banner', 'supercategory': 'object--banner'}, {'color': [250, 0, 30], 'id': 34, 'isthing': 1, 'name': 'Bench', 'supercategory': 'object--bench'}, {'color': [100, 140, 180], 'id': 35, 'isthing': 1, 'name': 'Bike Rack', 'supercategory': 'object--bike-rack'}, {'color': [220, 220, 220], 'id': 36, 'isthing': 1, 'name': 'Billboard', 'supercategory': 'object--billboard'}, {'color': [220, 128, 128], 'id': 37, 'isthing': 1, 'name': 'Catch Basin', 'supercategory': 'object--catch-basin'}, {'color': [222, 40, 40], 'id': 38, 'isthing': 1, 'name': 'CCTV Camera', 'supercategory': 'object--cctv-camera'}, {'color': [100, 170, 30], 'id': 39, 'isthing': 1, 'name': 'Fire Hydrant', 'supercategory': 'object--fire-hydrant'}, {'color': [40, 40, 40], 'id': 40, 'isthing': 1, 'name': 'Junction Box', 'supercategory': 'object--junction-box'}, {'color': [33, 33, 33], 'id': 41, 'isthing': 1, 'name': 'Mailbox', 'supercategory': 'object--mailbox'}, {'color': [100, 128, 160], 'id': 42, 'isthing': 1, 'name': 'Manhole', 'supercategory': 'object--manhole'}, {'color': [142, 0, 0], 'id': 43, 'isthing': 1, 'name': 'Phone Booth', 'supercategory': 'object--phone-booth'}, {'color': [70, 100, 150], 'id': 44, 'isthing': 0, 'name': 'Pothole', 'supercategory': 'object--pothole'}, {'color': [210, 170, 100], 'id': 45, 'isthing': 1, 'name': 'Street Light', 'supercategory': 'object--street-light'}, {'color': [153, 153, 153], 'id': 46, 'isthing': 1, 'name': 'Pole', 'supercategory': 'object--support--pole'}, {'color': [128, 128, 128], 'id': 47, 'isthing': 1, 'name': 'Traffic Sign Frame', 'supercategory': 'object--support--traffic-sign-frame'}, {'color': [0, 0, 80], 'id': 48, 'isthing': 1, 'name': 'Utility Pole', 'supercategory': 'object--support--utility-pole'}, {'color': [250, 170, 30], 'id': 49, 'isthing': 1, 'name': 'Traffic Light', 'supercategory': 'object--traffic-light'}, {'color': [192, 192, 192], 'id': 50, 'isthing': 1, 'name': 'Traffic Sign (Back)', 'supercategory': 'object--traffic-sign--back'}, 
{'color': [220, 220, 0], 'id': 51, 'isthing': 1, 'name': 'Traffic Sign (Front)', 'supercategory': 'object--traffic-sign--front'}, {'color': [140, 140, 20], 'id': 52, 'isthing': 1, 'name': 'Trash Can', 'supercategory': 'object--trash-can'}, {'color': [119, 11, 32], 'id': 53, 'isthing': 1, 'name': 'Bicycle', 'supercategory': 'object--vehicle--bicycle'}, {'color': [150, 0, 255], 'id': 54, 'isthing': 1, 'name': 'Boat', 'supercategory': 'object--vehicle--boat'}, {'color': [0, 60, 100], 'id': 55, 'isthing': 1, 'name': 'Bus', 'supercategory': 'object--vehicle--bus'}, {'color': [0, 0, 142], 'id': 56, 'isthing': 1, 'name': 'Car', 'supercategory': 'object--vehicle--car'}, {'color': [0, 0, 90], 'id': 57, 'isthing': 1, 'name': 'Caravan', 'supercategory': 'object--vehicle--caravan'}, {'color': [0, 0, 230], 'id': 58, 'isthing': 1, 'name': 'Motorcycle', 'supercategory': 'object--vehicle--motorcycle'}, {'color': [0, 80, 100], 'id': 59, 'isthing': 0, 'name': 'On Rails', 'supercategory': 'object--vehicle--on-rails'}, {'color': [128, 64, 64], 'id': 60, 'isthing': 1, 'name': 'Other Vehicle', 'supercategory': 'object--vehicle--other-vehicle'}, {'color': [0, 0, 110], 'id': 61, 'isthing': 1, 'name': 'Trailer', 'supercategory': 'object--vehicle--trailer'}, {'color': [0, 0, 70], 'id': 62, 'isthing': 1, 'name': 'Truck', 'supercategory': 'object--vehicle--truck'}, {'color': [0, 0, 192], 'id': 63, 'isthing': 1, 'name': 'Wheeled Slow', 'supercategory': 'object--vehicle--wheeled-slow'}, {'color': [32, 32, 32], 'id': 64, 'isthing': 0, 'name': 'Car Mount', 'supercategory': 'void--car-mount'}, {'color': [120, 10, 10], 'id': 65, 'isthing': 0, 'name': 'Ego Vehicle', 'supercategory': 'void--ego-vehicle'} ] def load_mapillary_vistas_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): """ Args: image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". Returns: list[dict]: a list of dicts in Detectron2 standard format. (See `Using Custom Datasets `_ ) """ def _convert_category_id(segment_info, meta): if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = True else: segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = False return segment_info with PathManager.open(json_file) as f: json_info = json.load(f) ret = [] for ann in json_info["annotations"]: image_id = ann["image_id"] # TODO: currently we assume image and label has the same filename but # different extension, and images have extension ".jpg" for COCO. Need # to make image extension a user-provided argument if we extend this # function to support other COCO-like datasets. image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") label_file = os.path.join(gt_dir, ann["file_name"]) sem_label_file = os.path.join(semseg_dir, ann["file_name"]) segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] ret.append( { "file_name": image_file, "image_id": image_id, "pan_seg_file_name": label_file, "sem_seg_file_name": sem_label_file, "segments_info": segments_info, } ) assert len(ret), f"No images found in {image_dir}!" 
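# spot-check the first record so that a missing image, panoptic PNG, or semantic PNG fails fast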
assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] return ret def register_mapillary_vistas_panoptic( name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None ): """ Register a "standard" version of ADE20k panoptic segmentation dataset named `name`. The dictionaries in this registered dataset follows detectron2's standard format. Hence it's called "standard". Args: name (str): the name that identifies a dataset, e.g. "ade20k_panoptic_train" metadata (dict): extra metadata associated with this dataset. image_root (str): directory which contains all the images panoptic_root (str): directory which contains panoptic annotation images in COCO format panoptic_json (str): path to the json panoptic annotation file in COCO format sem_seg_root (none): not used, to be consistent with `register_coco_panoptic_separated`. instances_json (str): path to the json instance annotation file """ panoptic_name = name DatasetCatalog.register( panoptic_name, lambda: load_mapillary_vistas_panoptic_json( panoptic_json, image_root, panoptic_root, semantic_root, metadata ), ) MetadataCatalog.get(panoptic_name).set( panoptic_root=panoptic_root, image_root=image_root, panoptic_json=panoptic_json, json_file=instances_json, evaluator_type="mapillary_vistas_panoptic_seg", ignore_label=65, # different from other datasets, Mapillary Vistas sets ignore_label to 65 label_divisor=1000, **metadata, ) _PREDEFINED_SPLITS_ADE20K_PANOPTIC = { "mapillary_vistas_panoptic_train": ( "mapillary_vistas/training/images", "mapillary_vistas/training/panoptic", "mapillary_vistas/training/panoptic/panoptic_2018.json", "mapillary_vistas/training/labels", ), "mapillary_vistas_panoptic_val": ( "mapillary_vistas/validation/images", "mapillary_vistas/validation/panoptic", "mapillary_vistas/validation/panoptic/panoptic_2018.json", "mapillary_vistas/validation/labels", ), } def get_metadata(): meta = {} # The following metadata maps contiguous id from [0, #thing categories + # #stuff categories) to their names and colors. We have to replica of the # same name and color under "thing_*" and "stuff_*" because the current # visualization function in D2 handles thing and class classes differently # due to some heuristic used in Panoptic FPN. We keep the same naming to # enable reusing existing visualization functions. thing_classes = [k["name"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] thing_colors = [k["color"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] stuff_classes = [k["name"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] stuff_colors = [k["color"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] meta["thing_classes"] = thing_classes meta["thing_colors"] = thing_colors meta["stuff_classes"] = stuff_classes meta["stuff_colors"] = stuff_colors # Convert category id for training: # category id: like semantic segmentation, it is the class id for each # pixel. Since there are some classes not used in evaluation, the category # id is not always contiguous and thus we have two set of category ids: # - original category id: category id in the original dataset, mainly # used for evaluation. # - contiguous category id: [0, #classes), in order to train the linear # softmax classifier. 
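# For example, with the categories above: "Bird" (dataset id 1, a thing) maps to contiguous id 0 in both dicts, while "Curb" (dataset id 3, stuff) only receives the stuff mapping 3 -> 2; every category, thing or stuff, gets a stuff id so the sem_seg evaluator can be reused.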
thing_dataset_id_to_contiguous_id = {} stuff_dataset_id_to_contiguous_id = {} for i, cat in enumerate(MAPILLARY_VISTAS_SEM_SEG_CATEGORIES): if cat["isthing"]: thing_dataset_id_to_contiguous_id[cat["id"]] = i # else: # stuff_dataset_id_to_contiguous_id[cat["id"]] = i # in order to use sem_seg evaluator stuff_dataset_id_to_contiguous_id[cat["id"]] = i meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id return meta def register_all_mapillary_vistas_panoptic(root): metadata = get_metadata() for ( prefix, (image_root, panoptic_root, panoptic_json, semantic_root), ) in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items(): # The "standard" version of COCO panoptic segmentation dataset, # e.g. used by Panoptic-DeepLab register_mapillary_vistas_panoptic( prefix, metadata, os.path.join(root, image_root), os.path.join(root, panoptic_root), os.path.join(root, semantic_root), os.path.join(root, panoptic_json), ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_mapillary_vistas_panoptic(_root) ================================================ FILE: mfvis_nococo/mask2former/evaluation/__init__.py ================================================ ================================================ FILE: mfvis_nococo/mask2former/evaluation/__init__.py.new ================================================ ================================================ FILE: mfvis_nococo/mask2former/evaluation/instance_evaluation.py ================================================ import contextlib import copy import io import itertools import json import logging import numpy as np import os import pickle from collections import OrderedDict import pycocotools.mask as mask_util import torch from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval from tabulate import tabulate import detectron2.utils.comm as comm from detectron2.config import CfgNode from detectron2.data import MetadataCatalog from detectron2.data.datasets.coco import convert_to_coco_json from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco from detectron2.evaluation.fast_eval_api import COCOeval_opt from detectron2.structures import Boxes, BoxMode, pairwise_iou from detectron2.utils.file_io import PathManager from detectron2.utils.logger import create_small_table # modified from COCOEvaluator for instance segmetnat class InstanceSegEvaluator(COCOEvaluator): """ Evaluate AR for object proposals, AP for instance detection/segmentation, AP for keypoint detection outputs using COCO's metrics. See http://cocodataset.org/#detection-eval and http://cocodataset.org/#keypoints-eval to understand its metrics. The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means the metric cannot be computed (e.g. due to no predictions made). In addition to COCO, this evaluator is able to support any bounding box detection, instance segmentation, or keypoint detection dataset. """ def _eval_predictions(self, predictions, img_ids=None): """ Evaluate predictions. Fill self._results with the metrics of the tasks. 
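Unlike the base COCOEvaluator, predicted category ids are not required to form a contiguous [0, num_classes) range; they only need to appear in the dataset's thing_dataset_id_to_contiguous_id mapping before being converted back to dataset ids.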
""" self._logger.info("Preparing results for COCO format ...") coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) tasks = self._tasks or self._tasks_from_predictions(coco_results) # unmap the category ids for COCO if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) # num_classes = len(all_contiguous_ids) # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} for result in coco_results: category_id = result["category_id"] # assert category_id < num_classes, ( # f"A prediction has class={category_id}, " # f"but the dataset only has {num_classes} classes and " # f"predicted class id should be in [0, {num_classes - 1}]." # ) assert category_id in reverse_id_mapping, ( f"A prediction has class={category_id}, " f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." ) result["category_id"] = reverse_id_mapping[category_id] if self._output_dir: file_path = os.path.join(self._output_dir, "coco_instances_results.json") self._logger.info("Saving results to {}".format(file_path)) with PathManager.open(file_path, "w") as f: f.write(json.dumps(coco_results)) f.flush() if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return self._logger.info( "Evaluating predictions with {} COCO API...".format( "unofficial" if self._use_fast_impl else "official" ) ) for task in sorted(tasks): assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" coco_eval = ( _evaluate_predictions_on_coco( self._coco_api, coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas, use_fast_impl=self._use_fast_impl, img_ids=img_ids, max_dets_per_image=self._max_dets_per_image, ) if len(coco_results) > 0 else None # cocoapi does not handle empty results very well ) res = self._derive_coco_results( coco_eval, task, class_names=self._metadata.get("thing_classes") ) self._results[task] = res ================================================ FILE: mfvis_nococo/mask2former/maskformer_model.py ================================================ from typing import Tuple import torch from torch import nn from torch.nn import functional as F from detectron2.config import configurable from detectron2.data import MetadataCatalog from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head from detectron2.modeling.backbone import Backbone from detectron2.modeling.postprocessing import sem_seg_postprocess from detectron2.structures import Boxes, ImageList, Instances, BitMasks from detectron2.utils.memory import retry_if_cuda_oom from .modeling.criterion import SetCriterion from .modeling.matcher import HungarianMatcher from skimage import color import cv2 import numpy as np def unfold_wo_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) # remove the center pixels size = kernel_size ** 2 unfolded_x = torch.cat(( unfolded_x[:, :, :size // 2], unfolded_x[:, :, size // 2 + 1:] ), dim=2) return unfolded_x def get_images_color_similarity(images, kernel_size, dilation): assert images.dim() 
== 4 assert images.size(0) == 1 unfolded_images = unfold_wo_center( images, kernel_size=kernel_size, dilation=dilation ) diff = images[:, :, None] - unfolded_images similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5) return similarity @META_ARCH_REGISTRY.register() class MaskFormer(nn.Module): """ Main class for mask classification semantic segmentation architectures. """ @configurable def __init__( self, *, backbone: Backbone, sem_seg_head: nn.Module, criterion: nn.Module, num_queries: int, object_mask_threshold: float, overlap_threshold: float, metadata, size_divisibility: int, sem_seg_postprocess_before_inference: bool, pixel_mean: Tuple[float], pixel_std: Tuple[float], # inference semantic_on: bool, panoptic_on: bool, instance_on: bool, test_topk_per_image: int, ): """ Args: backbone: a backbone module, must follow detectron2's backbone interface sem_seg_head: a module that predicts semantic segmentation from backbone features criterion: a module that defines the loss num_queries: int, number of queries object_mask_threshold: float, threshold to filter query based on classification score for panoptic segmentation inference overlap_threshold: overlap threshold used in general inference for panoptic segmentation metadata: dataset meta, get `thing` and `stuff` category names for panoptic segmentation inference size_divisibility: Some backbones require the input height and width to be divisible by a specific integer. We can use this to override such requirement. sem_seg_postprocess_before_inference: whether to resize the prediction back to original input size before semantic segmentation inference or after. For high-resolution dataset like Mapillary, resizing predictions before inference will cause OOM error. pixel_mean, pixel_std: list or tuple with #channels element, representing the per-channel mean and std to be used to normalize the input image semantic_on: bool, whether to output semantic segmentation prediction instance_on: bool, whether to output instance segmentation prediction panoptic_on: bool, whether to output panoptic segmentation prediction test_topk_per_image: int, instance segmentation parameter, keep topk instances per image """ super().__init__() self.backbone = backbone self.sem_seg_head = sem_seg_head self.criterion = criterion self.num_queries = num_queries self.overlap_threshold = overlap_threshold self.object_mask_threshold = object_mask_threshold self.metadata = metadata if size_divisibility < 0: # use backbone size_divisibility if not set size_divisibility = self.backbone.size_divisibility self.size_divisibility = size_divisibility self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) # additional args self.semantic_on = semantic_on self.instance_on = instance_on self.panoptic_on = panoptic_on self.test_topk_per_image = test_topk_per_image if not self.semantic_on: assert self.sem_seg_postprocess_before_inference @classmethod def from_config(cls, cfg): backbone = build_backbone(cfg) sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape()) # Loss parameters: deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT # loss weights class_weight = cfg.MODEL.MASK_FORMER.CLASS_WEIGHT dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT # building criterion matcher = 
HungarianMatcher( cost_class=class_weight, cost_mask=mask_weight, cost_dice=dice_weight, num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS, ) weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, "loss_dice": dice_weight, "loss_bound": mask_weight} if deep_supervision: dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS aux_weight_dict = {} for i in range(dec_layers - 1): aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) losses = ["labels", "masks"] criterion = SetCriterion( sem_seg_head.num_classes, matcher=matcher, weight_dict=weight_dict, eos_coef=no_object_weight, losses=losses, num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS, oversample_ratio=cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO, importance_sample_ratio=cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO, ) return { "backbone": backbone, "sem_seg_head": sem_seg_head, "criterion": criterion, "num_queries": cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES, "object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD, "overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD, "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY, "sem_seg_postprocess_before_inference": ( cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE or cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON or cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON ), "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, # inference "semantic_on": cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON, "instance_on": cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON, "panoptic_on": cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON, "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE, } @property def device(self): return self.pixel_mean.device def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper`. Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * "image": Tensor, image in (C, H, W) format. * "instances": per-region ground truth * Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model (may be different from input resolution), used in inference. Returns: list[dict]: each dict has the results for one image. The dict contains the following keys: * "sem_seg": A Tensor that represents the per-pixel segmentation prediced by the head. The prediction has shape KxHxW that represents the logits of each class for each pixel. * "panoptic_seg": A tuple that represent panoptic output panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. segments_info (list[dict]): Describe each segment in `panoptic_seg`. Each dict contains keys "id", "category_id", "isthing". 
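* "instances": an `Instances` object with the top-scoring instance masks, returned when instance inference is enabled. During training the model instead returns the weighted loss dict; padded regions are read from "padding_mask", and per-image color-similarity maps (computed on 4x average-pooled Lab images with kernel size 3 and dilation 2) are passed to the criterion as `images_lab_sim`.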
""" images = [x["image"].to(self.device) for x in batched_inputs] if self.training: rs_images = ImageList.from_tensors(images, self.size_divisibility) image_masks = [~ x["padding_mask"].to(self.device) for x in batched_inputs] image_masks_back = [x["padding_mask"].to(self.device) for x in batched_inputs] image_masks_bool = [((m.sum() / (m.shape[0] * m.shape[1])) > 0.25).float()*((m_b.sum() / (m.shape[0] * m.shape[1])) > 0.25).float() for m, m_b in zip(image_masks, image_masks_back)] downsampled_images = F.avg_pool2d(rs_images.tensor.float(), kernel_size=4, stride=4, padding=0) #for img in images] images_lab = [torch.as_tensor(color.rgb2lab(ds_image[[2, 1, 0]].byte().permute(1, 2, 0).cpu().numpy()), device=ds_image.device, dtype=torch.float32).permute(2, 0, 1) for ds_image in downsampled_images] images_lab_sim = [get_images_color_similarity(img_lab.unsqueeze(0), 3, 2) * float(img_m_bool) for img_lab, img_m_bool in zip(images_lab, image_masks_bool)] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.size_divisibility) features = self.backbone(images.tensor) outputs = self.sem_seg_head(features) if self.training: # mask classification target if "instances" in batched_inputs[0]: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] targets = self.prepare_targets(gt_instances, images) else: targets = None # bipartite matching-based loss losses = self.criterion(outputs, targets, images_lab_sim) for k in list(losses.keys()): if k in self.criterion.weight_dict: losses[k] *= self.criterion.weight_dict[k] else: # remove this loss if not specified in `weight_dict` losses.pop(k) return losses else: mask_cls_results = outputs["pred_logits"] mask_pred_results = outputs["pred_masks"] # upsample masks mask_pred_results = F.interpolate( mask_pred_results, size=(images.tensor.shape[-2], images.tensor.shape[-1]), mode="bilinear", align_corners=False, ) del outputs processed_results = [] for mask_cls_result, mask_pred_result, input_per_image, image_size in zip( mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes ): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) processed_results.append({}) if self.sem_seg_postprocess_before_inference: mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)( mask_pred_result, image_size, height, width ) mask_cls_result = mask_cls_result.to(mask_pred_result) # semantic segmentation inference if self.semantic_on: r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result) if not self.sem_seg_postprocess_before_inference: r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width) processed_results[-1]["sem_seg"] = r # panoptic segmentation inference if self.panoptic_on: panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result) processed_results[-1]["panoptic_seg"] = panoptic_r # instance segmentation inference if self.instance_on: instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result) processed_results[-1]["instances"] = instance_r return processed_results def prepare_targets(self, targets, images): h_pad, w_pad = images.tensor.shape[-2:] new_targets = [] for targets_per_image in targets: # pad gt gt_masks = targets_per_image.gt_masks padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device) padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks 
new_targets.append( { "labels": targets_per_image.gt_classes, "masks": padded_masks, } ) return new_targets def semantic_inference(self, mask_cls, mask_pred): mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1] mask_pred = mask_pred.sigmoid() semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred) return semseg def panoptic_inference(self, mask_cls, mask_pred): scores, labels = F.softmax(mask_cls, dim=-1).max(-1) mask_pred = mask_pred.sigmoid() keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold) cur_scores = scores[keep] cur_classes = labels[keep] cur_masks = mask_pred[keep] cur_mask_cls = mask_cls[keep] cur_mask_cls = cur_mask_cls[:, :-1] cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks h, w = cur_masks.shape[-2:] panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device) segments_info = [] current_segment_id = 0 if cur_masks.shape[0] == 0: # We didn't detect any mask :( return panoptic_seg, segments_info else: # take argmax cur_mask_ids = cur_prob_masks.argmax(0) stuff_memory_list = {} for k in range(cur_classes.shape[0]): pred_class = cur_classes[k].item() isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values() mask_area = (cur_mask_ids == k).sum().item() original_area = (cur_masks[k] >= 0.5).sum().item() mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5) if mask_area > 0 and original_area > 0 and mask.sum().item() > 0: if mask_area / original_area < self.overlap_threshold: continue # merge stuff regions if not isthing: if int(pred_class) in stuff_memory_list.keys(): panoptic_seg[mask] = stuff_memory_list[int(pred_class)] continue else: stuff_memory_list[int(pred_class)] = current_segment_id + 1 current_segment_id += 1 panoptic_seg[mask] = current_segment_id segments_info.append( { "id": current_segment_id, "isthing": bool(isthing), "category_id": int(pred_class), } ) return panoptic_seg, segments_info def instance_inference(self, mask_cls, mask_pred): # mask_pred is already processed to have the same shape as original input image_size = mask_pred.shape[-2:] # [Q, K] scores = F.softmax(mask_cls, dim=-1)[:, :-1] labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1) # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False) scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False) labels_per_image = labels[topk_indices] topk_indices = topk_indices // self.sem_seg_head.num_classes # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1) mask_pred = mask_pred[topk_indices] # if this is panoptic segmentation, we only keep the "thing" classes if self.panoptic_on: keep = torch.zeros_like(scores_per_image).bool() for i, lab in enumerate(labels_per_image): keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values() scores_per_image = scores_per_image[keep] labels_per_image = labels_per_image[keep] mask_pred = mask_pred[keep] result = Instances(image_size) result.pred_masks = (mask_pred > 0).float() result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes() # calculate average mask prob mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6) result.scores = scores_per_image * mask_scores_per_image result.pred_classes = labels_per_image return result ================================================ FILE: 
mfvis_nococo/mask2former/modeling/__init__.py ================================================ from .backbone.swin import D2SwinTransformer from .pixel_decoder.fpn import BasePixelDecoder from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder from .meta_arch.mask_former_head import MaskFormerHead from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead ================================================ FILE: mfvis_nococo/mask2former/modeling/backbone/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: mfvis_nococo/mask2former/modeling/backbone/__init__.py.new ================================================ ================================================ FILE: mfvis_nococo/mask2former/modeling/backbone/swin.py ================================================ # -------------------------------------------------------- # Swin Transformer # Copyright (c) 2021 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ze Liu, Yutong Lin, Yixuan Wei # -------------------------------------------------------- # Modified by Bowen Cheng from https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/mmseg/models/backbones/swin_transformer.py import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint from timm.models.layers import DropPath, to_2tuple, trunc_normal_ from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec class Mlp(nn.Module): """Multilayer perceptron.""" def __init__( self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x def window_partition(x, window_size): """ Args: x: (B, H, W, C) window_size (int): window size Returns: windows: (num_windows*B, window_size, window_size, C) """ B, H, W, C = x.shape x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) return windows def window_reverse(windows, window_size, H, W): """ Args: windows: (num_windows*B, window_size, window_size, C) window_size (int): Window size H (int): Height of image W (int): Width of image Returns: x: (B, H, W, C) """ B = int(windows.shape[0] / (H * W / window_size / window_size)) x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) return x class WindowAttention(nn.Module): """Window based multi-head self attention (W-MSA) module with relative position bias. It supports both of shifted and non-shifted window. Args: dim (int): Number of input channels. window_size (tuple[int]): The height and width of the window. num_heads (int): Number of attention heads. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set attn_drop (float, optional): Dropout ratio of attention weight. 
Default: 0.0 proj_drop (float, optional): Dropout ratio of output. Default: 0.0 """ def __init__( self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0.0, proj_drop=0.0, ): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim ** -0.5 # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) ) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = torch.arange(self.window_size[0]) coords_w = torch.arange(self.window_size[1]) coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww self.register_buffer("relative_position_index", relative_position_index) self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) trunc_normal_(self.relative_position_bias_table, std=0.02) self.softmax = nn.Softmax(dim=-1) def forward(self, x, mask=None): """Forward function. Args: x: input features with shape of (num_windows*B, N, C) mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None """ B_, N, C = x.shape qkv = ( self.qkv(x) .reshape(B_, N, 3, self.num_heads, C // self.num_heads) .permute(2, 0, 3, 1, 4) ) q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = q @ k.transpose(-2, -1) relative_position_bias = self.relative_position_bias_table[ self.relative_position_index.view(-1) ].view( self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 ) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.permute( 2, 0, 1 ).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: nW = mask.shape[0] attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) attn = attn.view(-1, self.num_heads, N, N) attn = self.softmax(attn) else: attn = self.softmax(attn) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B_, N, C) x = self.proj(x) x = self.proj_drop(x) return x class SwinTransformerBlock(nn.Module): """Swin Transformer Block. Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. window_size (int): Window size. shift_size (int): Shift size for SW-MSA. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float, optional): Stochastic depth rate. Default: 0.0 act_layer (nn.Module, optional): Activation layer. Default: nn.GELU norm_layer (nn.Module, optional): Normalization layer. 
Default: nn.LayerNorm """ def __init__( self, dim, num_heads, window_size=7, shift_size=0, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop=0.0, attn_drop=0.0, drop_path=0.0, act_layer=nn.GELU, norm_layer=nn.LayerNorm, ): super().__init__() self.dim = dim self.num_heads = num_heads self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, ) self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop ) self.H = None self.W = None def forward(self, x, mask_matrix): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. mask_matrix: Attention mask for cyclic shift. """ B, L, C = x.shape H, W = self.H, self.W assert L == H * W, "input feature has wrong size" shortcut = x x = self.norm1(x) x = x.view(B, H, W, C) # pad feature maps to multiples of window size pad_l = pad_t = 0 pad_r = (self.window_size - W % self.window_size) % self.window_size pad_b = (self.window_size - H % self.window_size) % self.window_size x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) _, Hp, Wp, _ = x.shape # cyclic shift if self.shift_size > 0: shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) attn_mask = mask_matrix else: shifted_x = x attn_mask = None # partition windows x_windows = window_partition( shifted_x, self.window_size ) # nW*B, window_size, window_size, C x_windows = x_windows.view( -1, self.window_size * self.window_size, C ) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C # reverse cyclic shift if self.shift_size > 0: x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) else: x = shifted_x if pad_r > 0 or pad_b > 0: x = x[:, :H, :W, :].contiguous() x = x.view(B, H * W, C) # FFN x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class PatchMerging(nn.Module): """Patch Merging Layer Args: dim (int): Number of input channels. norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm """ def __init__(self, dim, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) self.norm = norm_layer(4 * dim) def forward(self, x, H, W): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. 
""" B, L, C = x.shape assert L == H * W, "input feature has wrong size" x = x.view(B, H, W, C) # padding pad_input = (H % 2 == 1) or (W % 2 == 1) if pad_input: x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C x = self.norm(x) x = self.reduction(x) return x class BasicLayer(nn.Module): """A basic Swin Transformer layer for one stage. Args: dim (int): Number of feature channels depth (int): Depths of this stage. num_heads (int): Number of attention head. window_size (int): Local window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__( self, dim, depth, num_heads, window_size=7, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop=0.0, attn_drop=0.0, drop_path=0.0, norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, ): super().__init__() self.window_size = window_size self.shift_size = window_size // 2 self.depth = depth self.use_checkpoint = use_checkpoint # build blocks self.blocks = nn.ModuleList( [ SwinTransformerBlock( dim=dim, num_heads=num_heads, window_size=window_size, shift_size=0 if (i % 2 == 0) else window_size // 2, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, norm_layer=norm_layer, ) for i in range(depth) ] ) # patch merging layer if downsample is not None: self.downsample = downsample(dim=dim, norm_layer=norm_layer) else: self.downsample = None def forward(self, x, H, W): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. 
""" # calculate attention mask for SW-MSA Hp = int(np.ceil(H / self.window_size)) * self.window_size Wp = int(np.ceil(W / self.window_size)) * self.window_size img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 h_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) w_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 mask_windows = window_partition( img_mask, self.window_size ) # nW, window_size, window_size, 1 mask_windows = mask_windows.view(-1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( attn_mask == 0, float(0.0) ) for blk in self.blocks: blk.H, blk.W = H, W if self.use_checkpoint: x = checkpoint.checkpoint(blk, x, attn_mask) else: x = blk(x, attn_mask) if self.downsample is not None: x_down = self.downsample(x, H, W) Wh, Ww = (H + 1) // 2, (W + 1) // 2 return x, H, W, x_down, Wh, Ww else: return x, H, W, x, H, W class PatchEmbed(nn.Module): """Image to Patch Embedding Args: patch_size (int): Patch token size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (nn.Module, optional): Normalization layer. Default: None """ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): super().__init__() patch_size = to_2tuple(patch_size) self.patch_size = patch_size self.in_chans = in_chans self.embed_dim = embed_dim self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: self.norm = None def forward(self, x): """Forward function.""" # padding _, _, H, W = x.size() if W % self.patch_size[1] != 0: x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) if H % self.patch_size[0] != 0: x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) x = self.proj(x) # B C Wh Ww if self.norm is not None: Wh, Ww = x.size(2), x.size(3) x = x.flatten(2).transpose(1, 2) x = self.norm(x) x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) return x class SwinTransformer(nn.Module): """Swin Transformer backbone. A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - https://arxiv.org/pdf/2103.14030 Args: pretrain_img_size (int): Input image size for training the pretrained model, used in absolute postion embedding. Default 224. patch_size (int | tuple(int)): Patch size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. depths (tuple[int]): Depths of each Swin Transformer stage. num_heads (tuple[int]): Number of attention head of each stage. window_size (int): Window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. drop_rate (float): Dropout rate. attn_drop_rate (float): Attention dropout rate. Default: 0. drop_path_rate (float): Stochastic depth rate. Default: 0.2. norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. 
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. patch_norm (bool): If True, add normalization after patch embedding. Default: True. out_indices (Sequence[int]): Output from which stages. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__( self, pretrain_img_size=224, patch_size=4, in_chans=3, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, norm_layer=nn.LayerNorm, ape=False, patch_norm=True, out_indices=(0, 1, 2, 3), frozen_stages=-1, use_checkpoint=False, ): super().__init__() self.pretrain_img_size = pretrain_img_size self.num_layers = len(depths) self.embed_dim = embed_dim self.ape = ape self.patch_norm = patch_norm self.out_indices = out_indices self.frozen_stages = frozen_stages # split image into non-overlapping patches self.patch_embed = PatchEmbed( patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None, ) # absolute position embedding if self.ape: pretrain_img_size = to_2tuple(pretrain_img_size) patch_size = to_2tuple(patch_size) patches_resolution = [ pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1], ] self.absolute_pos_embed = nn.Parameter( torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) ) trunc_normal_(self.absolute_pos_embed, std=0.02) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) ] # stochastic depth decay rule # build layers self.layers = nn.ModuleList() for i_layer in range(self.num_layers): layer = BasicLayer( dim=int(embed_dim * 2 ** i_layer), depth=depths[i_layer], num_heads=num_heads[i_layer], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, use_checkpoint=use_checkpoint, ) self.layers.append(layer) num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] self.num_features = num_features # add a norm layer for each output for i_layer in out_indices: layer = norm_layer(num_features[i_layer]) layer_name = f"norm{i_layer}" self.add_module(layer_name, layer) self._freeze_stages() def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.requires_grad = False if self.frozen_stages >= 1 and self.ape: self.absolute_pos_embed.requires_grad = False if self.frozen_stages >= 2: self.pos_drop.eval() for i in range(0, self.frozen_stages - 1): m = self.layers[i] m.eval() for param in m.parameters(): param.requires_grad = False def init_weights(self, pretrained=None): """Initialize the weights in backbone. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. 
""" def _init_weights(m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) def forward(self, x): """Forward function.""" x = self.patch_embed(x) Wh, Ww = x.size(2), x.size(3) if self.ape: # interpolate the position embedding to the corresponding size absolute_pos_embed = F.interpolate( self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" ) x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C else: x = x.flatten(2).transpose(1, 2) x = self.pos_drop(x) outs = {} for i in range(self.num_layers): layer = self.layers[i] x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) if i in self.out_indices: norm_layer = getattr(self, f"norm{i}") x_out = norm_layer(x_out) out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() outs["res{}".format(i + 2)] = out return outs def train(self, mode=True): """Convert the model into training mode while keep layers freezed.""" super(SwinTransformer, self).train(mode) self._freeze_stages() @BACKBONE_REGISTRY.register() class D2SwinTransformer(SwinTransformer, Backbone): def __init__(self, cfg, input_shape): pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE patch_size = cfg.MODEL.SWIN.PATCH_SIZE in_chans = 3 embed_dim = cfg.MODEL.SWIN.EMBED_DIM depths = cfg.MODEL.SWIN.DEPTHS num_heads = cfg.MODEL.SWIN.NUM_HEADS window_size = cfg.MODEL.SWIN.WINDOW_SIZE mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO qkv_bias = cfg.MODEL.SWIN.QKV_BIAS qk_scale = cfg.MODEL.SWIN.QK_SCALE drop_rate = cfg.MODEL.SWIN.DROP_RATE attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE norm_layer = nn.LayerNorm ape = cfg.MODEL.SWIN.APE patch_norm = cfg.MODEL.SWIN.PATCH_NORM use_checkpoint = cfg.MODEL.SWIN.USE_CHECKPOINT super().__init__( pretrain_img_size, patch_size, in_chans, embed_dim, depths, num_heads, window_size, mlp_ratio, qkv_bias, qk_scale, drop_rate, attn_drop_rate, drop_path_rate, norm_layer, ape, patch_norm, use_checkpoint=use_checkpoint, ) self._out_features = cfg.MODEL.SWIN.OUT_FEATURES self._out_feature_strides = { "res2": 4, "res3": 8, "res4": 16, "res5": 32, } self._out_feature_channels = { "res2": self.num_features[0], "res3": self.num_features[1], "res4": self.num_features[2], "res5": self.num_features[3], } def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: names and the corresponding features """ assert ( x.dim() == 4 ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!" outputs = {} y = super().forward(x) for k in y.keys(): if k in self._out_features: outputs[k] = y[k] return outputs def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } @property def size_divisibility(self): return 32 ================================================ FILE: mfvis_nococo/mask2former/modeling/criterion.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py """ MaskFormer criterion. 
""" import logging import torch import torch.nn.functional as F from torch import nn from detectron2.utils.comm import get_world_size from detectron2.projects.point_rend.point_features import ( get_uncertain_point_coords_with_randomness, point_sample, ) from ..utils.misc import is_dist_avail_and_initialized, nested_tensor_from_tensor_list def unfold_wo_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) # remove the center pixels size = kernel_size ** 2 unfolded_x = torch.cat(( unfolded_x[:, :, :size // 2], unfolded_x[:, :, size // 2 + 1:] ), dim=2) return unfolded_x def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation): assert mask_logits.dim() == 4 log_fg_prob = F.logsigmoid(mask_logits) log_bg_prob = F.logsigmoid(-mask_logits) log_fg_prob_unfold = unfold_wo_center( log_fg_prob, kernel_size=pairwise_size, dilation=pairwise_dilation ) log_bg_prob_unfold = unfold_wo_center( log_bg_prob, kernel_size=pairwise_size, dilation=pairwise_dilation ) # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j) # we compute the the probability in log space to avoid numerical instability log_same_fg_prob = log_fg_prob[:, :, None] + log_fg_prob_unfold log_same_bg_prob = log_bg_prob[:, :, None] + log_bg_prob_unfold max_ = torch.max(log_same_fg_prob, log_same_bg_prob) log_same_prob = torch.log( torch.exp(log_same_fg_prob - max_) + torch.exp(log_same_bg_prob - max_) ) + max_ # loss = -log(prob) return -log_same_prob[:, 0] def get_incoherent_mask(input_masks, sfact): mask = input_masks.float() w = input_masks.shape[-1] h = input_masks.shape[-2] mask_small = F.interpolate(mask, (h//sfact, w//sfact), mode='bilinear') mask_recover = F.interpolate(mask_small, (h, w), mode='bilinear') mask_uncertain = (mask - mask_recover).abs() mask_uncertain = (mask_uncertain > 0.01).float() return mask_uncertain def dice_coefficient(x, target): eps = 1e-5 n_inst = x.size(0) x = x.reshape(n_inst, -1) target = target.reshape(n_inst, -1) intersection = (x * target).sum(dim=1) union = (x ** 2.0).sum(dim=1) + (target ** 2.0).sum(dim=1) + eps loss = 1. - (2 * intersection / union) return loss def compute_project_term(mask_scores, gt_bitmasks): mask_losses_y = dice_coefficient( mask_scores.max(dim=2, keepdim=True)[0], gt_bitmasks.max(dim=2, keepdim=True)[0] ) mask_losses_x = dice_coefficient( mask_scores.max(dim=3, keepdim=True)[0], gt_bitmasks.max(dim=3, keepdim=True)[0] ) return (mask_losses_x + mask_losses_y).mean() def dice_loss( inputs: torch.Tensor, targets: torch.Tensor, num_masks: float, ): """ Compute the DICE loss, similar to generalized IOU for masks Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). 
""" inputs = inputs.sigmoid() inputs = inputs.flatten(1) numerator = 2 * (inputs * targets).sum(-1) denominator = inputs.sum(-1) + targets.sum(-1) loss = 1 - (numerator + 1) / (denominator + 1) return loss.sum() / num_masks dice_loss_jit = torch.jit.script( dice_loss ) # type: torch.jit.ScriptModule def sigmoid_ce_loss( inputs: torch.Tensor, targets: torch.Tensor, num_masks: float, ): """ Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). Returns: Loss tensor """ loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") return loss.mean(1).sum() / num_masks sigmoid_ce_loss_jit = torch.jit.script( sigmoid_ce_loss ) # type: torch.jit.ScriptModule def calculate_uncertainty(logits): """ We estimate uncerainty as L1 distance between 0.0 and the logit prediction in 'logits' for the foreground class in `classes`. Args: logits (Tensor): A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is the number of foreground classes. The values are logits. Returns: scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most uncertain locations having the highest uncertainty score. """ assert logits.shape[1] == 1 gt_class_logits = logits.clone() return -(torch.abs(gt_class_logits)) class SetCriterion(nn.Module): """This class computes the loss for DETR. The process happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) """ def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses, num_points, oversample_ratio, importance_sample_ratio): """Create the criterion. Parameters: num_classes: number of object categories, omitting the special no-object category matcher: module able to compute a matching between targets and proposals weight_dict: dict containing as key the names of the losses and as values their relative weight. eos_coef: relative classification weight applied to the no-object category losses: list of all the losses to be applied. See get_loss for list of available losses. 
""" super().__init__() self.num_classes = num_classes self.matcher = matcher self.weight_dict = weight_dict self.eos_coef = eos_coef self.losses = losses empty_weight = torch.ones(self.num_classes + 1) empty_weight[-1] = self.eos_coef self.register_buffer("empty_weight", empty_weight) # pointwise mask loss parameters self.num_points = num_points self.oversample_ratio = oversample_ratio self.importance_sample_ratio = importance_sample_ratio self.laplacian_kernel = torch.tensor([-1, -1, -1, -1, 8, -1, -1, -1, -1], dtype=torch.float32).reshape(1, 1, 3, 3).requires_grad_(False) self.register_buffer("_iter", torch.zeros([1])) self._warmup_iters = 1000 #20000 def loss_labels(self, outputs, targets, indices, num_masks): """Classification loss (NLL) targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] """ assert "pred_logits" in outputs src_logits = outputs["pred_logits"].float() idx = self._get_src_permutation_idx(indices) target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) target_classes = torch.full( src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device ) target_classes[idx] = target_classes_o loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) losses = {"loss_ce": loss_ce} return losses def loss_masks_proj(self, outputs, targets, indices, num_masks, images_lab_sim): assert "pred_masks" in outputs self._iter += 1 src_idx = self._get_src_permutation_idx(indices) tgt_idx = self._get_tgt_permutation_idx(indices) src_masks = outputs["pred_masks"] src_masks = src_masks[src_idx] masks = [t["masks"] for t in targets] # TODO use valid to mask invalid areas due to padding in loss target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() target_masks = target_masks.to(src_masks) target_masks = target_masks[tgt_idx] if len(src_idx[0].tolist()) > 0: images_lab_sim = torch.cat([images_lab_sim[ind] for ind in src_idx[0].tolist()]) # No need to upsample predictions as we are using normalized coordinates :) # N x 1 x H x W src_masks = src_masks[:, None] target_masks = target_masks[:, None] target_masks = F.interpolate(target_masks, (src_masks.shape[-2], src_masks.shape[-1]), mode='bilinear') # print('src masks shape:', src_masks.shape) # print('target masks shape:', target_masks.shape) if src_masks.shape[0] > 0: loss_prj_term = compute_project_term(src_masks.sigmoid(), target_masks) # print('src_masks shape before:', src_masks.shape) pairwise_losses = compute_pairwise_term( src_masks, 3, 2 ) inc_mask = get_incoherent_mask(src_masks.detach().sigmoid() > 0.5, 2) #* images_lab_sim).bool() inc_mask = F.conv2d(inc_mask, self.laplacian_kernel.to(inc_mask.device), padding=1).abs() inc_mask = (inc_mask > 0.5).float() weights = (images_lab_sim >= 0.3).float() * target_masks.float() #* inc_mask loss_pairwise = ((pairwise_losses * weights).sum() / weights.sum().clamp(min=1.0)) * 0.25 warmup_factor = min(self._iter.item() / float(self._warmup_iters), 1.0) loss_pairwise = loss_pairwise * warmup_factor #* 0. else: loss_prj_term = src_masks.sum() * 0. loss_pairwise = src_masks.sum() * 0. losses = { "loss_mask": loss_prj_term, "loss_bound": loss_pairwise, } del src_masks del target_masks return losses def loss_masks(self, outputs, targets, indices, num_masks): """Compute the losses related to the masks: the focal loss and the dice loss. 
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] """ assert "pred_masks" in outputs src_idx = self._get_src_permutation_idx(indices) tgt_idx = self._get_tgt_permutation_idx(indices) src_masks = outputs["pred_masks"] src_masks = src_masks[src_idx] masks = [t["masks"] for t in targets] # TODO use valid to mask invalid areas due to padding in loss target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() target_masks = target_masks.to(src_masks) target_masks = target_masks[tgt_idx] # No need to upsample predictions as we are using normalized coordinates :) # N x 1 x H x W src_masks = src_masks[:, None] target_masks = target_masks[:, None] with torch.no_grad(): # sample point_coords point_coords = get_uncertain_point_coords_with_randomness( src_masks, lambda logits: calculate_uncertainty(logits), self.num_points, self.oversample_ratio, self.importance_sample_ratio, ) # get gt labels point_labels = point_sample( target_masks, point_coords, align_corners=False, ).squeeze(1) point_logits = point_sample( src_masks, point_coords, align_corners=False, ).squeeze(1) losses = { "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks), "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks), } del src_masks del target_masks return losses def _get_src_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) src_idx = torch.cat([src for (src, _) in indices]) return batch_idx, src_idx def _get_tgt_permutation_idx(self, indices): # permute targets following indices batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) tgt_idx = torch.cat([tgt for (_, tgt) in indices]) return batch_idx, tgt_idx def get_loss(self, loss, outputs, targets, indices, num_masks, images_lab_sim): loss_map = { 'labels': self.loss_labels, 'masks': self.loss_masks_proj, } assert loss in loss_map, f"do you really want to compute {loss} loss?" if loss == 'masks': return loss_map[loss](outputs, targets, indices, num_masks, images_lab_sim) else: return loss_map[loss](outputs, targets, indices, num_masks) def forward(self, outputs, targets, images_lab_sim): """This performs the loss computation. Parameters: outputs: dict of tensors, see the output specification of the model for the format targets: list of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the losses applied, see each loss' doc """ outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) # Compute the average number of target boxes accross all nodes, for normalization purposes num_masks = sum(len(t["labels"]) for t in targets) num_masks = torch.as_tensor( [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device ) if is_dist_avail_and_initialized(): torch.distributed.all_reduce(num_masks) num_masks = torch.clamp(num_masks / get_world_size(), min=1).item() # Compute all the requested losses losses = {} for loss in self.losses: losses.update(self.get_loss(loss, outputs, targets, indices, num_masks, images_lab_sim)) # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
if "aux_outputs" in outputs: for i, aux_outputs in enumerate(outputs["aux_outputs"]): indices = self.matcher(aux_outputs, targets) for loss in self.losses: l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks, images_lab_sim) l_dict = {k + f"_{i}": v for k, v in l_dict.items()} losses.update(l_dict) return losses def __repr__(self): head = "Criterion " + self.__class__.__name__ body = [ "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)), "losses: {}".format(self.losses), "weight_dict: {}".format(self.weight_dict), "num_classes: {}".format(self.num_classes), "eos_coef: {}".format(self.eos_coef), "num_points: {}".format(self.num_points), "oversample_ratio: {}".format(self.oversample_ratio), "importance_sample_ratio: {}".format(self.importance_sample_ratio), ] _repr_indent = 4 lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: mfvis_nococo/mask2former/modeling/matcher.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py """ Modules to compute the matching cost and solve the corresponding LSAP. """ import torch import torch.nn.functional as F from scipy.optimize import linear_sum_assignment from torch import nn from torch.cuda.amp import autocast from detectron2.projects.point_rend.point_features import point_sample from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, generalized_multi_box_iou def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): """ Compute the DICE loss, similar to generalized IOU for masks Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). """ inputs = inputs #.sigmoid() inputs = inputs.flatten(1) numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] loss = 1 - (numerator + 1) / (denominator + 1) return loss batch_dice_loss_jit = torch.jit.script( batch_dice_loss ) # type: torch.jit.ScriptModule def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): """ Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). Returns: Loss tensor """ hw = inputs.shape[1] pos = F.binary_cross_entropy( inputs, torch.ones_like(inputs), reduction="none" ) neg = F.binary_cross_entropy( inputs, torch.zeros_like(inputs), reduction="none" ) loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( "nc,mc->nm", neg, (1 - targets) ) return loss / hw batch_sigmoid_ce_loss_jit = torch.jit.script( batch_sigmoid_ce_loss ) # type: torch.jit.ScriptModule def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: """ Compute the bounding boxes around the provided masks. Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. Args: masks (Tensor[N, H, W]): masks to transform where N is the number of masks and (H, W) are the spatial dimensions. 
Returns: Tensor[N, 4]: bounding boxes """ if masks.numel() == 0: return masks n = masks.shape[0] for index, mask in enumerate(masks): y, x = torch.where(mask != 0) if len(x) * len(y) == 0: continue h = torch.max(y) - torch.min(y) w = torch.max(x) - torch.min(x) masks[index, torch.min(y):torch.max(y), torch.min(x):torch.max(x)] = 1.0 return masks def masks_to_boxes_cc(masks: torch.Tensor) -> torch.Tensor: """ Compute the bounding boxes around the provided masks. Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. Args: masks (Tensor[N, H, W]): masks to transform where N is the number of masks and (H, W) are the spatial dimensions. Returns: Tensor[N, 4]: bounding boxes """ if masks.numel() == 0: return torch.zeros((0, 4), device=masks.device, dtype=torch.float) n = masks.shape[0] h = masks.shape[1] w = masks.shape[2] bounding_boxes = torch.zeros((n, 4), device=masks.device, dtype=torch.float) for index, mask in enumerate(masks): y, x = torch.where(mask != 0) if len(x) * len(y) == 0: continue bounding_boxes[index, 0] = torch.min(x) / float(w) bounding_boxes[index, 1] = torch.min(y) / float(h) bounding_boxes[index, 2] = torch.max(x) / float(w) bounding_boxes[index, 3] = torch.max(y) / float(h) return bounding_boxes class HungarianMatcher(nn.Module): """This class computes an assignment between the targets and the predictions of the network For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are un-matched (and thus treated as non-objects). """ def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0): """Creates the matcher Params: cost_class: This is the relative weight of the classification error in the matching cost cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost """ super().__init__() self.cost_class = cost_class self.cost_mask = cost_mask self.cost_dice = cost_dice self.cost_giou = 2.0 self.cost_bbox = 5.0 assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0" self.num_points = num_points @torch.no_grad() def memory_efficient_forward(self, outputs, targets): """More memory-friendly matching""" bs, num_queries = outputs["pred_logits"].shape[:2] indices = [] # Iterate through batch size for b in range(bs): out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] tgt_ids = targets[b]["labels"] # Compute the classification cost. Contrary to the loss, we don't use the NLL, # but approximate it in 1 - proba[target class]. # The 1 is a constant that doesn't change the matching, it can be ommitted. 
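            # NOTE: in this mask-free (box-level) setting, the classification cost below is combined with
            # box costs instead of Mask2Former's sampled mask/dice costs: boxes are derived from the
            # thresholded predicted masks and the ground-truth masks via masks_to_boxes_cc, and matched
            # with an L1 cost (self.cost_bbox = 5.0) and a GIoU cost (self.cost_giou = 2.0).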
cost_class = -out_prob[:, tgt_ids] out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred] out_mask_box = masks_to_boxes_cc((out_mask.sigmoid() > 0.5).float()) # gt masks are already padded when preparing target tgt_mask = targets[b]["masks"].to(out_mask) tgt_mask_box = masks_to_boxes_cc(tgt_mask) # print('tgt_mask_box shape:', tgt_mask_box.shape) with autocast(enabled=False): cost_bbox = torch.cdist(out_mask_box, tgt_mask_box) cost_giou = -generalized_box_iou(out_mask_box, tgt_mask_box) if torch.isnan(cost_bbox).any(): print('cost_bbox:', cost_bbox) if torch.isnan(cost_giou).any(): print('cost_giou:', cost_giou) C = ( self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou ) C = C.reshape(num_queries, -1).cpu() indices.append(linear_sum_assignment(C)) return [ (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices ] @torch.no_grad() def forward(self, outputs, targets): """Performs the matching Params: outputs: This is a dict that contains at least these entries: "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth objects in the target) containing the class labels "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks Returns: A list of size batch_size, containing tuples of (index_i, index_j) where: - index_i is the indices of the selected predictions (in order) - index_j is the indices of the corresponding selected targets (in order) For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) """ return self.memory_efficient_forward(outputs, targets) def __repr__(self, _repr_indent=4): head = "Matcher " + self.__class__.__name__ body = [ "cost_class: {}".format(self.cost_class), "cost_mask: {}".format(self.cost_mask), "cost_dice: {}".format(self.cost_dice), ] lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: mfvis_nococo/mask2former/modeling/meta_arch/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
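# NOTE: this package init is kept minimal; MaskFormerHead and the per-pixel baselines register
# themselves via the SEM_SEG_HEADS_REGISTRY decorators in their own modules and are re-exported
# from mask2former/modeling/__init__.py.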
================================================ FILE: mfvis_nococo/mask2former/modeling/meta_arch/__init__.py.new ================================================ ================================================ FILE: mfvis_nococo/mask2former/modeling/meta_arch/mask_former_head.py ================================================ import logging from copy import deepcopy from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init from torch import nn from torch.nn import functional as F from detectron2.config import configurable from detectron2.layers import Conv2d, ShapeSpec, get_norm from detectron2.modeling import SEM_SEG_HEADS_REGISTRY from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder from ..pixel_decoder.fpn import build_pixel_decoder @SEM_SEG_HEADS_REGISTRY.register() class MaskFormerHead(nn.Module): _version = 2 def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k ''' if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): newk = k.replace(prefix, prefix + "pixel_decoder.") # logger.debug(f"{k} ==> {newk}") ''' if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, num_classes: int, pixel_decoder: nn.Module, loss_weight: float = 1.0, ignore_value: int = -1, # extra parameters transformer_predictor: nn.Module, transformer_in_feature: str, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features num_classes: number of classes to predict pixel_decoder: the pixel decoder module loss_weight: loss weight ignore_value: category id to be ignored during training. 
transformer_predictor: the transformer decoder that makes prediction transformer_in_feature: input feature name to the transformer_predictor """ super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] feature_strides = [v.stride for k, v in input_shape] feature_channels = [v.channels for k, v in input_shape] self.ignore_value = ignore_value self.common_stride = 4 self.loss_weight = loss_weight self.pixel_decoder = pixel_decoder self.predictor = transformer_predictor self.transformer_in_feature = transformer_in_feature self.num_classes = num_classes @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): # figure out in_channels to transformer predictor if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM else: transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels return { "input_shape": { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES }, "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, "pixel_decoder": build_pixel_decoder(cfg, input_shape), "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, "transformer_predictor": build_transformer_decoder( cfg, transformer_predictor_in_channels, mask_classification=True, ), } def forward(self, features, mask=None): return self.layers(features, mask) def layers(self, features, mask=None): mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) if self.transformer_in_feature == "multi_scale_pixel_decoder": predictions = self.predictor(multi_scale_features, mask_features, mask) else: if self.transformer_in_feature == "transformer_encoder": assert ( transformer_encoder_features is not None ), "Please use the TransformerEncoderPixelDecoder." 
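                # "transformer_encoder" mode: the decoder attends over the pixel decoder's transformer
                # encoder output, while mask_features still provides the per-pixel embeddings used to
                # produce the mask predictions.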
predictions = self.predictor(transformer_encoder_features, mask_features, mask) elif self.transformer_in_feature == "pixel_embedding": predictions = self.predictor(mask_features, mask_features, mask) else: predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) return predictions ================================================ FILE: mfvis_nococo/mask2former/modeling/meta_arch/per_pixel_baseline.py ================================================ import logging from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init from torch import nn from torch.nn import functional as F from detectron2.config import configurable from detectron2.layers import Conv2d, ShapeSpec, get_norm from detectron2.modeling import SEM_SEG_HEADS_REGISTRY from ..transformer_decoder.maskformer_transformer_decoder import StandardTransformerDecoder from ..pixel_decoder.fpn import build_pixel_decoder @SEM_SEG_HEADS_REGISTRY.register() class PerPixelBaselineHead(nn.Module): _version = 2 def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: logger = logging.getLogger(__name__) # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): newk = k.replace(prefix, prefix + "pixel_decoder.") # logger.warning(f"{k} ==> {newk}") if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, num_classes: int, pixel_decoder: nn.Module, loss_weight: float = 1.0, ignore_value: int = -1, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features num_classes: number of classes to predict pixel_decoder: the pixel decoder module loss_weight: loss weight ignore_value: category id to be ignored during training. 
""" super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] feature_strides = [v.stride for k, v in input_shape] feature_channels = [v.channels for k, v in input_shape] self.ignore_value = ignore_value self.common_stride = 4 self.loss_weight = loss_weight self.pixel_decoder = pixel_decoder self.predictor = Conv2d( self.pixel_decoder.mask_dim, num_classes, kernel_size=1, stride=1, padding=0 ) weight_init.c2_msra_fill(self.predictor) @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): return { "input_shape": { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES }, "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, "pixel_decoder": build_pixel_decoder(cfg, input_shape), "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, } def forward(self, features, targets=None): """ Returns: In training, returns (None, dict of losses) In inference, returns (CxHxW logits, {}) """ x = self.layers(features) if self.training: return None, self.losses(x, targets) else: x = F.interpolate( x, scale_factor=self.common_stride, mode="bilinear", align_corners=False ) return x, {} def layers(self, features): x, _, _ = self.pixel_decoder.forward_features(features) x = self.predictor(x) return x def losses(self, predictions, targets): predictions = predictions.float() # https://github.com/pytorch/pytorch/issues/48163 predictions = F.interpolate( predictions, scale_factor=self.common_stride, mode="bilinear", align_corners=False ) loss = F.cross_entropy( predictions, targets, reduction="mean", ignore_index=self.ignore_value ) losses = {"loss_sem_seg": loss * self.loss_weight} return losses @SEM_SEG_HEADS_REGISTRY.register() class PerPixelBaselinePlusHead(PerPixelBaselineHead): def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): newk = k.replace(prefix, prefix + "pixel_decoder.") logger.debug(f"{k} ==> {newk}") if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, # extra parameters transformer_predictor: nn.Module, transformer_in_feature: str, deep_supervision: bool, # inherit parameters num_classes: int, pixel_decoder: nn.Module, loss_weight: float = 1.0, ignore_value: int = -1, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features transformer_predictor: the transformer decoder that makes prediction transformer_in_feature: input feature name to the transformer_predictor deep_supervision: whether or not to add supervision to the output of every transformer decoder layer num_classes: number of classes to predict pixel_decoder: the pixel decoder module loss_weight: loss weight ignore_value: category id to be ignored during training. 
""" super().__init__( input_shape, num_classes=num_classes, pixel_decoder=pixel_decoder, loss_weight=loss_weight, ignore_value=ignore_value, ) del self.predictor self.predictor = transformer_predictor self.transformer_in_feature = transformer_in_feature self.deep_supervision = deep_supervision @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = super().from_config(cfg, input_shape) ret["transformer_in_feature"] = cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM else: in_channels = input_shape[ret["transformer_in_feature"]].channels ret["transformer_predictor"] = StandardTransformerDecoder( cfg, in_channels, mask_classification=False ) ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION return ret def forward(self, features, targets=None): """ Returns: In training, returns (None, dict of losses) In inference, returns (CxHxW logits, {}) """ x, aux_outputs = self.layers(features) if self.training: if self.deep_supervision: losses = self.losses(x, targets) for i, aux_output in enumerate(aux_outputs): losses["loss_sem_seg" + f"_{i}"] = self.losses( aux_output["pred_masks"], targets )["loss_sem_seg"] return None, losses else: return None, self.losses(x, targets) else: x = F.interpolate( x, scale_factor=self.common_stride, mode="bilinear", align_corners=False ) return x, {} def layers(self, features): mask_features, transformer_encoder_features, _ = self.pixel_decoder.forward_features(features) if self.transformer_in_feature == "transformer_encoder": assert ( transformer_encoder_features is not None ), "Please use the TransformerEncoderPixelDecoder." predictions = self.predictor(transformer_encoder_features, mask_features) else: predictions = self.predictor(features[self.transformer_in_feature], mask_features) if self.deep_supervision: return predictions["pred_masks"], predictions["aux_outputs"] else: return predictions["pred_masks"], None ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/__init__.py.new ================================================ ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/fpn.py ================================================ import logging import numpy as np from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ from torch.cuda.amp import autocast from detectron2.config import configurable from detectron2.layers import Conv2d, DeformConv, ShapeSpec, get_norm from detectron2.modeling import SEM_SEG_HEADS_REGISTRY from ..transformer_decoder.position_encoding import PositionEmbeddingSine from ..transformer_decoder.transformer import TransformerEncoder, TransformerEncoderLayer, _get_clones, _get_activation_fn def build_pixel_decoder(cfg, input_shape): """ Build a pixel decoder from `cfg.MODEL.MASK_FORMER.PIXEL_DECODER_NAME`. 
""" name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape) forward_features = getattr(model, "forward_features", None) if not callable(forward_features): raise ValueError( "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. " f"Please implement forward_features for {name} to only return mask features." ) return model # This is a modified FPN decoder. @SEM_SEG_HEADS_REGISTRY.register() class BasePixelDecoder(nn.Module): @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, conv_dim: int, mask_dim: int, norm: Optional[Union[str, Callable]] = None, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features conv_dims: number of output channels for the intermediate conv layers. mask_dim: number of output channels for the final conv layer. norm (str or callable): normalization for all conv layers """ super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" feature_channels = [v.channels for k, v in input_shape] lateral_convs = [] output_convs = [] use_bias = norm == "" for idx, in_channels in enumerate(feature_channels): if idx == len(self.in_features) - 1: output_norm = get_norm(norm, conv_dim) output_conv = Conv2d( in_channels, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(output_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(None) output_convs.append(output_conv) else: lateral_norm = get_norm(norm, conv_dim) output_norm = get_norm(norm, conv_dim) lateral_conv = Conv2d( in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm ) output_conv = Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(lateral_conv) weight_init.c2_xavier_fill(output_conv) self.add_module("adapter_{}".format(idx + 1), lateral_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(lateral_conv) output_convs.append(output_conv) # Place convs into top-down order (from low to high resolution) # to make the top-down computation in forward clearer. 
self.lateral_convs = lateral_convs[::-1] self.output_convs = output_convs[::-1] self.mask_dim = mask_dim self.mask_features = Conv2d( conv_dim, mask_dim, kernel_size=3, stride=1, padding=1, ) weight_init.c2_xavier_fill(self.mask_features) self.maskformer_num_feature_levels = 3 # always use 3 scales @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = {} ret["input_shape"] = { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES } ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM return ret def forward_features(self, features): multi_scale_features = [] num_cur_levels = 0 # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.in_features[::-1]): x = features[f] lateral_conv = self.lateral_convs[idx] output_conv = self.output_convs[idx] if lateral_conv is None: y = output_conv(x) else: cur_fpn = lateral_conv(x) # Following FPN implementation, we use nearest upsampling here y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest") y = output_conv(y) if num_cur_levels < self.maskformer_num_feature_levels: multi_scale_features.append(y) num_cur_levels += 1 return self.mask_features(y), None, multi_scale_features def forward(self, features, targets=None): logger = logging.getLogger(__name__) logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.") return self.forward_features(features) class TransformerEncoderOnly(nn.Module): def __init__( self, d_model=512, nhead=8, num_encoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, ): super().__init__() encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) encoder_norm = nn.LayerNorm(d_model) if normalize_before else None self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) self._reset_parameters() self.d_model = d_model self.nhead = nhead def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def forward(self, src, mask, pos_embed): # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape src = src.flatten(2).permute(2, 0, 1) pos_embed = pos_embed.flatten(2).permute(2, 0, 1) if mask is not None: mask = mask.flatten(1) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) return memory.permute(1, 2, 0).view(bs, c, h, w) # This is a modified FPN decoder with extra Transformer encoder that processes the lowest-resolution feature map. @SEM_SEG_HEADS_REGISTRY.register() class TransformerEncoderPixelDecoder(BasePixelDecoder): @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, transformer_dropout: float, transformer_nheads: int, transformer_dim_feedforward: int, transformer_enc_layers: int, transformer_pre_norm: bool, conv_dim: int, mask_dim: int, norm: Optional[Union[str, Callable]] = None, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features transformer_dropout: dropout probability in transformer transformer_nheads: number of heads in transformer transformer_dim_feedforward: dimension of feedforward network transformer_enc_layers: number of transformer encoder layers transformer_pre_norm: whether to use pre-layernorm or not conv_dims: number of output channels for the intermediate conv layers. mask_dim: number of output channels for the final conv layer. 
norm (str or callable): normalization for all conv layers """ super().__init__(input_shape, conv_dim=conv_dim, mask_dim=mask_dim, norm=norm) input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" feature_strides = [v.stride for k, v in input_shape] feature_channels = [v.channels for k, v in input_shape] in_channels = feature_channels[len(self.in_features) - 1] self.input_proj = Conv2d(in_channels, conv_dim, kernel_size=1) weight_init.c2_xavier_fill(self.input_proj) self.transformer = TransformerEncoderOnly( d_model=conv_dim, dropout=transformer_dropout, nhead=transformer_nheads, dim_feedforward=transformer_dim_feedforward, num_encoder_layers=transformer_enc_layers, normalize_before=transformer_pre_norm, ) N_steps = conv_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) # update layer use_bias = norm == "" output_norm = get_norm(norm, conv_dim) output_conv = Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(output_conv) delattr(self, "layer_{}".format(len(self.in_features))) self.add_module("layer_{}".format(len(self.in_features)), output_conv) self.output_convs[0] = output_conv @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = super().from_config(cfg, input_shape) ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD ret[ "transformer_enc_layers" ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS # a separate config ret["transformer_pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM return ret def forward_features(self, features): multi_scale_features = [] num_cur_levels = 0 # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.in_features[::-1]): x = features[f] lateral_conv = self.lateral_convs[idx] output_conv = self.output_convs[idx] if lateral_conv is None: transformer = self.input_proj(x) pos = self.pe_layer(x) transformer = self.transformer(transformer, None, pos) y = output_conv(transformer) # save intermediate feature as input to Transformer decoder transformer_encoder_features = transformer else: cur_fpn = lateral_conv(x) # Following FPN implementation, we use nearest upsampling here y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest") y = output_conv(y) if num_cur_levels < self.maskformer_num_feature_levels: multi_scale_features.append(y) num_cur_levels += 1 return self.mask_features(y), transformer_encoder_features, multi_scale_features def forward(self, features, targets=None): logger = logging.getLogger(__name__) logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.") return self.forward_features(features) ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/msdeformattn.py ================================================ import logging import numpy as np from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ from torch.cuda.amp import autocast from detectron2.config import configurable from detectron2.layers import Conv2d, ShapeSpec, get_norm from detectron2.modeling 
import SEM_SEG_HEADS_REGISTRY from ..transformer_decoder.position_encoding import PositionEmbeddingSine from ..transformer_decoder.transformer import _get_clones, _get_activation_fn from .ops.modules import MSDeformAttn # MSDeformAttn Transformer encoder in deformable detr class MSDeformAttnTransformerEncoderOnly(nn.Module): def __init__(self, d_model=256, nhead=8, num_encoder_layers=6, dim_feedforward=1024, dropout=0.1, activation="relu", num_feature_levels=4, enc_n_points=4, ): super().__init__() self.d_model = d_model self.nhead = nhead encoder_layer = MSDeformAttnTransformerEncoderLayer(d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, enc_n_points) self.encoder = MSDeformAttnTransformerEncoder(encoder_layer, num_encoder_layers) self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, MSDeformAttn): m._reset_parameters() normal_(self.level_embed) def get_valid_ratio(self, mask): _, H, W = mask.shape valid_H = torch.sum(~mask[:, :, 0], 1) valid_W = torch.sum(~mask[:, 0, :], 1) valid_ratio_h = valid_H.float() / H valid_ratio_w = valid_W.float() / W valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) return valid_ratio def forward(self, srcs, pos_embeds): masks = [torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) for x in srcs] # prepare input for encoder src_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): bs, c, h, w = src.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) src = src.flatten(2).transpose(1, 2) mask = mask.flatten(1) pos_embed = pos_embed.flatten(2).transpose(1, 2) lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) lvl_pos_embed_flatten.append(lvl_pos_embed) src_flatten.append(src) mask_flatten.append(mask) src_flatten = torch.cat(src_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) # encoder memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten) return memory, spatial_shapes, level_start_index class MSDeformAttnTransformerEncoderLayer(nn.Module): def __init__(self, d_model=256, d_ffn=1024, dropout=0.1, activation="relu", n_levels=4, n_heads=8, n_points=4): super().__init__() # self attention self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm(d_model) # ffn self.linear1 = nn.Linear(d_model, d_ffn) self.activation = _get_activation_fn(activation) self.dropout2 = nn.Dropout(dropout) self.linear2 = nn.Linear(d_ffn, d_model) self.dropout3 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm(d_model) @staticmethod def with_pos_embed(tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, src): src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) src = src + self.dropout3(src2) src = self.norm2(src) return src def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, 
padding_mask=None): # self attention src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask) src = src + self.dropout1(src2) src = self.norm1(src) # ffn src = self.forward_ffn(src) return src class MSDeformAttnTransformerEncoder(nn.Module): def __init__(self, encoder_layer, num_layers): super().__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers @staticmethod def get_reference_points(spatial_shapes, valid_ratios, device): reference_points_list = [] for lvl, (H_, W_) in enumerate(spatial_shapes): ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_) ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) ref = torch.stack((ref_x, ref_y), -1) reference_points_list.append(ref) reference_points = torch.cat(reference_points_list, 1) reference_points = reference_points[:, :, None] * valid_ratios[:, None] return reference_points def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None): output = src reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device) for _, layer in enumerate(self.layers): output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask) return output @SEM_SEG_HEADS_REGISTRY.register() class MSDeformAttnPixelDecoder(nn.Module): @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, transformer_dropout: float, transformer_nheads: int, transformer_dim_feedforward: int, transformer_enc_layers: int, conv_dim: int, mask_dim: int, norm: Optional[Union[str, Callable]] = None, # deformable transformer encoder args transformer_in_features: List[str], common_stride: int, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features transformer_dropout: dropout probability in transformer transformer_nheads: number of heads in transformer transformer_dim_feedforward: dimension of feedforward network transformer_enc_layers: number of transformer encoder layers conv_dims: number of output channels for the intermediate conv layers. mask_dim: number of output channels for the final conv layer. 
norm (str or callable): normalization for all conv layers """ super().__init__() transformer_input_shape = { k: v for k, v in input_shape.items() if k in transformer_in_features } # this is the input shape of pixel decoder input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" self.feature_strides = [v.stride for k, v in input_shape] self.feature_channels = [v.channels for k, v in input_shape] # this is the input shape of transformer encoder (could use less features than pixel decoder transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: x[1].stride) self.transformer_in_features = [k for k, v in transformer_input_shape] # starting from "res2" to "res5" transformer_in_channels = [v.channels for k, v in transformer_input_shape] self.transformer_feature_strides = [v.stride for k, v in transformer_input_shape] # to decide extra FPN layers self.transformer_num_feature_levels = len(self.transformer_in_features) if self.transformer_num_feature_levels > 1: input_proj_list = [] # from low resolution to high resolution (res5 -> res2) for in_channels in transformer_in_channels[::-1]: input_proj_list.append(nn.Sequential( nn.Conv2d(in_channels, conv_dim, kernel_size=1), nn.GroupNorm(32, conv_dim), )) self.input_proj = nn.ModuleList(input_proj_list) else: self.input_proj = nn.ModuleList([ nn.Sequential( nn.Conv2d(transformer_in_channels[-1], conv_dim, kernel_size=1), nn.GroupNorm(32, conv_dim), )]) for proj in self.input_proj: nn.init.xavier_uniform_(proj[0].weight, gain=1) nn.init.constant_(proj[0].bias, 0) self.transformer = MSDeformAttnTransformerEncoderOnly( d_model=conv_dim, dropout=transformer_dropout, nhead=transformer_nheads, dim_feedforward=transformer_dim_feedforward, num_encoder_layers=transformer_enc_layers, num_feature_levels=self.transformer_num_feature_levels, ) N_steps = conv_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) self.mask_dim = mask_dim # use 1x1 conv instead self.mask_features = Conv2d( conv_dim, mask_dim, kernel_size=1, stride=1, padding=0, ) weight_init.c2_xavier_fill(self.mask_features) self.maskformer_num_feature_levels = 3 # always use 3 scales self.common_stride = common_stride # extra fpn levels stride = min(self.transformer_feature_strides) self.num_fpn_levels = int(np.log2(stride) - np.log2(self.common_stride)) lateral_convs = [] output_convs = [] use_bias = norm == "" for idx, in_channels in enumerate(self.feature_channels[:self.num_fpn_levels]): lateral_norm = get_norm(norm, conv_dim) output_norm = get_norm(norm, conv_dim) lateral_conv = Conv2d( in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm ) output_conv = Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(lateral_conv) weight_init.c2_xavier_fill(output_conv) self.add_module("adapter_{}".format(idx + 1), lateral_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(lateral_conv) output_convs.append(output_conv) # Place convs into top-down order (from low to high resolution) # to make the top-down computation in forward clearer. 
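        # (Here only the first `self.num_fpn_levels` entries of self.in_features -- the
        # high-resolution maps whose stride is below the deformable encoder's feature levels --
        # received lateral/output convs above; forward_features walks them in the same reversed order.)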
self.lateral_convs = lateral_convs[::-1] self.output_convs = output_convs[::-1] @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = {} ret["input_shape"] = { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES } ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS # ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD ret["transformer_dim_feedforward"] = 1024 # use 1024 for deformable transformer encoder ret[ "transformer_enc_layers" ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS # a separate config ret["transformer_in_features"] = cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES ret["common_stride"] = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE return ret @autocast(enabled=False) def forward_features(self, features): srcs = [] pos = [] # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.transformer_in_features[::-1]): x = features[f].float() # deformable detr does not support half precision srcs.append(self.input_proj[idx](x)) pos.append(self.pe_layer(x)) y, spatial_shapes, level_start_index = self.transformer(srcs, pos) bs = y.shape[0] split_size_or_sections = [None] * self.transformer_num_feature_levels for i in range(self.transformer_num_feature_levels): if i < self.transformer_num_feature_levels - 1: split_size_or_sections[i] = level_start_index[i + 1] - level_start_index[i] else: split_size_or_sections[i] = y.shape[1] - level_start_index[i] y = torch.split(y, split_size_or_sections, dim=1) out = [] multi_scale_features = [] num_cur_levels = 0 for i, z in enumerate(y): out.append(z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], spatial_shapes[i][1])) # append `out` with extra FPN levels # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]): x = features[f].float() lateral_conv = self.lateral_convs[idx] output_conv = self.output_convs[idx] cur_fpn = lateral_conv(x) # Following FPN implementation, we use nearest upsampling here y = cur_fpn + F.interpolate(out[-1], size=cur_fpn.shape[-2:], mode="bilinear", align_corners=False) y = output_conv(y) out.append(y) for o in out: if num_cur_levels < self.maskformer_num_feature_levels: multi_scale_features.append(o) num_cur_levels += 1 return self.mask_features(out[-1]), out[0], multi_scale_features ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/functions/__init__.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. 
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from .ms_deform_attn_func import MSDeformAttnFunction ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function from __future__ import division import torch import torch.nn.functional as F from torch.autograd import Function from torch.autograd.function import once_differentiable try: import MultiScaleDeformableAttention as MSDA except ModuleNotFoundError as e: info_string = ( "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" "\t`cd mask2former/modeling/pixel_decoder/ops`\n" "\t`sh make.sh`\n" ) raise ModuleNotFoundError(info_string) class MSDeformAttnFunction(Function): @staticmethod def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): ctx.im2col_step = im2col_step output = MSDA.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable def backward(ctx, grad_output): value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors grad_value, grad_sampling_loc, grad_attn_weight = \ MSDA.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): # for debug and test only, # need to use cuda version instead N_, S_, M_, D_ = value.shape _, Lq_, M_, L_, P_, _ = sampling_locations.shape value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for lid_, (H_, W_) in enumerate(value_spatial_shapes): # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 
2).flatten(0, 1) # N_*M_, D_, Lq_, P_ sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False) sampling_value_list.append(sampling_value_l_) # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) return output.transpose(1, 2).contiguous() ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/make.sh ================================================ #!/usr/bin/env bash # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR python setup.py build install ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/modules/__init__.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from .ms_deform_attn import MSDeformAttn ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. 
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function from __future__ import division import warnings import math import torch from torch import nn import torch.nn.functional as F from torch.nn.init import xavier_uniform_, constant_ from ..functions import MSDeformAttnFunction from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) return (n & (n-1) == 0) and n != 0 class MSDeformAttn(nn.Module): def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): """ Multi-Scale Deformable Attention Module :param d_model hidden dimension :param n_levels number of feature levels :param n_heads number of attention heads :param n_points number of sampling points per attention head per feature level """ super().__init__() if d_model % n_heads != 0: raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) _d_per_head = d_model // n_heads # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation if not _is_power_of_2(_d_per_head): warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " "which is more efficient in our CUDA implementation.") self.im2col_step = 128 self.d_model = d_model self.n_levels = n_levels self.n_heads = n_heads self.n_points = n_points self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) self.value_proj = nn.Linear(d_model, d_model) self.output_proj = nn.Linear(d_model, d_model) self._reset_parameters() def _reset_parameters(self): constant_(self.sampling_offsets.weight.data, 0.) thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) for i in range(self.n_points): grid_init[:, :, i, :] *= i + 1 with torch.no_grad(): self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) constant_(self.attention_weights.weight.data, 0.) constant_(self.attention_weights.bias.data, 0.) xavier_uniform_(self.value_proj.weight.data) constant_(self.value_proj.bias.data, 0.) xavier_uniform_(self.output_proj.weight.data) constant_(self.output_proj.bias.data, 0.) 
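    # Illustrative usage sketch (not part of the original module; the tensor sizes are made up
    # and only follow the shape contract documented in forward() below). Kept as comments so the
    # class body is unchanged. Importing this module already requires the CUDA op built via
    # ops/make.sh; if MSDeformAttnFunction.apply raises (e.g. on CPU tensors), forward() falls
    # back to the pure-PyTorch ms_deform_attn_core_pytorch path.
    #
    #   attn = MSDeformAttn(d_model=256, n_levels=4, n_heads=8, n_points=4)
    #   shapes = torch.as_tensor([[64, 64], [32, 32], [16, 16], [8, 8]], dtype=torch.long)
    #   starts = torch.cat((shapes.new_zeros(1), shapes.prod(1).cumsum(0)[:-1]))
    #   value = torch.rand(2, int(shapes.prod(1).sum()), 256)   # (N, sum_l H_l*W_l, C)
    #   query = torch.rand(2, 100, 256)                         # (N, Len_q, C)
    #   ref_pts = torch.rand(2, 100, 4, 2)                      # (N, Len_q, n_levels, 2) in [0, 1]
    #   out = attn(query, ref_pts, value, shapes, starts)       # -> (N, Len_q, C)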
def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): """ :param query (N, Length_{query}, C) :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements :return output (N, Length_{query}, C) """ N, Len_q, _ = query.shape N, Len_in, _ = input_flatten.shape assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in value = self.value_proj(input_flatten) if input_padding_mask is not None: value = value.masked_fill(input_padding_mask[..., None], float(0)) value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) # N, Len_q, n_heads, n_levels, n_points, 2 if reference_points.shape[-1] == 2: offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) sampling_locations = reference_points[:, :, None, :, None, :] \ + sampling_offsets / offset_normalizer[None, None, None, :, None, :] elif reference_points.shape[-1] == 4: sampling_locations = reference_points[:, :, None, :, None, :2] \ + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 else: raise ValueError( 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) try: output = MSDeformAttnFunction.apply( value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) except: # CPU output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) # # For FLOPs calculation only # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) output = self.output_proj(output) return output ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/setup.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. 
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR import os import glob import torch from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CppExtension from torch.utils.cpp_extension import CUDAExtension from setuptools import find_packages from setuptools import setup requirements = ["torch", "torchvision"] def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "src") main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) sources = main_file + source_cpu extension = CppExtension extra_compile_args = {"cxx": []} define_macros = [] # Force cuda since torch ask for a device, not if cuda is in fact available. if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: extension = CUDAExtension sources += source_cuda define_macros += [("WITH_CUDA", None)] extra_compile_args["nvcc"] = [ "-DCUDA_HAS_FP16=1", "-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__", "-D__CUDA_NO_HALF2_OPERATORS__", ] else: if CUDA_HOME is None: raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') else: raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') sources = [os.path.join(extensions_dir, s) for s in sources] include_dirs = [extensions_dir] ext_modules = [ extension( "MultiScaleDeformableAttention", sources, include_dirs=include_dirs, define_macros=define_macros, extra_compile_args=extra_compile_args, ) ] return ext_modules setup( name="MultiScaleDeformableAttention", version="1.0", author="Weijie Su", url="https://github.com/fundamentalvision/Deformable-DETR", description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", packages=find_packages(exclude=("configs", "tests",)), ext_modules=get_extensions(), cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, ) ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. 
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/

#include <vector>

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>


at::Tensor
ms_deform_attn_cpu_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    AT_ERROR("Not implemented on cpu");
}

std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{
    AT_ERROR("Not implemented on cpu");
}


================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h ================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/

#pragma once
#include <torch/extension.h>

at::Tensor
ms_deform_attn_cpu_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step);

std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step);


================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu ================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include #include "cuda/ms_deform_im2col_cuda.cuh" #include #include #include #include at::Tensor ms_deform_attn_cuda_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step) { AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); const int batch = value.size(0); const int spatial_size = value.size(1); const int num_heads = value.size(2); const int channels = value.size(3); const int num_levels = spatial_shapes.size(0); const int num_query = sampling_loc.size(1); const int num_point = sampling_loc.size(4); const int im2col_step_ = std::min(batch, im2col_step); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); const int batch_n = im2col_step_; auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); auto per_value_size = spatial_size * num_heads * channels; auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; for (int n = 0; n < batch/im2col_step_; ++n) { auto columns = output_n.select(0, n); AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), value.data() + n * im2col_step_ * per_value_size, spatial_shapes.data(), level_start_index.data(), sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, attn_weight.data() + n * im2col_step_ * per_attn_weight_size, batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, columns.data()); })); } output = output.view({batch, num_query, num_heads*channels}); return output; } std::vector ms_deform_attn_cuda_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step) { AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); const int batch = value.size(0); const int spatial_size = value.size(1); const int num_heads = value.size(2); const int channels = value.size(3); const int num_levels = spatial_shapes.size(0); const int num_query = sampling_loc.size(1); const int num_point = sampling_loc.size(4); const int im2col_step_ = std::min(batch, im2col_step); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); auto grad_value = at::zeros_like(value); auto grad_sampling_loc = at::zeros_like(sampling_loc); auto grad_attn_weight = at::zeros_like(attn_weight); const int batch_n = im2col_step_; auto per_value_size = spatial_size * num_heads * channels; auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); for (int n = 0; n < batch/im2col_step_; ++n) { auto grad_output_g = grad_output_n.select(0, n); AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), grad_output_g.data(), value.data() + n * im2col_step_ * per_value_size, spatial_shapes.data(), level_start_index.data(), sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, attn_weight.data() + n * im2col_step_ * per_attn_weight_size, batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value.data() + n * im2col_step_ * per_value_size, grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); })); } return { grad_value, grad_sampling_loc, grad_attn_weight }; } ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. 
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #pragma once #include at::Tensor ms_deform_attn_cuda_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step); std::vector ms_deform_attn_cuda_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step); ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh ================================================ /*! ************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************** * Modified from DCN (https://github.com/msracver/Deformable-ConvNets) * Copyright (c) 2018 Microsoft ************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include #include #include #include #include #include #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } template __device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; } const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t* 
&grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value+ptr1, w1*top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value+ptr2, w2*top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value+ptr3, w3*top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value+ptr4, w4*top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); *grad_attn_weight = top_grad * val; *grad_sampling_loc = width * grad_w_weight * top_grad_value; *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; } template __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t* &grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value+ptr1, w1*top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; 
atomicAdd(grad_value+ptr2, w2*top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value+ptr3, w3*top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value+ptr4, w4*top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); atomicAdd(grad_attn_weight, top_grad * val); atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); } template __global__ void ms_deformable_im2col_gpu_kernel(const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *data_col) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; scalar_t *data_col_ptr = data_col + index; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; scalar_t col = 0; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; } data_weight_ptr += 1; data_loc_w_ptr += 2; } } *data_col_ptr = col; } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int 
sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; int sid=2; for (unsigned int tid = 1; tid < blockSize; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = 
data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockSize/2; s>0; s>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int 
grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; int sid=2; for (unsigned int tid = 1; tid < blockDim.x; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = 
data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int 
data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * 
qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, grad_sampling_loc, grad_attn_weight); } data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template void ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t* data_value, const int64_t* data_spatial_shapes, const int64_t* data_level_start_index, const scalar_t* data_sampling_loc, const scalar_t* data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t* data_col) { const int num_kernels = batch_size * num_query * num_heads * channels; const int num_actual_kernels = batch_size * num_query * num_heads * channels; const int num_threads = CUDA_NUM_THREADS; ms_deformable_im2col_gpu_kernel <<>>( num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); } } template void ms_deformable_col2im_cuda(cudaStream_t stream, const scalar_t* grad_col, const scalar_t* data_value, const int64_t * data_spatial_shapes, const int64_t * data_level_start_index, const scalar_t * data_sampling_loc, const scalar_t * data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t* grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; const int num_kernels = batch_size * num_query * num_heads * channels; const int num_actual_kernels = batch_size * num_query * num_heads * channels; if (channels > 1024) { if ((channels & 1023) == 0) { ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } else { ms_deformable_col2im_gpu_kernel_gm <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, 
grad_attn_weight); } } else{ switch(channels) { case 1: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 2: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 4: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 8: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 16: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 32: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 64: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 128: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 256: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 512: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 1024: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, 
data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; default: if (channels < 64) { ms_deformable_col2im_gpu_kernel_shm_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } else { ms_deformable_col2im_gpu_kernel_shm_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } } } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); } } ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #pragma once #include "cpu/ms_deform_attn_cpu.h" #ifdef WITH_CUDA #include "cuda/ms_deform_attn_cuda.h" #endif at::Tensor ms_deform_attn_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step) { if (value.type().is_cuda()) { #ifdef WITH_CUDA return ms_deform_attn_cuda_forward( value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } std::vector ms_deform_attn_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step) { if (value.type().is_cuda()) { #ifdef WITH_CUDA return ms_deform_attn_cuda_backward( value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/vision.cpp ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. 
* Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include "ms_deform_attn.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); } ================================================ FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/test.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function from __future__ import division import time import torch import torch.nn as nn from torch.autograd import gradcheck from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch N, M, D = 1, 2, 2 Lq, L, P = 2, 2, 2 shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) S = sum([(H*W).item() for H, W in shapes]) torch.manual_seed(3) @torch.no_grad() def check_forward_equal_with_pytorch_double(): value = torch.rand(N, S, M, D).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') @torch.no_grad() def check_forward_equal_with_pytorch_float(): value = torch.rand(N, S, M, D).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() output_cuda = MSDeformAttnFunction.apply(value, shapes, 
level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): value = torch.rand(N, S, M, channels).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 func = MSDeformAttnFunction.apply value.requires_grad = grad_value sampling_locations.requires_grad = grad_sampling_loc attention_weights.requires_grad = grad_attn_weight gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) print(f'* {gradok} check_gradient_numerical(D={channels})') if __name__ == '__main__': check_forward_equal_with_pytorch_double() check_forward_equal_with_pytorch_float() for channels in [30, 32, 64, 71, 1025, 2048, 3096]: check_gradient_numerical(channels, True, True, True) ================================================ FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/__init__.py ================================================ from .maskformer_transformer_decoder import StandardTransformerDecoder from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder ================================================ FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py ================================================ # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py import logging import fvcore.nn.weight_init as weight_init from typing import Optional import torch from torch import nn, Tensor from torch.nn import functional as F from detectron2.config import configurable from detectron2.layers import Conv2d from .position_encoding import PositionEmbeddingSine from .maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY class SelfAttentionLayer(nn.Module): def __init__(self, d_model, nhead, dropout=0.0, activation="relu", normalize_before=False): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.norm = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): q = k = self.with_pos_embed(tgt, query_pos) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.norm(tgt) q = k = self.with_pos_embed(tgt2, query_pos) tgt2 = 
self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): if self.normalize_before: return self.forward_pre(tgt, tgt_mask, tgt_key_padding_mask, query_pos) return self.forward_post(tgt, tgt_mask, tgt_key_padding_mask, query_pos) class CrossAttentionLayer(nn.Module): def __init__(self, d_model, nhead, dropout=0.0, activation="relu", normalize_before=False): super().__init__() self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.norm = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.norm(tgt) tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): if self.normalize_before: return self.forward_pre(tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos) return self.forward_post(tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos) class FFNLayer(nn.Module): def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, activation="relu", normalize_before=False): super().__init__() # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm = nn.LayerNorm(d_model) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt): tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt): tgt2 = self.norm(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt): if self.normalize_before: return self.forward_pre(tgt) return self.forward_post(tgt) def _get_activation_fn(activation): """Return an activation function given a string""" if 
activation == "relu": return F.relu if activation == "gelu": return F.gelu if activation == "glu": return F.glu raise RuntimeError(F"activation should be relu/gelu, not {activation}.") class MLP(nn.Module): """ Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) def forward(self, x): for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x @TRANSFORMER_DECODER_REGISTRY.register() class MultiScaleMaskedTransformerDecoder(nn.Module): _version = 2 def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k if "static_query" in k: newk = k.replace("static_query", "query_feat") if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, in_channels, mask_classification=True, *, num_classes: int, hidden_dim: int, num_queries: int, nheads: int, dim_feedforward: int, dec_layers: int, pre_norm: bool, mask_dim: int, enforce_input_project: bool, ): """ NOTE: this interface is experimental. Args: in_channels: channels of the input features mask_classification: whether to add mask classifier or not num_classes: number of classes hidden_dim: Transformer feature dimension num_queries: number of queries nheads: number of heads dim_feedforward: feature dimension in feedforward network enc_layers: number of Transformer encoder layers dec_layers: number of Transformer decoder layers pre_norm: whether to use pre-LayerNorm or not mask_dim: mask feature dimension enforce_input_project: add input project 1x1 conv even if input channels and hidden dim is identical """ super().__init__() assert mask_classification, "Only support mask classification model" self.mask_classification = mask_classification # positional encoding N_steps = hidden_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) # define Transformer decoder here self.num_heads = nheads self.num_layers = dec_layers self.transformer_self_attention_layers = nn.ModuleList() self.transformer_cross_attention_layers = nn.ModuleList() self.transformer_ffn_layers = nn.ModuleList() for _ in range(self.num_layers): self.transformer_self_attention_layers.append( SelfAttentionLayer( d_model=hidden_dim, nhead=nheads, dropout=0.0, normalize_before=pre_norm, ) ) self.transformer_cross_attention_layers.append( CrossAttentionLayer( d_model=hidden_dim, nhead=nheads, dropout=0.0, normalize_before=pre_norm, ) ) self.transformer_ffn_layers.append( FFNLayer( d_model=hidden_dim, dim_feedforward=dim_feedforward, dropout=0.0, normalize_before=pre_norm, ) ) self.decoder_norm = nn.LayerNorm(hidden_dim) self.num_queries = num_queries # learnable query features self.query_feat = nn.Embedding(num_queries, hidden_dim) # learnable query p.e. 
self.query_embed = nn.Embedding(num_queries, hidden_dim) # level embedding (we always use 3 scales) self.num_feature_levels = 3 self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim) self.input_proj = nn.ModuleList() for _ in range(self.num_feature_levels): if in_channels != hidden_dim or enforce_input_project: self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1)) weight_init.c2_xavier_fill(self.input_proj[-1]) else: self.input_proj.append(nn.Sequential()) # output FFNs if self.mask_classification: self.class_embed = nn.Linear(hidden_dim, num_classes + 1) self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) @classmethod def from_config(cls, cfg, in_channels, mask_classification): ret = {} ret["in_channels"] = in_channels ret["mask_classification"] = mask_classification ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES # Transformer parameters: ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD # NOTE: because we add learnable query features which requires supervision, # we add minus 1 to decoder layers to be consistent with our loss # implementation: that is, number of auxiliary losses is always # equal to number of decoder layers. With learnable query features, the number of # auxiliary losses equals number of decoders plus 1. assert cfg.MODEL.MASK_FORMER.DEC_LAYERS >= 1 ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS - 1 ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM return ret def forward(self, x, mask_features, mask = None): # x is a list of multi-scale feature assert len(x) == self.num_feature_levels src = [] pos = [] size_list = [] # disable mask, it does not affect performance del mask for i in range(self.num_feature_levels): size_list.append(x[i].shape[-2:]) pos.append(self.pe_layer(x[i], None).flatten(2)) src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None]) # flatten NxCxHxW to HWxNxC pos[-1] = pos[-1].permute(2, 0, 1) src[-1] = src[-1].permute(2, 0, 1) _, bs, _ = src[0].shape # QxNxC query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) # query_embed = None # print('come here==========') output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1) predictions_class = [] predictions_mask = [] # prediction heads on learnable query features outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0]) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) for i in range(self.num_layers): level_index = i % self.num_feature_levels attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False # attention: cross-attention first output = self.transformer_cross_attention_layers[i]( output, src[level_index], memory_mask=attn_mask, memory_key_padding_mask=None, # here we do not apply masking on padded region pos=pos[level_index], query_pos=query_embed ) output = self.transformer_self_attention_layers[i]( output, tgt_mask=None, tgt_key_padding_mask=None, query_pos=query_embed ) # FFN output = self.transformer_ffn_layers[i]( output ) outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels]) 
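# predictions from every decoder layer (plus the initial ones computed from the learnable
# queries before the loop) are collected so that _set_aux_loss can attach deep supervision
# to each intermediate layer; only the last entry is returned as the main prediction.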
predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) assert len(predictions_class) == self.num_layers + 1 # print('len mask predictions:', len(predictions_mask)) out = { 'pred_logits': predictions_class[-1], 'pred_masks': predictions_mask[-1], 'aux_outputs': self._set_aux_loss( predictions_class if self.mask_classification else None, predictions_mask ) } return out def forward_prediction_heads(self, output, mask_features, attn_mask_target_size): decoder_output = self.decoder_norm(output) decoder_output = decoder_output.transpose(0, 1) outputs_class = self.class_embed(decoder_output) mask_embed = self.mask_embed(decoder_output) outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) # NOTE: prediction is of higher-resolution # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW] attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False) # must use bool type # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged. attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool() attn_mask = attn_mask.detach() return outputs_class, outputs_mask, attn_mask @torch.jit.unused def _set_aux_loss(self, outputs_class, outputs_seg_masks): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. if self.mask_classification: return [ {"pred_logits": a, "pred_masks": b} for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) ] else: return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] ================================================ FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py ================================================ # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from detectron2.config import configurable from detectron2.layers import Conv2d from detectron2.utils.registry import Registry from .position_encoding import PositionEmbeddingSine from .transformer import Transformer TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") TRANSFORMER_DECODER_REGISTRY.__doc__ = """ Registry for transformer module in MaskFormer. """ def build_transformer_decoder(cfg, in_channels, mask_classification=True): """ Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. """ name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) @TRANSFORMER_DECODER_REGISTRY.register() class StandardTransformerDecoder(nn.Module): @configurable def __init__( self, in_channels, mask_classification=True, *, num_classes: int, hidden_dim: int, num_queries: int, nheads: int, dropout: float, dim_feedforward: int, enc_layers: int, dec_layers: int, pre_norm: bool, deep_supervision: bool, mask_dim: int, enforce_input_project: bool, ): """ NOTE: this interface is experimental. 
Args: in_channels: channels of the input features mask_classification: whether to add mask classifier or not num_classes: number of classes hidden_dim: Transformer feature dimension num_queries: number of queries nheads: number of heads dropout: dropout in Transformer dim_feedforward: feature dimension in feedforward network enc_layers: number of Transformer encoder layers dec_layers: number of Transformer decoder layers pre_norm: whether to use pre-LayerNorm or not deep_supervision: whether to add supervision to every decoder layers mask_dim: mask feature dimension enforce_input_project: add input project 1x1 conv even if input channels and hidden dim is identical """ super().__init__() self.mask_classification = mask_classification # positional encoding N_steps = hidden_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) transformer = Transformer( d_model=hidden_dim, dropout=dropout, nhead=nheads, dim_feedforward=dim_feedforward, num_encoder_layers=enc_layers, num_decoder_layers=dec_layers, normalize_before=pre_norm, return_intermediate_dec=deep_supervision, ) self.num_queries = num_queries self.transformer = transformer hidden_dim = transformer.d_model self.query_embed = nn.Embedding(num_queries, hidden_dim) if in_channels != hidden_dim or enforce_input_project: self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) weight_init.c2_xavier_fill(self.input_proj) else: self.input_proj = nn.Sequential() self.aux_loss = deep_supervision # output FFNs if self.mask_classification: self.class_embed = nn.Linear(hidden_dim, num_classes + 1) self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) @classmethod def from_config(cls, cfg, in_channels, mask_classification): ret = {} ret["in_channels"] = in_channels ret["mask_classification"] = mask_classification ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES # Transformer parameters: ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM return ret def forward(self, x, mask_features, mask=None): if mask is not None: mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] pos = self.pe_layer(x, mask) src = x hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) if self.mask_classification: outputs_class = self.class_embed(hs) out = {"pred_logits": outputs_class[-1]} else: out = {} if self.aux_loss: # [l, bs, queries, embed] mask_embed = self.mask_embed(hs) outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) out["pred_masks"] = outputs_seg_masks[-1] out["aux_outputs"] = self._set_aux_loss( outputs_class if self.mask_classification else None, outputs_seg_masks ) else: # FIXME h_boxes takes the last one computed, keep this in mind # [bs, queries, embed] mask_embed = self.mask_embed(hs[-1]) outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) out["pred_masks"] = outputs_seg_masks return out @torch.jit.unused def _set_aux_loss(self, outputs_class, 
outputs_seg_masks): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. if self.mask_classification: return [ {"pred_logits": a, "pred_masks": b} for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) ] else: return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] class MLP(nn.Module): """Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.ModuleList( nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) ) def forward(self, x): for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x ================================================ FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/position_encoding.py ================================================ # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py """ Various positional encodings for the transformer. """ import math import torch from torch import nn class PositionEmbeddingSine(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. """ def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): super().__init__() self.num_pos_feats = num_pos_feats self.temperature = temperature self.normalize = normalize if scale is not None and normalize is False: raise ValueError("normalize should be True if scale is passed") if scale is None: scale = 2 * math.pi self.scale = scale def forward(self, x, mask=None): if mask is None: mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) not_mask = ~mask y_embed = not_mask.cumsum(1, dtype=torch.float32) x_embed = not_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) pos_x = x_embed[:, :, :, None] / dim_t pos_y = y_embed[:, :, :, None] / dim_t pos_x = torch.stack( (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 ).flatten(3) pos_y = torch.stack( (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 ).flatten(3) pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) return pos def __repr__(self, _repr_indent=4): head = "Positional encoding " + self.__class__.__name__ body = [ "num_pos_feats: {}".format(self.num_pos_feats), "temperature: {}".format(self.temperature), "normalize: {}".format(self.normalize), "scale: {}".format(self.scale), ] # _repr_indent = 4 lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/transformer.py ================================================ # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/transformer.py """ Transformer class. 
Copy-paste from torch.nn.Transformer with modifications: * positional encodings are passed in MHattention * extra LN at the end of encoder is removed * decoder returns a stack of activations from all decoding layers """ import copy from typing import List, Optional import torch import torch.nn.functional as F from torch import Tensor, nn class Transformer(nn.Module): def __init__( self, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, return_intermediate_dec=False, ): super().__init__() encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) encoder_norm = nn.LayerNorm(d_model) if normalize_before else None self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) decoder_layer = TransformerDecoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) decoder_norm = nn.LayerNorm(d_model) self.decoder = TransformerDecoder( decoder_layer, num_decoder_layers, decoder_norm, return_intermediate=return_intermediate_dec, ) self._reset_parameters() self.d_model = d_model self.nhead = nhead def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def forward(self, src, mask, query_embed, pos_embed): # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape src = src.flatten(2).permute(2, 0, 1) pos_embed = pos_embed.flatten(2).permute(2, 0, 1) query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) if mask is not None: mask = mask.flatten(1) tgt = torch.zeros_like(query_embed) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) hs = self.decoder( tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed ) return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) class TransformerEncoder(nn.Module): def __init__(self, encoder_layer, num_layers, norm=None): super().__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers self.norm = norm def forward( self, src, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): output = src for layer in self.layers: output = layer( output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos ) if self.norm is not None: output = self.norm(output) return output class TransformerDecoder(nn.Module): def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): super().__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.norm = norm self.return_intermediate = return_intermediate def forward( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): output = tgt intermediate = [] for layer in self.layers: output = layer( output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask, pos=pos, query_pos=query_pos, ) if self.return_intermediate: intermediate.append(self.norm(output)) if self.norm is not None: output = self.norm(output) if self.return_intermediate: intermediate.pop() intermediate.append(output) if self.return_intermediate: return torch.stack(intermediate) return output.unsqueeze(0) class 
TransformerEncoderLayer(nn.Module): def __init__( self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, ): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post( self, src, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): q = k = self.with_pos_embed(src, pos) src2 = self.self_attn( q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask )[0] src = src + self.dropout1(src2) src = self.norm1(src) src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = src + self.dropout2(src2) src = self.norm2(src) return src def forward_pre( self, src, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): src2 = self.norm1(src) q = k = self.with_pos_embed(src2, pos) src2 = self.self_attn( q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask )[0] src = src + self.dropout1(src2) src2 = self.norm2(src) src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) src = src + self.dropout2(src2) return src def forward( self, src, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): if self.normalize_before: return self.forward_pre(src, src_mask, src_key_padding_mask, pos) return self.forward_post(src, src_mask, src_key_padding_mask, pos) class TransformerDecoderLayer(nn.Module): def __init__( self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, ): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.dropout3 = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): q = k = self.with_pos_embed(tgt, query_pos) tgt2 = self.self_attn( q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask )[0] tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) tgt2 = self.multihead_attn( query=self.with_pos_embed(tgt, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, 
key_padding_mask=memory_key_padding_mask, )[0] tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout3(tgt2) tgt = self.norm3(tgt) return tgt def forward_pre( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): tgt2 = self.norm1(tgt) q = k = self.with_pos_embed(tgt2, query_pos) tgt2 = self.self_attn( q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask )[0] tgt = tgt + self.dropout1(tgt2) tgt2 = self.norm2(tgt) tgt2 = self.multihead_attn( query=self.with_pos_embed(tgt2, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask, )[0] tgt = tgt + self.dropout2(tgt2) tgt2 = self.norm3(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) tgt = tgt + self.dropout3(tgt2) return tgt def forward( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): if self.normalize_before: return self.forward_pre( tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos, ) return self.forward_post( tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos, ) def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) def _get_activation_fn(activation): """Return an activation function given a string""" if activation == "relu": return F.relu if activation == "gelu": return F.gelu if activation == "glu": return F.glu raise RuntimeError(f"activation should be relu/gelu, not {activation}.") ================================================ FILE: mfvis_nococo/mask2former/test_time_augmentation.py ================================================ import copy import logging from itertools import count import numpy as np import torch from fvcore.transforms import HFlipTransform from torch import nn from torch.nn.parallel import DistributedDataParallel from detectron2.data.detection_utils import read_image from detectron2.modeling import DatasetMapperTTA __all__ = [ "SemanticSegmentorWithTTA", ] class SemanticSegmentorWithTTA(nn.Module): """ A SemanticSegmentor with test-time augmentation enabled. Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. """ def __init__(self, cfg, model, tta_mapper=None, batch_size=1): """ Args: cfg (CfgNode): model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. tta_mapper (callable): takes a dataset dict and returns a list of augmented versions of the dataset dict. Defaults to `DatasetMapperTTA(cfg)`. batch_size (int): batch the augmented images into this batch size for inference. 
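Example (illustrative):
::
    tta_model = SemanticSegmentorWithTTA(cfg, model)
    outputs = tta_model(batched_inputs)  # same input/output format as SemanticSegmentor.forward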
""" super().__init__() if isinstance(model, DistributedDataParallel): model = model.module self.cfg = cfg.clone() self.model = model if tta_mapper is None: tta_mapper = DatasetMapperTTA(cfg) self.tta_mapper = tta_mapper self.batch_size = batch_size def __call__(self, batched_inputs): """ Same input/output format as :meth:`SemanticSegmentor.forward` """ def _maybe_read_image(dataset_dict): ret = copy.copy(dataset_dict) if "image" not in ret: image = read_image(ret.pop("file_name"), self.model.input_format) image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW ret["image"] = image if "height" not in ret and "width" not in ret: ret["height"] = image.shape[1] ret["width"] = image.shape[2] return ret processed_results = [] for x in batched_inputs: result = self._inference_one_image(_maybe_read_image(x)) processed_results.append(result) return processed_results def _inference_one_image(self, input): """ Args: input (dict): one dataset dict with "image" field being a CHW tensor Returns: dict: one output dict """ orig_shape = (input["height"], input["width"]) augmented_inputs, tfms = self._get_augmented_inputs(input) final_predictions = None count_predictions = 0 for input, tfm in zip(augmented_inputs, tfms): count_predictions += 1 with torch.no_grad(): if final_predictions is None: if any(isinstance(t, HFlipTransform) for t in tfm.transforms): final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) else: final_predictions = self.model([input])[0].pop("sem_seg") else: if any(isinstance(t, HFlipTransform) for t in tfm.transforms): final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) else: final_predictions += self.model([input])[0].pop("sem_seg") final_predictions = final_predictions / count_predictions return {"sem_seg": final_predictions} def _get_augmented_inputs(self, input): augmented_inputs = self.tta_mapper(input) tfms = [x.pop("transforms") for x in augmented_inputs] return augmented_inputs, tfms ================================================ FILE: mfvis_nococo/mask2former/utils/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: mfvis_nococo/mask2former/utils/__init__.py.new ================================================ ================================================ FILE: mfvis_nococo/mask2former/utils/misc.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py """ Misc functions, including distributed helpers. Mostly copy-paste from torchvision references. 
""" from typing import List, Optional import torch import torch.distributed as dist import torchvision from torch import Tensor def _max_by_axis(the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] for sublist in the_list[1:]: for index, item in enumerate(sublist): maxes[index] = max(maxes[index], item) return maxes class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors self.mask = mask def to(self, device): # type: (Device) -> NestedTensor # noqa cast_tensor = self.tensors.to(device) mask = self.mask if mask is not None: assert mask is not None cast_mask = mask.to(device) else: cast_mask = None return NestedTensor(cast_tensor, cast_mask) def decompose(self): return self.tensors, self.mask def __repr__(self): return str(self.tensors) def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): # TODO make this more general if tensor_list[0].ndim == 3: if torchvision._is_tracing(): # nested_tensor_from_tensor_list() does not export well to ONNX # call _onnx_nested_tensor_from_tensor_list() instead return _onnx_nested_tensor_from_tensor_list(tensor_list) # TODO make it support different-sized images max_size = _max_by_axis([list(img.shape) for img in tensor_list]) # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) batch_shape = [len(tensor_list)] + max_size b, c, h, w = batch_shape dtype = tensor_list[0].dtype device = tensor_list[0].device tensor = torch.zeros(batch_shape, dtype=dtype, device=device) mask = torch.ones((b, h, w), dtype=torch.bool, device=device) for img, pad_img, m in zip(tensor_list, tensor, mask): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) m[: img.shape[1], : img.shape[2]] = False else: raise ValueError("not supported") return NestedTensor(tensor, mask) # _onnx_nested_tensor_from_tensor_list() is an implementation of # nested_tensor_from_tensor_list() that is supported by ONNX tracing. @torch.jit.unused def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: max_size = [] for i in range(tensor_list[0].dim()): max_size_i = torch.max( torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) ).to(torch.int64) max_size.append(max_size_i) max_size = tuple(max_size) # work around for # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) # m[: img.shape[1], :img.shape[2]] = False # which is not yet supported in onnx padded_imgs = [] padded_masks = [] for img in tensor_list: padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) padded_imgs.append(padded_img) m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) padded_masks.append(padded_mask.to(torch.bool)) tensor = torch.stack(padded_imgs) mask = torch.stack(padded_masks) return NestedTensor(tensor, mask=mask) def is_dist_avail_and_initialized(): if not dist.is_available(): return False if not dist.is_initialized(): return False return True ================================================ FILE: mfvis_nococo/mask2former_video/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from . 
import modeling # config from .config import add_maskformer2_video_config # models from .video_maskformer_model import VideoMaskFormer # video from .data_video import ( YTVISDatasetMapper, YTVISEvaluator, build_detection_train_loader, build_detection_test_loader, get_detection_dataset_dicts, ) ================================================ FILE: mfvis_nococo/mask2former_video/config.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. from detectron2.config import CfgNode as CN def add_maskformer2_video_config(cfg): # video data # DataLoader cfg.INPUT.SAMPLING_FRAME_NUM = 5 cfg.INPUT.SAMPLING_FRAME_RANGE = 5 cfg.INPUT.SAMPLING_FRAME_SHUFFLE = True cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" ================================================ FILE: mfvis_nococo/mask2former_video/data_video/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper from .build import * from .datasets import * from .ytvis_eval import YTVISEvaluator ================================================ FILE: mfvis_nococo/mask2former_video/data_video/augmentation.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import numpy as np import logging import sys from fvcore.transforms.transform import ( HFlipTransform, NoOpTransform, VFlipTransform, ) from PIL import Image from detectron2.data import transforms as T class ResizeShortestEdge(T.Augmentation): """ Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. """ def __init__( self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 ): """ Args: short_edge_length (list[int]): If ``sample_style=="range"``, a [min, max] interval from which to sample the shortest edge length. If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. max_size (int): maximum allowed longest edge length. sample_style (str): either "range" or "choice". """ super().__init__() assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style self.is_range = ("range" in sample_style) if isinstance(short_edge_length, int): short_edge_length = (short_edge_length, short_edge_length) if self.is_range: assert len(short_edge_length) == 2, ( "short_edge_length must be two values using 'range' sample style." f" Got {short_edge_length}!" 
) self._cnt = 0 self._init(locals()) def get_transform(self, image): if self._cnt % self.clip_frame_cnt == 0: if self.is_range: self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) else: self.size = np.random.choice(self.short_edge_length) if self.size == 0: return NoOpTransform() self._cnt = 0 # avoiding overflow self._cnt += 1 h, w = image.shape[:2] scale = self.size * 1.0 / min(h, w) if h < w: newh, neww = self.size, scale * w else: newh, neww = scale * h, self.size if max(newh, neww) > self.max_size: scale = self.max_size * 1.0 / max(newh, neww) newh = newh * scale neww = neww * scale neww = int(neww + 0.5) newh = int(newh + 0.5) return T.ResizeTransform(h, w, newh, neww, self.interp) class RandomFlip(T.Augmentation): """ Flip the image horizontally or vertically with the given probability. """ def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): """ Args: prob (float): probability of flip. horizontal (boolean): whether to apply horizontal flipping vertical (boolean): whether to apply vertical flipping """ super().__init__() if horizontal and vertical: raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.") if not horizontal and not vertical: raise ValueError("At least one of horiz or vert has to be True!") self._cnt = 0 self._init(locals()) def get_transform(self, image): if self._cnt % self.clip_frame_cnt == 0: self.do = self._rand_range() < self.prob self._cnt = 0 # avoiding overflow self._cnt += 1 h, w = image.shape[:2] if self.do: if self.horizontal: return HFlipTransform(w) elif self.vertical: return VFlipTransform(h) else: return NoOpTransform() def build_augmentation(cfg, is_train): logger = logging.getLogger(__name__) aug_list = [] if is_train: # Crop if cfg.INPUT.CROP.ENABLED: aug_list.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) # Resize min_size = cfg.INPUT.MIN_SIZE_TRAIN max_size = cfg.INPUT.MAX_SIZE_TRAIN sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING ms_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1 aug_list.append(ResizeShortestEdge(min_size, max_size, sample_style, clip_frame_cnt=ms_clip_frame_cnt)) # Flip if cfg.INPUT.RANDOM_FLIP != "none": if cfg.INPUT.RANDOM_FLIP == "flip_by_clip": flip_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM else: flip_clip_frame_cnt = 1 aug_list.append( # NOTE using RandomFlip modified for the support of flip maintenance RandomFlip( horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), vertical=cfg.INPUT.RANDOM_FLIP == "vertical", clip_frame_cnt=flip_clip_frame_cnt, ) ) # Additional augmentations : brightness, contrast, saturation, rotation augmentations = cfg.INPUT.AUGMENTATIONS if "brightness" in augmentations: aug_list.append(T.RandomBrightness(0.9, 1.1)) if "contrast" in augmentations: aug_list.append(T.RandomContrast(0.9, 1.1)) if "saturation" in augmentations: aug_list.append(T.RandomSaturation(0.9, 1.1)) if "rotation" in augmentations: aug_list.append( T.RandomRotation( [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range" ) ) else: # Resize min_size = cfg.INPUT.MIN_SIZE_TEST max_size = cfg.INPUT.MAX_SIZE_TEST sample_style = "choice" aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) return aug_list ================================================ FILE: mfvis_nococo/mask2former_video/data_video/build.py ================================================ # Copyright (c) Facebook, Inc. 
and its affiliates. # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import itertools import logging import torch.utils.data from detectron2.config import CfgNode, configurable from detectron2.data.build import ( build_batch_data_loader, load_proposals_into_dataset, trivial_batch_collator, ) from detectron2.data.catalog import DatasetCatalog from detectron2.data.common import DatasetFromList, MapDataset from detectron2.data.dataset_mapper import DatasetMapper from detectron2.data.samplers import InferenceSampler, TrainingSampler from detectron2.utils.comm import get_world_size def _compute_num_images_per_worker(cfg: CfgNode): num_workers = get_world_size() images_per_batch = cfg.SOLVER.IMS_PER_BATCH assert ( images_per_batch % num_workers == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format( images_per_batch, num_workers ) assert ( images_per_batch >= num_workers ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format( images_per_batch, num_workers ) images_per_worker = images_per_batch // num_workers return images_per_worker def filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names): """ Filter out images with none annotations or only crowd annotations (i.e., images without non-crowd annotations). A common training-time preprocessing on COCO dataset. Args: dataset_dicts (list[dict]): annotations in Detectron2 Dataset format. Returns: list[dict]: the same format, but filtered. """ num_before = len(dataset_dicts) def valid(anns): for ann in anns: if isinstance(ann, list): for instance in ann: if instance.get("iscrowd", 0) == 0: return True else: if ann.get("iscrowd", 0) == 0: return True return False dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])] num_after = len(dataset_dicts) logger = logging.getLogger(__name__) logger.info( "Removed {} images with no usable annotations. {} images left.".format( num_before - num_after, num_after ) ) return dataset_dicts def get_detection_dataset_dicts( dataset_names, filter_empty=True, proposal_files=None ): """ Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation. Args: dataset_names (str or list[str]): a dataset name or a list of dataset names filter_empty (bool): whether to filter out images without instance annotations proposal_files (list[str]): if given, a list of object proposal files that match each dataset in `dataset_names`. Returns: list[dict]: a list of dicts following the standard dataset dict format. 
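Examples:
::
    # illustrative call; assumes the YTVIS splits registered in datasets/builtin.py
    dataset_dicts = get_detection_dataset_dicts("ytvis_2019_train", filter_empty=True)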
""" if isinstance(dataset_names, str): dataset_names = [dataset_names] assert len(dataset_names) dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names] for dataset_name, dicts in zip(dataset_names, dataset_dicts): assert len(dicts), "Dataset '{}' is empty!".format(dataset_name) if proposal_files is not None: assert len(dataset_names) == len(proposal_files) # load precomputed proposals from proposal files dataset_dicts = [ load_proposals_into_dataset(dataset_i_dicts, proposal_file) for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files) ] dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) has_instances = "annotations" in dataset_dicts[0] if filter_empty and has_instances: dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names) assert len(dataset_dicts), "No valid data found in {}.".format(",".join(dataset_names)) return dataset_dicts def _train_loader_from_config(cfg, mapper, *, dataset=None, sampler=None): if dataset is None: dataset = get_detection_dataset_dicts( cfg.DATASETS.TRAIN, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) if mapper is None: mapper = DatasetMapper(cfg, True) if sampler is None: sampler_name = cfg.DATALOADER.SAMPLER_TRAIN logger = logging.getLogger(__name__) logger.info("Using training sampler {}".format(sampler_name)) sampler = TrainingSampler(len(dataset)) return { "dataset": dataset, "sampler": sampler, "mapper": mapper, "total_batch_size": cfg.SOLVER.IMS_PER_BATCH, "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING, "num_workers": cfg.DATALOADER.NUM_WORKERS, } # TODO can allow dataset as an iterable or IterableDataset to make this function more general @configurable(from_config=_train_loader_from_config) def build_detection_train_loader( dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0 ): """ Build a dataloader for object detection with some default features. This interface is experimental. Args: dataset (list or torch.utils.data.Dataset): a list of dataset dicts, or a map-style pytorch dataset. They can be obtained by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. mapper (callable): a callable which takes a sample (dict) from dataset and returns the format to be consumed by the model. When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``. sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces indices to be applied on ``dataset``. Default to :class:`TrainingSampler`, which coordinates a random shuffle sequence across all workers. total_batch_size (int): total batch size across all workers. Batching simply puts data into a list. aspect_ratio_grouping (bool): whether to group images with similar aspect ratio for efficiency. When enabled, it requires each element in dataset be a dict with keys "width" and "height". num_workers (int): number of parallel data loading workers Returns: torch.utils.data.DataLoader: a dataloader. Each output from it is a ``list[mapped_element]`` of length ``total_batch_size / num_workers``, where ``mapped_element`` is produced by the ``mapper``. 
""" if isinstance(dataset, list): dataset = DatasetFromList(dataset, copy=False) if mapper is not None: dataset = MapDataset(dataset, mapper) if sampler is None: sampler = TrainingSampler(len(dataset)) assert isinstance(sampler, torch.utils.data.sampler.Sampler) return build_batch_data_loader( dataset, sampler, total_batch_size, aspect_ratio_grouping=aspect_ratio_grouping, num_workers=num_workers, ) def _test_loader_from_config(cfg, dataset_name, mapper=None): """ Uses the given `dataset_name` argument (instead of the names in cfg), because the standard practice is to evaluate each test set individually (not combining them). """ dataset = get_detection_dataset_dicts( [dataset_name], filter_empty=False, proposal_files=[ cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)] ] if cfg.MODEL.LOAD_PROPOSALS else None, ) if mapper is None: mapper = DatasetMapper(cfg, False) return {"dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS} @configurable(from_config=_test_loader_from_config) def build_detection_test_loader(dataset, *, mapper, num_workers=0): """ Similar to `build_detection_train_loader`, but uses a batch size of 1. This interface is experimental. Args: dataset (list or torch.utils.data.Dataset): a list of dataset dicts, or a map-style pytorch dataset. They can be obtained by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. mapper (callable): a callable which takes a sample (dict) from dataset and returns the format to be consumed by the model. When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. num_workers (int): number of parallel data loading workers Returns: DataLoader: a torch DataLoader, that loads the given detection dataset, with test-time transformation and batching. Examples: :: data_loader = build_detection_test_loader( DatasetRegistry.get("my_test"), mapper=DatasetMapper(...)) # or, instantiate with a CfgNode: data_loader = build_detection_test_loader(cfg, "my_test") """ if isinstance(dataset, list): dataset = DatasetFromList(dataset, copy=False) if mapper is not None: dataset = MapDataset(dataset, mapper) sampler = InferenceSampler(len(dataset)) # Always use 1 image per worker during inference since this is the # standard when reporting inference time in papers. batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False) data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=trivial_batch_collator, ) return data_loader ================================================ FILE: mfvis_nococo/mask2former_video/data_video/dataset_mapper.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import copy import logging import random import numpy as np from typing import List, Union import torch from detectron2.config import configurable from detectron2.structures import ( BitMasks, Boxes, BoxMode, Instances, ) from detectron2.data import detection_utils as utils from detectron2.data import transforms as T from .augmentation import build_augmentation import os __all__ = ["YTVISDatasetMapper", "CocoClipDatasetMapper"] def seed_everything(seed): random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) np.random.seed(seed) torch.manual_seed(seed) # torch.cuda.manual_seed(seed) # torch.backends.cudnn.deterministic = True # torch.backends.cudnn.benchmark = True def filter_empty_instances(instances, by_box=True, by_mask=True, box_threshold=1e-5): """ Filter out empty instances in an `Instances` object. Args: instances (Instances): by_box (bool): whether to filter out instances with empty boxes by_mask (bool): whether to filter out instances with empty masks box_threshold (float): minimum width and height to be considered non-empty Returns: Instances: the filtered instances. """ assert by_box or by_mask r = [] if by_box: r.append(instances.gt_boxes.nonempty(threshold=box_threshold)) if instances.has("gt_masks") and by_mask: r.append(instances.gt_masks.nonempty()) if not r: return instances m = r[0] for x in r[1:]: m = m & x instances.gt_ids[~m] = -1 return instances def _get_dummy_anno(num_classes): return { "iscrowd": 0, "category_id": num_classes, "id": -1, "bbox": np.array([0, 0, 0, 0]), "bbox_mode": BoxMode.XYXY_ABS, "segmentation": [np.array([0.0] * 6)] } def ytvis_annotations_to_instances(annos, image_size): """ Create an :class:`Instances` object used by the models, from instance annotations in the dataset dict. Args: annos (list[dict]): a list of instance annotations in one image, each element for one instance. image_size (tuple): height, width Returns: Instances: It will contain fields "gt_boxes", "gt_classes", "gt_ids", "gt_masks", if they can be obtained from `annos`. This is the format that builtin models expect. """ boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos] target = Instances(image_size) target.gt_boxes = Boxes(boxes) classes = [int(obj["category_id"]) for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes ids = [int(obj["id"]) for obj in annos] ids = torch.tensor(ids, dtype=torch.int64) target.gt_ids = ids if len(annos) and "segmentation" in annos[0]: segms = [obj["segmentation"] for obj in annos] masks = [] for segm in segms: assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( segm.ndim ) # mask array masks.append(segm) # torch.from_numpy does not support array with negative stride. masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks]) ) target.gt_masks = masks return target class YTVISDatasetMapper: """ A callable which takes a dataset dict in YouTube-VIS Dataset format, and map it into a format used by the model. """ @configurable def __init__( self, is_train: bool, *, augmentations: List[Union[T.Augmentation, T.Transform]], image_format: str, use_instance_mask: bool = False, sampling_frame_num: int = 2, sampling_frame_range: int = 5, sampling_frame_shuffle: bool = False, num_classes: int = 40, ): """ NOTE: this interface is experimental. 
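For example, with the default ``sampling_frame_num=2`` and ``sampling_frame_range=5``, one reference frame is drawn at random and the remaining frame is sampled from the up to 10 frames around it (5 on each side), so a reference frame 12 may yield ``selected_idx = [9, 12]``.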
Args: is_train: whether it's used in training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. use_instance_mask: whether to process instance segmentation annotations, if available """ # fmt: off self.is_train = is_train self.augmentations = T.AugmentationList(augmentations) self.image_format = image_format self.use_instance_mask = use_instance_mask self.sampling_frame_num = sampling_frame_num self.sampling_frame_range = sampling_frame_range self.sampling_frame_shuffle = sampling_frame_shuffle self.num_classes = num_classes # fmt: on logger = logging.getLogger(__name__) mode = "training" if is_train else "inference" logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}") seed_everything(29118357) @classmethod def from_config(cls, cfg, is_train: bool = True): augs = build_augmentation(cfg, is_train) sampling_frame_num = cfg.INPUT.SAMPLING_FRAME_NUM sampling_frame_range = cfg.INPUT.SAMPLING_FRAME_RANGE sampling_frame_shuffle = cfg.INPUT.SAMPLING_FRAME_SHUFFLE ret = { "is_train": is_train, "augmentations": augs, "image_format": cfg.INPUT.FORMAT, "use_instance_mask": cfg.MODEL.MASK_ON, "sampling_frame_num": sampling_frame_num, "sampling_frame_range": sampling_frame_range, "sampling_frame_shuffle": sampling_frame_shuffle, "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one video, in YTVIS Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ # TODO consider examining below deepcopy as it costs huge amount of computations. dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below video_length = dataset_dict["length"] if self.is_train: ref_frame = random.randrange(video_length) start_idx = max(0, ref_frame-self.sampling_frame_range) end_idx = min(video_length, ref_frame+self.sampling_frame_range + 1) selected_idx = np.random.choice( np.array(list(range(start_idx, ref_frame)) + list(range(ref_frame+1, end_idx))), self.sampling_frame_num - 1, ) selected_idx = selected_idx.tolist() + [ref_frame] selected_idx = sorted(selected_idx) # print('selected_idx:', selected_idx) if self.sampling_frame_shuffle: random.shuffle(selected_idx) else: selected_idx = range(video_length) video_annos = dataset_dict.pop("annotations", None) file_names = dataset_dict.pop("file_names", None) if self.is_train: _ids = set() for frame_idx in selected_idx: _ids.update([anno["id"] for anno in video_annos[frame_idx]]) ids = dict() for i, _id in enumerate(_ids): ids[_id] = i dataset_dict["image"] = [] dataset_dict["instances"] = [] dataset_dict["file_names"] = [] for frame_idx in selected_idx: dataset_dict["file_names"].append(file_names[frame_idx]) # Read image image = utils.read_image(file_names[frame_idx], format=self.image_format) utils.check_image_size(dataset_dict, image) aug_input = T.AugInput(image) transforms = self.augmentations(aug_input) image = aug_input.image image_shape = image.shape[:2] # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. 
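# The augmented image is HWC (numpy); convert it to a contiguous CHW torch.Tensor before batching.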
dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))) if (video_annos is None) or (not self.is_train): continue # NOTE copy() is to prevent annotations getting changed from applying augmentations _frame_annos = [] for anno in video_annos[frame_idx]: _anno = {} for k, v in anno.items(): _anno[k] = copy.deepcopy(v) _frame_annos.append(_anno) # USER: Implement additional transformations if you have other types of data annos = [ utils.transform_instance_annotations(obj, transforms, image_shape) for obj in _frame_annos if obj.get("iscrowd", 0) == 0 ] sorted_annos = [_get_dummy_anno(self.num_classes) for _ in range(len(ids))] for _anno in annos: idx = ids[_anno["id"]] sorted_annos[idx] = _anno _gt_ids = [_anno["id"] for _anno in sorted_annos] instances = utils.annotations_to_instances(sorted_annos, image_shape, mask_format="bitmask") instances.gt_ids = torch.tensor(_gt_ids) if instances.has("gt_masks"): instances.gt_boxes = instances.gt_masks.get_bounding_boxes() instances = filter_empty_instances(instances) else: instances.gt_masks = BitMasks(torch.empty((0, *image_shape))) dataset_dict["instances"].append(instances) return dataset_dict class CocoClipDatasetMapper: """ A callable which takes a COCO image which converts into multiple frames, and map it into a format used by the model. """ @configurable def __init__( self, is_train: bool, *, augmentations: List[Union[T.Augmentation, T.Transform]], image_format: str, use_instance_mask: bool = False, sampling_frame_num: int = 2, ): """ NOTE: this interface is experimental. Args: is_train: whether it's used in training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. use_instance_mask: whether to process instance segmentation annotations, if available """ # fmt: off self.is_train = is_train self.augmentations = T.AugmentationList(augmentations) self.image_format = image_format self.use_instance_mask = use_instance_mask self.sampling_frame_num = sampling_frame_num # fmt: on logger = logging.getLogger(__name__) mode = "training" if is_train else "inference" logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}") @classmethod def from_config(cls, cfg, is_train: bool = True): augs = build_augmentation(cfg, is_train) sampling_frame_num = cfg.INPUT.SAMPLING_FRAME_NUM ret = { "is_train": is_train, "augmentations": augs, "image_format": cfg.INPUT.FORMAT, "use_instance_mask": cfg.MODEL.MASK_ON, "sampling_frame_num": sampling_frame_num, } return ret def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
Returns: dict: a format that builtin models in detectron2 accept """ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below img_annos = dataset_dict.pop("annotations", None) file_name = dataset_dict.pop("file_name", None) original_image = utils.read_image(file_name, format=self.image_format) dataset_dict["image"] = [] dataset_dict["instances"] = [] dataset_dict["file_names"] = [file_name] * self.sampling_frame_num for _ in range(self.sampling_frame_num): utils.check_image_size(dataset_dict, original_image) aug_input = T.AugInput(original_image) transforms = self.augmentations(aug_input) image = aug_input.image image_shape = image.shape[:2] # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))) if (img_annos is None) or (not self.is_train): continue _img_annos = [] for anno in img_annos: _anno = {} for k, v in anno.items(): _anno[k] = copy.deepcopy(v) _img_annos.append(_anno) # USER: Implement additional transformations if you have other types of data annos = [ utils.transform_instance_annotations(obj, transforms, image_shape) for obj in _img_annos if obj.get("iscrowd", 0) == 0 ] _gt_ids = list(range(len(annos))) for idx in range(len(annos)): if len(annos[idx]["segmentation"]) == 0: annos[idx]["segmentation"] = [np.array([0.0] * 6)] instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask") instances.gt_ids = torch.tensor(_gt_ids) if instances.has("gt_masks"): instances.gt_boxes = instances.gt_masks.get_bounding_boxes() instances = filter_empty_instances(instances) else: instances.gt_masks = BitMasks(torch.empty((0, *image_shape))) dataset_dict["instances"].append(instances) return dataset_dict ================================================ FILE: mfvis_nococo/mask2former_video/data_video/datasets/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC from . import builtin # ensure the builtin datasets are registered __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] ================================================ FILE: mfvis_nococo/mask2former_video/data_video/datasets/builtin.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import os from .ytvis import ( register_ytvis_instances, _get_ytvis_2019_instances_meta, _get_ytvis_2021_instances_meta, ) # ==== Predefined splits for YTVIS 2019 =========== _PREDEFINED_SPLITS_YTVIS_2019 = { "ytvis_2019_train": ("ytvis_2019/train/JPEGImages", "ytvis_2019/train.json"), "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages", "ytvis_2019/valid.json"), "ytvis_2019_test": ("ytvis_2019/test/JPEGImages", "ytvis_2019/test.json"), } # ==== Predefined splits for YTVIS 2021 =========== _PREDEFINED_SPLITS_YTVIS_2021 = { "ytvis_2021_train": ("ytvis_2021/train/JPEGImages", "ytvis_2021/train.json"), "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages", "ytvis_2021/valid.json"), "ytvis_2021_test": ("ytvis_2021/test/JPEGImages", "ytvis_2021/test.json"), } def register_all_ytvis_2019(root): for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): # Assume pre-defined datasets live in `./datasets`. register_ytvis_instances( key, _get_ytvis_2019_instances_meta(), os.path.join(root, json_file) if "://" not in json_file else json_file, os.path.join(root, image_root), ) def register_all_ytvis_2021(root): for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): # Assume pre-defined datasets live in `./datasets`. register_ytvis_instances( key, _get_ytvis_2021_instances_meta(), os.path.join(root, json_file) if "://" not in json_file else json_file, os.path.join(root, image_root), ) if __name__.endswith(".builtin"): # Assume pre-defined datasets live in `./datasets`. _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_ytvis_2019(_root) register_all_ytvis_2021(_root) ================================================ FILE: mfvis_nococo/mask2former_video/data_video/datasets/ytvis.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import contextlib import io import json import logging import numpy as np import os import pycocotools.mask as mask_util from fvcore.common.file_io import PathManager from fvcore.common.timer import Timer from detectron2.structures import Boxes, BoxMode, PolygonMasks from detectron2.data import DatasetCatalog, MetadataCatalog """ This file contains functions to parse YTVIS dataset of COCO-format annotations into dicts in "Detectron2 format". 
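A typical (illustrative) workflow registers a split once and lets detectron2 load it lazily:

    register_ytvis_instances(
        "my_ytvis_train",                         # any unused dataset name
        _get_ytvis_2019_instances_meta(),
        "path/to/train.json",                     # placeholder annotation path
        "path/to/train/JPEGImages",               # placeholder image root
    )
    dicts = DatasetCatalog.get("my_ytvis_train")  # internally calls load_ytvis_json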
""" logger = logging.getLogger(__name__) __all__ = ["load_ytvis_json", "register_ytvis_instances"] YTVIS_CATEGORIES_2019 = [ {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, {"color": [0, 82, 0], "isthing": 1, "id": 2, "name": "giant_panda"}, {"color": [119, 11, 32], "isthing": 1, "id": 3, "name": "lizard"}, {"color": [165, 42, 42], "isthing": 1, "id": 4, "name": "parrot"}, {"color": [134, 134, 103], "isthing": 1, "id": 5, "name": "skateboard"}, {"color": [0, 0, 142], "isthing": 1, "id": 6, "name": "sedan"}, {"color": [255, 109, 65], "isthing": 1, "id": 7, "name": "ape"}, {"color": [0, 226, 252], "isthing": 1, "id": 8, "name": "dog"}, {"color": [5, 121, 0], "isthing": 1, "id": 9, "name": "snake"}, {"color": [0, 60, 100], "isthing": 1, "id": 10, "name": "monkey"}, {"color": [250, 170, 30], "isthing": 1, "id": 11, "name": "hand"}, {"color": [100, 170, 30], "isthing": 1, "id": 12, "name": "rabbit"}, {"color": [179, 0, 194], "isthing": 1, "id": 13, "name": "duck"}, {"color": [255, 77, 255], "isthing": 1, "id": 14, "name": "cat"}, {"color": [120, 166, 157], "isthing": 1, "id": 15, "name": "cow"}, {"color": [73, 77, 174], "isthing": 1, "id": 16, "name": "fish"}, {"color": [0, 80, 100], "isthing": 1, "id": 17, "name": "train"}, {"color": [182, 182, 255], "isthing": 1, "id": 18, "name": "horse"}, {"color": [0, 143, 149], "isthing": 1, "id": 19, "name": "turtle"}, {"color": [174, 57, 255], "isthing": 1, "id": 20, "name": "bear"}, {"color": [0, 0, 230], "isthing": 1, "id": 21, "name": "motorbike"}, {"color": [72, 0, 118], "isthing": 1, "id": 22, "name": "giraffe"}, {"color": [255, 179, 240], "isthing": 1, "id": 23, "name": "leopard"}, {"color": [0, 125, 92], "isthing": 1, "id": 24, "name": "fox"}, {"color": [209, 0, 151], "isthing": 1, "id": 25, "name": "deer"}, {"color": [188, 208, 182], "isthing": 1, "id": 26, "name": "owl"}, {"color": [145, 148, 174], "isthing": 1, "id": 27, "name": "surfboard"}, {"color": [106, 0, 228], "isthing": 1, "id": 28, "name": "airplane"}, {"color": [0, 0, 70], "isthing": 1, "id": 29, "name": "truck"}, {"color": [199, 100, 0], "isthing": 1, "id": 30, "name": "zebra"}, {"color": [166, 196, 102], "isthing": 1, "id": 31, "name": "tiger"}, {"color": [110, 76, 0], "isthing": 1, "id": 32, "name": "elephant"}, {"color": [133, 129, 255], "isthing": 1, "id": 33, "name": "snowboard"}, {"color": [0, 0, 192], "isthing": 1, "id": 34, "name": "boat"}, {"color": [183, 130, 88], "isthing": 1, "id": 35, "name": "shark"}, {"color": [130, 114, 135], "isthing": 1, "id": 36, "name": "mouse"}, {"color": [107, 142, 35], "isthing": 1, "id": 37, "name": "frog"}, {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "eagle"}, {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "earless_seal"}, {"color": [255, 208, 186], "isthing": 1, "id": 40, "name": "tennis_racket"}, ] YTVIS_CATEGORIES_2021 = [ {"color": [106, 0, 228], "isthing": 1, "id": 1, "name": "airplane"}, {"color": [174, 57, 255], "isthing": 1, "id": 2, "name": "bear"}, {"color": [255, 109, 65], "isthing": 1, "id": 3, "name": "bird"}, {"color": [0, 0, 192], "isthing": 1, "id": 4, "name": "boat"}, {"color": [0, 0, 142], "isthing": 1, "id": 5, "name": "car"}, {"color": [255, 77, 255], "isthing": 1, "id": 6, "name": "cat"}, {"color": [120, 166, 157], "isthing": 1, "id": 7, "name": "cow"}, {"color": [209, 0, 151], "isthing": 1, "id": 8, "name": "deer"}, {"color": [0, 226, 252], "isthing": 1, "id": 9, "name": "dog"}, {"color": [179, 0, 194], "isthing": 1, "id": 10, "name": "duck"}, {"color": [174, 255, 243], 
"isthing": 1, "id": 11, "name": "earless_seal"}, {"color": [110, 76, 0], "isthing": 1, "id": 12, "name": "elephant"}, {"color": [73, 77, 174], "isthing": 1, "id": 13, "name": "fish"}, {"color": [250, 170, 30], "isthing": 1, "id": 14, "name": "flying_disc"}, {"color": [0, 125, 92], "isthing": 1, "id": 15, "name": "fox"}, {"color": [107, 142, 35], "isthing": 1, "id": 16, "name": "frog"}, {"color": [0, 82, 0], "isthing": 1, "id": 17, "name": "giant_panda"}, {"color": [72, 0, 118], "isthing": 1, "id": 18, "name": "giraffe"}, {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, {"color": [255, 179, 240], "isthing": 1, "id": 20, "name": "leopard"}, {"color": [119, 11, 32], "isthing": 1, "id": 21, "name": "lizard"}, {"color": [0, 60, 100], "isthing": 1, "id": 22, "name": "monkey"}, {"color": [0, 0, 230], "isthing": 1, "id": 23, "name": "motorbike"}, {"color": [130, 114, 135], "isthing": 1, "id": 24, "name": "mouse"}, {"color": [165, 42, 42], "isthing": 1, "id": 25, "name": "parrot"}, {"color": [220, 20, 60], "isthing": 1, "id": 26, "name": "person"}, {"color": [100, 170, 30], "isthing": 1, "id": 27, "name": "rabbit"}, {"color": [183, 130, 88], "isthing": 1, "id": 28, "name": "shark"}, {"color": [134, 134, 103], "isthing": 1, "id": 29, "name": "skateboard"}, {"color": [5, 121, 0], "isthing": 1, "id": 30, "name": "snake"}, {"color": [133, 129, 255], "isthing": 1, "id": 31, "name": "snowboard"}, {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "squirrel"}, {"color": [145, 148, 174], "isthing": 1, "id": 33, "name": "surfboard"}, {"color": [255, 208, 186], "isthing": 1, "id": 34, "name": "tennis_racket"}, {"color": [166, 196, 102], "isthing": 1, "id": 35, "name": "tiger"}, {"color": [0, 80, 100], "isthing": 1, "id": 36, "name": "train"}, {"color": [0, 0, 70], "isthing": 1, "id": 37, "name": "truck"}, {"color": [0, 143, 149], "isthing": 1, "id": 38, "name": "turtle"}, {"color": [0, 228, 0], "isthing": 1, "id": 39, "name": "whale"}, {"color": [199, 100, 0], "isthing": 1, "id": 40, "name": "zebra"}, ] def _get_ytvis_2019_instances_meta(): thing_ids = [k["id"] for k in YTVIS_CATEGORIES_2019 if k["isthing"] == 1] thing_colors = [k["color"] for k in YTVIS_CATEGORIES_2019 if k["isthing"] == 1] assert len(thing_ids) == 40, len(thing_ids) # Mapping from the incontiguous YTVIS category id to an id in [0, 39] thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} thing_classes = [k["name"] for k in YTVIS_CATEGORIES_2019 if k["isthing"] == 1] ret = { "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes, "thing_colors": thing_colors, } return ret def _get_ytvis_2021_instances_meta(): thing_ids = [k["id"] for k in YTVIS_CATEGORIES_2021 if k["isthing"] == 1] thing_colors = [k["color"] for k in YTVIS_CATEGORIES_2021 if k["isthing"] == 1] assert len(thing_ids) == 40, len(thing_ids) # Mapping from the incontiguous YTVIS category id to an id in [0, 39] thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} thing_classes = [k["name"] for k in YTVIS_CATEGORIES_2021 if k["isthing"] == 1] ret = { "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes, "thing_colors": thing_colors, } return ret def load_ytvis_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None): from .ytvis_api.ytvos import YTVOS timer = Timer() json_file = PathManager.get_local_path(json_file) with contextlib.redirect_stdout(io.StringIO()): ytvis_api = YTVOS(json_file) if 
timer.seconds() > 1: logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) id_map = None if dataset_name is not None: meta = MetadataCatalog.get(dataset_name) cat_ids = sorted(ytvis_api.getCatIds()) cats = ytvis_api.loadCats(cat_ids) # The categories in a custom json file may not be sorted. thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])] meta.thing_classes = thing_classes # In COCO, certain category ids are artificially removed, # and by convention they are always ignored. # We deal with COCO's id issue and translate # the category ids to contiguous ids in [0, 80). # It works by looking at the "categories" field in the json, therefore # if users' own json also have incontiguous ids, we'll # apply this mapping as well but print a warning. if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)): if "coco" not in dataset_name: logger.warning( """ Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you. """ ) id_map = {v: i for i, v in enumerate(cat_ids)} meta.thing_dataset_id_to_contiguous_id = id_map # sort indices for reproducible results vid_ids = sorted(ytvis_api.vids.keys()) # vids is a list of dicts, each looks something like: # {'license': 1, # 'flickr_url': ' ', # 'file_names': ['ff25f55852/00000.jpg', 'ff25f55852/00005.jpg', ..., 'ff25f55852/00175.jpg'], # 'height': 720, # 'width': 1280, # 'length': 36, # 'date_captured': '2019-04-11 00:55:41.903902', # 'id': 2232} vids = ytvis_api.loadVids(vid_ids) anns = [ytvis_api.vidToAnns[vid_id] for vid_id in vid_ids] total_num_valid_anns = sum([len(x) for x in anns]) total_num_anns = len(ytvis_api.anns) if total_num_valid_anns < total_num_anns: logger.warning( f"{json_file} contains {total_num_anns} annotations, but only " f"{total_num_valid_anns} of them match to images in the file." 
) vids_anns = list(zip(vids, anns)) logger.info("Loaded {} videos in YTVIS format from {}".format(len(vids_anns), json_file)) dataset_dicts = [] ann_keys = ["iscrowd", "category_id", "id"] + (extra_annotation_keys or []) num_instances_without_valid_segmentation = 0 for (vid_dict, anno_dict_list) in vids_anns: record = {} record["file_names"] = [os.path.join(image_root, vid_dict["file_names"][i]) for i in range(vid_dict["length"])] record["height"] = vid_dict["height"] record["width"] = vid_dict["width"] record["length"] = vid_dict["length"] video_id = record["video_id"] = vid_dict["id"] video_objs = [] for frame_idx in range(record["length"]): frame_objs = [] for anno in anno_dict_list: assert anno["video_id"] == video_id obj = {key: anno[key] for key in ann_keys if key in anno} _bboxes = anno.get("bboxes", None) _segm = anno.get("segmentations", None) if not (_bboxes and _segm and _bboxes[frame_idx] and _segm[frame_idx]): continue bbox = _bboxes[frame_idx] segm = _segm[frame_idx] obj["bbox"] = bbox obj["bbox_mode"] = BoxMode.XYWH_ABS if isinstance(segm, dict): if isinstance(segm["counts"], list): # convert to compressed RLE segm = mask_util.frPyObjects(segm, *segm["size"]) elif segm: # filter out invalid polygons (< 3 points) segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] if len(segm) == 0: num_instances_without_valid_segmentation += 1 continue # ignore this instance obj["segmentation"] = segm if id_map: obj["category_id"] = id_map[obj["category_id"]] frame_objs.append(obj) video_objs.append(frame_objs) record["annotations"] = video_objs dataset_dicts.append(record) if num_instances_without_valid_segmentation > 0: logger.warning( "Filtered out {} instances without valid segmentation. ".format( num_instances_without_valid_segmentation ) + "There might be issues in your dataset generation process. " "A valid polygon should be a list[float] with even length >= 6." ) return dataset_dicts def register_ytvis_instances(name, metadata, json_file, image_root): """ Register a dataset in YTVIS's json annotation format for instance tracking. Args: name (str): the name that identifies a dataset, e.g. "ytvis_train". metadata (dict): extra metadata associated with this dataset. You can leave it as an empty dict. json_file (str): path to the json instance annotation file. image_root (str or path-like): directory which contains all the images. """ assert isinstance(name, str), name assert isinstance(json_file, (str, os.PathLike)), json_file assert isinstance(image_root, (str, os.PathLike)), image_root # 1. register a function which returns dicts DatasetCatalog.register(name, lambda: load_ytvis_json(json_file, image_root, name)) # 2. Optionally, add metadata about this dataset, # since they might be useful in evaluation, visualization or logging MetadataCatalog.get(name).set( json_file=json_file, image_root=image_root, evaluator_type="ytvis", **metadata ) if __name__ == "__main__": """ Test the YTVIS json dataset loader. 
""" from detectron2.utils.logger import setup_logger from detectron2.utils.visualizer import Visualizer import detectron2.data.datasets # noqa # add pre-defined metadata import sys from PIL import Image logger = setup_logger(name=__name__) #assert sys.argv[3] in DatasetCatalog.list() meta = MetadataCatalog.get("ytvis_2019_train") json_file = "./datasets/ytvis/instances_train_sub.json" image_root = "./datasets/ytvis/train/JPEGImages" dicts = load_ytvis_json(json_file, image_root, dataset_name="ytvis_2019_train") logger.info("Done loading {} samples.".format(len(dicts))) dirname = "ytvis-data-vis" os.makedirs(dirname, exist_ok=True) def extract_frame_dic(dic, frame_idx): import copy frame_dic = copy.deepcopy(dic) annos = frame_dic.get("annotations", None) if annos: frame_dic["annotations"] = annos[frame_idx] return frame_dic for d in dicts: vid_name = d["file_names"][0].split('/')[-2] os.makedirs(os.path.join(dirname, vid_name), exist_ok=True) for idx, file_name in enumerate(d["file_names"]): img = np.array(Image.open(file_name)) visualizer = Visualizer(img, metadata=meta) vis = visualizer.draw_dataset_dict(extract_frame_dic(d, idx)) fpath = os.path.join(dirname, vid_name, file_name.split('/')[-1]) vis.save(fpath) ================================================ FILE: mfvis_nococo/mask2former_video/data_video/datasets/ytvis_api/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi ================================================ FILE: mfvis_nococo/mask2former_video/data_video/datasets/ytvis_api/ytvos.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi __author__ = 'ychfan' # Interface for accessing the YouTubeVIS dataset. # The following API functions are defined: # YTVOS - YTVOS api class that loads YouTubeVIS annotation file and prepare data structures. # decodeMask - Decode binary mask M encoded via run-length encoding. # encodeMask - Encode binary mask M using run-length encoding. # getAnnIds - Get ann ids that satisfy given filter conditions. # getCatIds - Get cat ids that satisfy given filter conditions. # getImgIds - Get img ids that satisfy given filter conditions. # loadAnns - Load anns with the specified ids. # loadCats - Load cats with the specified ids. # loadImgs - Load imgs with the specified ids. # annToMask - Convert segmentation in an annotation to binary mask. # loadRes - Load algorithm results and create API for accessing them. # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2014. # Licensed under the Simplified BSD License [see bsd.txt] import json import time import matplotlib.pyplot as plt from matplotlib.collections import PatchCollection from matplotlib.patches import Polygon import numpy as np import copy import itertools from pycocotools import mask as maskUtils import os from collections import defaultdict import sys PYTHON_VERSION = sys.version_info[0] if PYTHON_VERSION == 2: from urllib import urlretrieve elif PYTHON_VERSION == 3: from urllib.request import urlretrieve def _isArrayLike(obj): return hasattr(obj, '__iter__') and hasattr(obj, '__len__') class YTVOS: def __init__(self, annotation_file=None): """ Constructor of Microsoft COCO helper class for reading and visualizing annotations. 
:param annotation_file (str): location of annotation file :param image_folder (str): location to the folder that hosts images. :return: """ # load dataset self.dataset,self.anns,self.cats,self.vids = dict(),dict(),dict(),dict() self.vidToAnns, self.catToVids = defaultdict(list), defaultdict(list) if not annotation_file == None: print('loading annotations into memory...') tic = time.time() dataset = json.load(open(annotation_file, 'r')) assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) print('Done (t={:0.2f}s)'.format(time.time()- tic)) self.dataset = dataset self.createIndex() def createIndex(self): # create index print('creating index...') anns, cats, vids = {}, {}, {} vidToAnns,catToVids = defaultdict(list),defaultdict(list) if 'annotations' in self.dataset: for ann in self.dataset['annotations']: vidToAnns[ann['video_id']].append(ann) anns[ann['id']] = ann if 'videos' in self.dataset: for vid in self.dataset['videos']: vids[vid['id']] = vid if 'categories' in self.dataset: for cat in self.dataset['categories']: cats[cat['id']] = cat if 'annotations' in self.dataset and 'categories' in self.dataset: for ann in self.dataset['annotations']: catToVids[ann['category_id']].append(ann['video_id']) print('index created!') # create class members self.anns = anns self.vidToAnns = vidToAnns self.catToVids = catToVids self.vids = vids self.cats = cats def info(self): """ Print information about the annotation file. :return: """ for key, value in self.dataset['info'].items(): print('{}: {}'.format(key, value)) def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None): """ Get ann ids that satisfy given filter conditions. default skips that filter :param vidIds (int array) : get anns for given vids catIds (int array) : get anns for given cats areaRng (float array) : get anns for given area range (e.g. [0 inf]) iscrowd (boolean) : get anns for given crowd label (False or True) :return: ids (int array) : integer array of ann ids """ vidIds = vidIds if _isArrayLike(vidIds) else [vidIds] catIds = catIds if _isArrayLike(catIds) else [catIds] if len(vidIds) == len(catIds) == len(areaRng) == 0: anns = self.dataset['annotations'] else: if not len(vidIds) == 0: lists = [self.vidToAnns[vidId] for vidId in vidIds if vidId in self.vidToAnns] anns = list(itertools.chain.from_iterable(lists)) else: anns = self.dataset['annotations'] anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['avg_area'] > areaRng[0] and ann['avg_area'] < areaRng[1]] if not iscrowd == None: ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] else: ids = [ann['id'] for ann in anns] return ids def getCatIds(self, catNms=[], supNms=[], catIds=[]): """ filtering parameters. default skips that filter. 
:param catNms (str array) : get cats for given cat names :param supNms (str array) : get cats for given supercategory names :param catIds (int array) : get cats for given cat ids :return: ids (int array) : integer array of cat ids """ catNms = catNms if _isArrayLike(catNms) else [catNms] supNms = supNms if _isArrayLike(supNms) else [supNms] catIds = catIds if _isArrayLike(catIds) else [catIds] if len(catNms) == len(supNms) == len(catIds) == 0: cats = self.dataset['categories'] else: cats = self.dataset['categories'] cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] ids = [cat['id'] for cat in cats] return ids def getVidIds(self, vidIds=[], catIds=[]): ''' Get vid ids that satisfy given filter conditions. :param vidIds (int array) : get vids for given ids :param catIds (int array) : get vids with all given cats :return: ids (int array) : integer array of vid ids ''' vidIds = vidIds if _isArrayLike(vidIds) else [vidIds] catIds = catIds if _isArrayLike(catIds) else [catIds] if len(vidIds) == len(catIds) == 0: ids = self.vids.keys() else: ids = set(vidIds) for i, catId in enumerate(catIds): if i == 0 and len(ids) == 0: ids = set(self.catToVids[catId]) else: ids &= set(self.catToVids[catId]) return list(ids) def loadAnns(self, ids=[]): """ Load anns with the specified ids. :param ids (int array) : integer ids specifying anns :return: anns (object array) : loaded ann objects """ if _isArrayLike(ids): return [self.anns[id] for id in ids] elif type(ids) == int: return [self.anns[ids]] def loadCats(self, ids=[]): """ Load cats with the specified ids. :param ids (int array) : integer ids specifying cats :return: cats (object array) : loaded cat objects """ if _isArrayLike(ids): return [self.cats[id] for id in ids] elif type(ids) == int: return [self.cats[ids]] def loadVids(self, ids=[]): """ Load anns with the specified ids. :param ids (int array) : integer ids specifying vid :return: vids (object array) : loaded vid objects """ if _isArrayLike(ids): return [self.vids[id] for id in ids] elif type(ids) == int: return [self.vids[ids]] def loadRes(self, resFile): """ Load result file and return a result api object. 
:param resFile (str) : file name of result file :return: res (obj) : result api object """ res = YTVOS() res.dataset['videos'] = [img for img in self.dataset['videos']] print('Loading and preparing results...') tic = time.time() if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode): anns = json.load(open(resFile)) elif type(resFile) == np.ndarray: anns = self.loadNumpyAnnotations(resFile) else: anns = resFile assert type(anns) == list, 'results in not an array of objects' annsVidIds = [ann['video_id'] for ann in anns] assert set(annsVidIds) == (set(annsVidIds) & set(self.getVidIds())), \ 'Results do not correspond to current coco set' if 'segmentations' in anns[0]: res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) for id, ann in enumerate(anns): ann['areas'] = [] if not 'bboxes' in ann: ann['bboxes'] = [] for seg in ann['segmentations']: # now only support compressed RLE format as segmentation results if seg: ann['areas'].append(maskUtils.area(seg)) if len(ann['bboxes']) < len(ann['areas']): ann['bboxes'].append(maskUtils.toBbox(seg)) else: ann['areas'].append(None) if len(ann['bboxes']) < len(ann['areas']): ann['bboxes'].append(None) ann['id'] = id+1 l = [a for a in ann['areas'] if a] if len(l)==0: ann['avg_area'] = 0 else: ann['avg_area'] = np.array(l).mean() ann['iscrowd'] = 0 print('DONE (t={:0.2f}s)'.format(time.time()- tic)) res.dataset['annotations'] = anns res.createIndex() return res def annToRLE(self, ann, frameId): """ Convert annotation which can be polygons, uncompressed RLE to RLE. :return: binary mask (numpy 2D array) """ t = self.vids[ann['video_id']] h, w = t['height'], t['width'] segm = ann['segmentations'][frameId] if type(segm) == list: # polygon -- a single object might consist of multiple parts # we merge all parts into one mask rle code rles = maskUtils.frPyObjects(segm, h, w) rle = maskUtils.merge(rles) elif type(segm['counts']) == list: # uncompressed RLE rle = maskUtils.frPyObjects(segm, h, w) else: # rle rle = segm return rle def annToMask(self, ann, frameId): """ Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. :return: binary mask (numpy 2D array) """ rle = self.annToRLE(ann, frameId) m = maskUtils.decode(rle) return m ================================================ FILE: mfvis_nococo/mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi __author__ = 'ychfan' import numpy as np import datetime import time from collections import defaultdict from pycocotools import mask as maskUtils import copy class YTVOSeval: # Interface for evaluating video instance segmentation on the YouTubeVIS dataset. # # The usage for YTVOSeval is as follows: # cocoGt=..., cocoDt=... # load dataset and results # E = YTVOSeval(cocoGt,cocoDt); # initialize YTVOSeval object # E.params.recThrs = ...; # set parameters as desired # E.evaluate(); # run per image evaluation # E.accumulate(); # accumulate per image results # E.summarize(); # display summary metrics of results # For example usage see evalDemo.m and http://mscoco.org/. # # The evaluation parameters are as follows (defaults in brackets): # imgIds - [all] N img ids to use for evaluation # catIds - [all] K cat ids to use for evaluation # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation # recThrs - [0:.01:1] R=101 recall thresholds for evaluation # areaRng - [...] 
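# --- Illustrative sketch, not part of the original file; `ytvos` and `ann` follow the usage sketched earlier. ---
# annToMask() resolves one frame of an annotation (polygon, uncompressed RLE or
# compressed RLE) into a binary H x W mask; frames where the object is not
# visible are stored as None and should be skipped.
for frame_id, seg in enumerate(ann["segmentations"]):
    if seg is None:
        continue                              # object absent in this frame
    mask = ytvos.annToMask(ann, frame_id)     # numpy uint8 array of shape (H, W)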
A=4 object area ranges for evaluation # maxDets - [1 10 100] M=3 thresholds on max detections per image # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints' # iouType replaced the now DEPRECATED useSegm parameter. # useCats - [1] if true use category labels for evaluation # Note: if useCats=0 category labels are ignored as in proposal scoring. # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. # # evaluate(): evaluates detections on every image and every category and # concats the results into the "evalImgs" with fields: # dtIds - [1xD] id for each of the D detections (dt) # gtIds - [1xG] id for each of the G ground truths (gt) # dtMatches - [TxD] matching gt id at each IoU or 0 # gtMatches - [TxG] matching dt id at each IoU or 0 # dtScores - [1xD] confidence of each dt # gtIgnore - [1xG] ignore flag for each gt # dtIgnore - [TxD] ignore flag for each dt at each IoU # # accumulate(): accumulates the per-image, per-category evaluation # results in "evalImgs" into the dictionary "eval" with fields: # params - parameters used for evaluation # date - date evaluation was performed # counts - [T,R,K,A,M] parameter dimensions (see above) # precision - [TxRxKxAxM] precision for every evaluation setting # recall - [TxKxAxM] max recall for every evaluation setting # Note: precision and recall==-1 for settings with no gt objects. # # See also coco, mask, pycocoDemo, pycocoEvalDemo # # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. # Licensed under the Simplified BSD License [see coco/license.txt] def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'): ''' Initialize CocoEval using coco APIs for gt and dt :param cocoGt: coco object with ground truth annotations :param cocoDt: coco object with detection results :return: None ''' if not iouType: print('iouType not specified. 
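# --- Illustrative sketch, not part of the original file; file paths are assumptions. ---
# The intended evaluation flow is the COCOeval-style sequence listed in the
# comment block above:
gt = YTVOS("datasets/ytvis/valid_sub.json")        # ground-truth API
dt = gt.loadRes("output/results.json")             # predictions in YTVIS result format
ev = YTVOSeval(gt, dt, iouType="segm")
ev.evaluate()                                      # per-video, per-category matching
ev.accumulate()                                    # build precision/recall tables
ev.summarize()                                     # print AP/AR lines, fill ev.stats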
use default iouType segm') self.cocoGt = cocoGt # ground truth COCO API self.cocoDt = cocoDt # detections COCO API self.params = {} # evaluation parameters self.evalVids = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements self.eval = {} # accumulated evaluation results self._gts = defaultdict(list) # gt for evaluation self._dts = defaultdict(list) # dt for evaluation self.params = Params(iouType=iouType) # parameters self._paramsEval = {} # parameters for evaluation self.stats = [] # result summarization self.ious = {} # ious between all gts and dts if not cocoGt is None: self.params.vidIds = sorted(cocoGt.getVidIds()) self.params.catIds = sorted(cocoGt.getCatIds()) def _prepare(self): ''' Prepare ._gts and ._dts for evaluation based on params :return: None ''' def _toMask(anns, coco): # modify ann['segmentation'] by reference for ann in anns: for i, a in enumerate(ann['segmentations']): if a: rle = coco.annToRLE(ann, i) ann['segmentations'][i] = rle l = [a for a in ann['areas'] if a] if len(l)==0: ann['avg_area'] = 0 else: ann['avg_area'] = np.array(l).mean() p = self.params if p.useCats: gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds)) dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds)) else: gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds)) dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds)) # convert ground truth to mask if iouType == 'segm' if p.iouType == 'segm': _toMask(gts, self.cocoGt) _toMask(dts, self.cocoDt) # set ignore flag for gt in gts: gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] if p.iouType == 'keypoints': gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] self._gts = defaultdict(list) # gt for evaluation self._dts = defaultdict(list) # dt for evaluation for gt in gts: self._gts[gt['video_id'], gt['category_id']].append(gt) for dt in dts: self._dts[dt['video_id'], dt['category_id']].append(dt) self.evalVids = defaultdict(list) # per-image per-category evaluation results self.eval = {} # accumulated evaluation results def evaluate(self): ''' Run per image evaluation on given images and store results (a list of dict) in self.evalVids :return: None ''' tic = time.time() print('Running per image evaluation...') p = self.params # add backward compatibility if useSegm is specified in params if not p.useSegm is None: p.iouType = 'segm' if p.useSegm == 1 else 'bbox' print('useSegm (deprecated) is not None. 
Running {} evaluation'.format(p.iouType)) print('Evaluate annotation type *{}*'.format(p.iouType)) p.vidIds = list(np.unique(p.vidIds)) if p.useCats: p.catIds = list(np.unique(p.catIds)) p.maxDets = sorted(p.maxDets) self.params=p self._prepare() # loop through images, area range, max detection number catIds = p.catIds if p.useCats else [-1] if p.iouType == 'segm' or p.iouType == 'bbox': computeIoU = self.computeIoU elif p.iouType == 'keypoints': computeIoU = self.computeOks self.ious = {(vidId, catId): computeIoU(vidId, catId) \ for vidId in p.vidIds for catId in catIds} evaluateVid = self.evaluateVid maxDet = p.maxDets[-1] self.evalImgs = [evaluateVid(vidId, catId, areaRng, maxDet) for catId in catIds for areaRng in p.areaRng for vidId in p.vidIds ] self._paramsEval = copy.deepcopy(self.params) toc = time.time() print('DONE (t={:0.2f}s).'.format(toc-tic)) def computeIoU(self, vidId, catId): p = self.params if p.useCats: gt = self._gts[vidId,catId] dt = self._dts[vidId,catId] else: gt = [_ for cId in p.catIds for _ in self._gts[vidId,cId]] dt = [_ for cId in p.catIds for _ in self._dts[vidId,cId]] if len(gt) == 0 and len(dt) ==0: return [] inds = np.argsort([-d['score'] for d in dt], kind='mergesort') dt = [dt[i] for i in inds] if len(dt) > p.maxDets[-1]: dt=dt[0:p.maxDets[-1]] if p.iouType == 'segm': g = [g['segmentations'] for g in gt] d = [d['segmentations'] for d in dt] elif p.iouType == 'bbox': g = [g['bboxes'] for g in gt] d = [d['bboxes'] for d in dt] else: raise Exception('unknown iouType for iou computation') # compute iou between each dt and gt region iscrowd = [int(o['iscrowd']) for o in gt] #ious = maskUtils.iou(d,g,iscrowd) def iou_seq(d_seq, g_seq): i = .0 u = .0 for d, g in zip(d_seq, g_seq): if d and g: i += maskUtils.area(maskUtils.merge([d, g], True)) u += maskUtils.area(maskUtils.merge([d, g], False)) elif not d and g: u += maskUtils.area(g) elif d and not g: u += maskUtils.area(d) if not u > .0: print("Mask sizes in video {} and category {} may not match!".format(vidId, catId)) iou = i / u if u > .0 else .0 return iou ious = np.zeros([len(d), len(g)]) for i, j in np.ndindex(ious.shape): ious[i, j] = iou_seq(d[i], g[j]) #print(vidId, catId, ious.shape, ious) return ious def computeOks(self, imgId, catId): p = self.params # dimention here should be Nxm gts = self._gts[imgId, catId] dts = self._dts[imgId, catId] inds = np.argsort([-d['score'] for d in dts], kind='mergesort') dts = [dts[i] for i in inds] if len(dts) > p.maxDets[-1]: dts = dts[0:p.maxDets[-1]] # if len(gts) == 0 and len(dts) == 0: if len(gts) == 0 or len(dts) == 0: return [] ious = np.zeros((len(dts), len(gts))) sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0 vars = (sigmas * 2)**2 k = len(sigmas) # compute oks between each detection and ground truth object for j, gt in enumerate(gts): # create bounds for ignore regions(double the gt bbox) g = np.array(gt['keypoints']) xg = g[0::3]; yg = g[1::3]; vg = g[2::3] k1 = np.count_nonzero(vg > 0) bb = gt['bbox'] x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2 y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2 for i, dt in enumerate(dts): d = np.array(dt['keypoints']) xd = d[0::3]; yd = d[1::3] if k1>0: # measure the per-keypoint distance if keypoints visible dx = xd - xg dy = yd - yg else: # measure minimum distance to keypoints in (x0,y0) & (x1,y1) z = np.zeros((k)) dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0) dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0) e = (dx**2 + dy**2) / vars / 
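# --- Illustrative sketch, not part of the original file. ---
# iou_seq() above defines track-level mask IoU: per-frame intersections and
# unions are summed over time before dividing, so a track that misses the object
# in some frames is penalised as a whole.  Toy single-frame check:
import numpy as np
from pycocotools import mask as maskUtils
a = maskUtils.encode(np.asfortranarray(np.ones((4, 4), dtype=np.uint8)))   # full 4x4 mask
b = maskUtils.encode(np.asfortranarray(np.eye(4, dtype=np.uint8)))         # diagonal mask
inter = maskUtils.area(maskUtils.merge([a, b], True))    # 4 pixels
union = maskUtils.area(maskUtils.merge([a, b], False))   # 16 pixels
print(inter / union)                                     # 0.25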
(gt['avg_area']+np.spacing(1)) / 2 if k1 > 0: e=e[vg > 0] ious[i, j] = np.sum(np.exp(-e)) / e.shape[0] return ious def evaluateVid(self, vidId, catId, aRng, maxDet): ''' perform evaluation for single category and image :return: dict (single image results) ''' p = self.params if p.useCats: gt = self._gts[vidId,catId] dt = self._dts[vidId,catId] else: gt = [_ for cId in p.catIds for _ in self._gts[vidId,cId]] dt = [_ for cId in p.catIds for _ in self._dts[vidId,cId]] if len(gt) == 0 and len(dt) ==0: return None for g in gt: if g['ignore'] or (g['avg_area']<aRng[0] or g['avg_area']>aRng[1]): g['_ignore'] = 1 else: g['_ignore'] = 0 # sort dt highest score first, sort gt ignore last gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort') gt = [gt[i] for i in gtind] dtind = np.argsort([-d['score'] for d in dt], kind='mergesort') dt = [dt[i] for i in dtind[0:maxDet]] iscrowd = [int(o['iscrowd']) for o in gt] # load computed ious ious = self.ious[vidId, catId][:, gtind] if len(self.ious[vidId, catId]) > 0 else self.ious[vidId, catId] T = len(p.iouThrs) G = len(gt) D = len(dt) gtm = np.zeros((T,G)) dtm = np.zeros((T,D)) gtIg = np.array([g['_ignore'] for g in gt]) dtIg = np.zeros((T,D)) if not len(ious)==0: for tind, t in enumerate(p.iouThrs): for dind, d in enumerate(dt): # information about best match so far (m=-1 -> unmatched) iou = min([t,1-1e-10]) m = -1 for gind, g in enumerate(gt): # if this gt already matched, and not a crowd, continue if gtm[tind,gind]>0 and not iscrowd[gind]: continue # if dt matched to reg gt, and on ignore gt, stop if m>-1 and gtIg[m]==0 and gtIg[gind]==1: break # continue to next gt unless better match made if ious[dind,gind] < iou: continue # if match successful and best so far, store appropriately iou=ious[dind,gind] m=gind # if match made store id of match for both dt and gt if m ==-1: continue dtIg[tind,dind] = gtIg[m] dtm[tind,dind] = gt[m]['id'] gtm[tind,m] = d['id'] # set unmatched detections outside of area range to ignore a = np.array([d['avg_area']<aRng[0] or d['avg_area']>aRng[1] for d in dt]).reshape((1, len(dt))) dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0))) # store results for given image and category return { 'video_id': vidId, 'category_id': catId, 'aRng': aRng, 'maxDet': maxDet, 'dtIds': [d['id'] for d in dt], 'gtIds': [g['id'] for g in gt], 'dtMatches': dtm, 'gtMatches': gtm, 'dtScores': [d['score'] for d in dt], 'gtIgnore': gtIg, 'dtIgnore': dtIg, } def accumulate(self, p = None): ''' Accumulate per image evaluation results and store the result in self.eval :param p: input params for evaluation :return: None ''' print('Accumulating evaluation results...') tic = time.time() if not self.evalImgs: print('Please run evaluate() first') # allows input customized parameters if p is None: p = self.params p.catIds = p.catIds if p.useCats == 1 else [-1] T = len(p.iouThrs) R = len(p.recThrs) K = len(p.catIds) if p.useCats else 1 A = len(p.areaRng) M = len(p.maxDets) precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories recall = -np.ones((T,K,A,M)) scores = -np.ones((T,R,K,A,M)) # create dictionary for future indexing _pe = self._paramsEval catIds = _pe.catIds if _pe.useCats else [-1] setK = set(catIds) setA = set(map(tuple, _pe.areaRng)) setM = set(_pe.maxDets) setI = set(_pe.vidIds) # get inds to evaluate k_list = [n for n, k in enumerate(p.catIds) if k in setK] m_list = [m for n, m in enumerate(p.maxDets) if m in setM] a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA] i_list = [n for n, i in enumerate(p.vidIds) if i in
setI] I0 = len(_pe.vidIds) A0 = len(_pe.areaRng) # retrieve E at each category, area range, and max number of detections for k, k0 in enumerate(k_list): Nk = k0*A0*I0 for a, a0 in enumerate(a_list): Na = a0*I0 for m, maxDet in enumerate(m_list): E = [self.evalImgs[Nk + Na + i] for i in i_list] E = [e for e in E if not e is None] if len(E) == 0: continue dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E]) # different sorting method generates slightly different results. # mergesort is used to be consistent as Matlab implementation. inds = np.argsort(-dtScores, kind='mergesort') dtScoresSorted = dtScores[inds] dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds] dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds] gtIg = np.concatenate([e['gtIgnore'] for e in E]) npig = np.count_nonzero(gtIg==0 ) if npig == 0: continue tps = np.logical_and( dtm, np.logical_not(dtIg) ) fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) ) tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): tp = np.array(tp) fp = np.array(fp) nd = len(tp) rc = tp / npig pr = tp / (fp+tp+np.spacing(1)) q = np.zeros((R,)) ss = np.zeros((R,)) if nd: recall[t,k,a,m] = rc[-1] else: recall[t,k,a,m] = 0 # numpy is slow without cython optimization for accessing elements # use python array gets significant speed improvement pr = pr.tolist(); q = q.tolist() for i in range(nd-1, 0, -1): if pr[i] > pr[i-1]: pr[i-1] = pr[i] inds = np.searchsorted(rc, p.recThrs, side='left') try: for ri, pi in enumerate(inds): q[ri] = pr[pi] ss[ri] = dtScoresSorted[pi] except: pass precision[t,:,k,a,m] = np.array(q) scores[t,:,k,a,m] = np.array(ss) self.eval = { 'params': p, 'counts': [T, R, K, A, M], 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'precision': precision, 'recall': recall, 'scores': scores, } toc = time.time() print('DONE (t={:0.2f}s).'.format( toc-tic)) def summarize(self): ''' Compute and display summary metrics for evaluation results. 
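# --- Illustrative sketch, not part of the original file. ---
# The backwards loop above enforces the usual monotonically non-increasing
# precision envelope before it is sampled at the fixed recall thresholds with
# np.searchsorted.  A tiny numpy version of the same step:
import numpy as np
pr = np.array([1.0, 0.5, 0.67, 0.5])       # raw precision after each detection
env = pr.copy()
for i in range(len(env) - 1, 0, -1):       # right-to-left running maximum
    env[i - 1] = max(env[i - 1], env[i])
print(env)                                 # [1.   0.67 0.67 0.5 ]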
Note this functin can *only* be applied on the default parameter setting ''' def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ): p = self.params iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}' titleStr = 'Average Precision' if ap == 1 else 'Average Recall' typeStr = '(AP)' if ap==1 else '(AR)' iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \ if iouThr is None else '{:0.2f}'.format(iouThr) aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] if ap == 1: # dimension of precision: [TxRxKxAxM] s = self.eval['precision'] # IoU if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:,:,:,aind,mind] else: # dimension of recall: [TxKxAxM] s = self.eval['recall'] if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:,:,aind,mind] if len(s[s>-1])==0: mean_s = -1 else: mean_s = np.mean(s[s>-1]) print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) return mean_s def _summarizeDets(): stats = np.zeros((12,)) stats[0] = _summarize(1) stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2]) stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2]) stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2]) stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2]) stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2]) stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2]) stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2]) stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2]) return stats def _summarizeKps(): stats = np.zeros((10,)) stats[0] = _summarize(1, maxDets=20) stats[1] = _summarize(1, maxDets=20, iouThr=.5) stats[2] = _summarize(1, maxDets=20, iouThr=.75) stats[3] = _summarize(1, maxDets=20, areaRng='medium') stats[4] = _summarize(1, maxDets=20, areaRng='large') stats[5] = _summarize(0, maxDets=20) stats[6] = _summarize(0, maxDets=20, iouThr=.5) stats[7] = _summarize(0, maxDets=20, iouThr=.75) stats[8] = _summarize(0, maxDets=20, areaRng='medium') stats[9] = _summarize(0, maxDets=20, areaRng='large') return stats if not self.eval: raise Exception('Please run accumulate() first') iouType = self.params.iouType if iouType == 'segm' or iouType == 'bbox': summarize = _summarizeDets elif iouType == 'keypoints': summarize = _summarizeKps self.stats = summarize() def __str__(self): self.summarize() class Params: ''' Params for coco evaluation api ''' def setDetParams(self): self.vidIds = [] self.catIds = [] # np.arange causes trouble. 
the data point on arange is slightly larger than the true value #self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True) #self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True) self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True) self.maxDets = [1, 10, 100] self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 128 ** 2], [ 128 ** 2, 256 ** 2], [256 ** 2, 1e5 ** 2]] self.areaRngLbl = ['all', 'small', 'medium', 'large'] self.useCats = 1 def setKpParams(self): self.vidIds = [] self.catIds = [] # np.arange causes trouble. the data point on arange is slightly larger than the true value self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True) self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True) self.maxDets = [20] self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] self.areaRngLbl = ['all', 'medium', 'large'] self.useCats = 1 def __init__(self, iouType='segm'): if iouType == 'segm' or iouType == 'bbox': self.setDetParams() elif iouType == 'keypoints': self.setKpParams() else: raise Exception('iouType not supported') self.iouType = iouType # useSegm is deprecated self.useSegm = None ================================================ FILE: mfvis_nococo/mask2former_video/data_video/ytvis_eval.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC import contextlib import copy import io import itertools import json import logging import numpy as np import os from collections import OrderedDict import pycocotools.mask as mask_util import torch from .datasets.ytvis_api.ytvos import YTVOS from .datasets.ytvis_api.ytvoseval import YTVOSeval from tabulate import tabulate import detectron2.utils.comm as comm from detectron2.config import CfgNode from detectron2.data import MetadataCatalog from detectron2.evaluation import DatasetEvaluator from detectron2.utils.file_io import PathManager from detectron2.utils.logger import create_small_table class YTVISEvaluator(DatasetEvaluator): """ Evaluate AR for object proposals, AP for instance detection/segmentation, AP for keypoint detection outputs using COCO's metrics. See http://cocodataset.org/#detection-eval and http://cocodataset.org/#keypoints-eval to understand its metrics. In addition to COCO, this evaluator is able to support any bounding box detection, instance segmentation, or keypoint detection dataset. """ def __init__( self, dataset_name, tasks=None, distributed=True, output_dir=None, *, use_fast_impl=True, ): """ Args: dataset_name (str): name of the dataset to be evaluated. It must have either the following corresponding metadata: "json_file": the path to the COCO format annotation Or it must be in detectron2's standard dataset format so it can be converted to COCO format automatically. tasks (tuple[str]): tasks that can be evaluated under the given configuration. A task is one of "bbox", "segm", "keypoints". By default, will infer this automatically from predictions. distributed (True): if True, will collect results from all ranks and run evaluation in the main process. Otherwise, will only evaluate the results in the current process. output_dir (str): optional, an output directory to dump all results predicted on the dataset. The dump contains two files: 1. 
"instances_predictions.pth" a file in torch serialization format that contains all the raw original predictions. 2. "coco_instances_results.json" a json file in COCO's result format. use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP. Although the results should be very close to the official implementation in COCO API, it is still recommended to compute results with the official API for use in papers. The faster implementation also uses more RAM. """ self._logger = logging.getLogger(__name__) self._distributed = distributed self._output_dir = output_dir self._use_fast_impl = use_fast_impl if tasks is not None and isinstance(tasks, CfgNode): self._logger.warning( "COCO Evaluator instantiated using config, this is deprecated behavior." " Please pass in explicit arguments instead." ) self._tasks = None # Infering it from predictions should be better else: self._tasks = tasks self._cpu_device = torch.device("cpu") self._metadata = MetadataCatalog.get(dataset_name) json_file = PathManager.get_local_path(self._metadata.json_file) with contextlib.redirect_stdout(io.StringIO()): self._ytvis_api = YTVOS(json_file) # Test set json files do not contain annotations (evaluation must be # performed using the COCO evaluation server). self._do_evaluation = "annotations" in self._ytvis_api.dataset def reset(self): self._predictions = [] def process(self, inputs, outputs): """ Args: inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). It is a list of dict. Each dict corresponds to an image and contains keys like "height", "width", "file_name", "image_id". outputs: the outputs of a COCO model. It is a list of dicts with key "instances" that contains :class:`Instances`. """ prediction = instances_to_coco_json_video(inputs, outputs) self._predictions.extend(prediction) def evaluate(self): """ Args: img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset """ if self._distributed: comm.synchronize() predictions = comm.gather(self._predictions, dst=0) predictions = list(itertools.chain(*predictions)) if not comm.is_main_process(): return {} else: predictions = self._predictions if len(predictions) == 0: self._logger.warning("[COCOEvaluator] Did not receive valid predictions.") return {} if self._output_dir: PathManager.mkdirs(self._output_dir) file_path = os.path.join(self._output_dir, "instances_predictions.pth") with PathManager.open(file_path, "wb") as f: torch.save(predictions, f) self._results = OrderedDict() self._eval_predictions(predictions) # Copy so the caller can do whatever with results return copy.deepcopy(self._results) def _eval_predictions(self, predictions): """ Evaluate predictions. Fill self._results with the metrics of the tasks. """ self._logger.info("Preparing results for YTVIS format ...") # unmap the category ids for COCO if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) num_classes = len(all_contiguous_ids) assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} for result in predictions: category_id = result["category_id"] assert category_id < num_classes, ( f"A prediction has class={category_id}, " f"but the dataset only has {num_classes} classes and " f"predicted class id should be in [0, {num_classes - 1}]." 
) result["category_id"] = reverse_id_mapping[category_id] if self._output_dir: file_path = os.path.join(self._output_dir, "results.json") self._logger.info("Saving results to {}".format(file_path)) with PathManager.open(file_path, "w") as f: f.write(json.dumps(predictions)) f.flush() if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return coco_eval = ( _evaluate_predictions_on_coco( self._ytvis_api, predictions, ) if len(predictions) > 0 else None # cocoapi does not handle empty results very well ) res = self._derive_coco_results( coco_eval, class_names=self._metadata.get("thing_classes") ) self._results["segm"] = res def _derive_coco_results(self, coco_eval, class_names=None): """ Derive the desired score numbers from summarized COCOeval. Args: coco_eval (None or COCOEval): None represents no predictions from model. iou_type (str): class_names (None or list[str]): if provided, will use it to predict per-category AP. Returns: a dict of {metric name: score} """ metrics = ["AP", "AP50", "AP75", "APs", "APm", "APl", "AR1", "AR10"] if coco_eval is None: self._logger.warn("No predictions from the model!") return {metric: float("nan") for metric in metrics} # the standard metrics results = { metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan") for idx, metric in enumerate(metrics) } self._logger.info( "Evaluation results for {}: \n".format("segm") + create_small_table(results) ) if not np.isfinite(sum(results.values())): self._logger.info("Some metrics cannot be computed and is shown as NaN.") if class_names is None or len(class_names) <= 1: return results # Compute per-category AP # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa precisions = coco_eval.eval["precision"] # precision has dims (iou, recall, cls, area range, max dets) assert len(class_names) == precisions.shape[2] results_per_category = [] for idx, name in enumerate(class_names): # area range index 0: all area ranges # max dets index -1: typically 100 per image precision = precisions[:, :, idx, 0, -1] precision = precision[precision > -1] ap = np.mean(precision) if precision.size else float("nan") results_per_category.append(("{}".format(name), float(ap * 100))) # tabulate it N_COLS = min(6, len(results_per_category) * 2) results_flatten = list(itertools.chain(*results_per_category)) results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)]) table = tabulate( results_2d, tablefmt="pipe", floatfmt=".3f", headers=["category", "AP"] * (N_COLS // 2), numalign="left", ) self._logger.info("Per-category {} AP: \n".format("segm") + table) results.update({"AP-" + name: ap for name, ap in results_per_category}) return results def instances_to_coco_json_video(inputs, outputs): """ Dump an "Instances" object to a COCO-format json that's used for evaluation. Args: instances (Instances): video_id (int): the image id Returns: list[dict]: list of json annotations in COCO format. """ assert len(inputs) == 1, "More than one inputs are loaded for inference!" 
video_id = inputs[0]["video_id"] video_length = inputs[0]["length"] scores = outputs["pred_scores"] labels = outputs["pred_labels"] masks = outputs["pred_masks"] ytvis_results = [] for instance_id, (s, l, m) in enumerate(zip(scores, labels, masks)): segms = [ mask_util.encode(np.array(_mask[:, :, None], order="F", dtype="uint8"))[0] for _mask in m ] for rle in segms: rle["counts"] = rle["counts"].decode("utf-8") res = { "video_id": video_id, "score": s, "category_id": l, "segmentations": segms, } ytvis_results.append(res) return ytvis_results def _evaluate_predictions_on_coco( coco_gt, coco_results, img_ids=None, ): """ Evaluate the coco results using COCOEval API. """ assert len(coco_results) > 0 coco_results = copy.deepcopy(coco_results) # When evaluating mask AP, if the results contain bbox, cocoapi will # use the box area as the area of the instance, instead of the mask area. # This leads to a different definition of small/medium/large. # We remove the bbox field to let mask AP use mask area. for c in coco_results: c.pop("bbox", None) coco_dt = coco_gt.loadRes(coco_results) coco_eval = YTVOSeval(coco_gt, coco_dt) # For COCO, the default max_dets_per_image is [1, 10, 100]. max_dets_per_image = [1, 10, 100] # Default from COCOEval coco_eval.params.maxDets = max_dets_per_image if img_ids is not None: coco_eval.params.imgIds = img_ids coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() return coco_eval ================================================ FILE: mfvis_nococo/mask2former_video/modeling/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder ================================================ FILE: mfvis_nococo/mask2former_video/modeling/criterion.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py """ MaskFormer criterion. 
""" import logging import torch import torch.nn.functional as F from torch import nn from detectron2.utils.comm import get_world_size from detectron2.projects.point_rend.point_features import ( get_uncertain_point_coords_with_randomness, point_sample, ) from mask2former.utils.misc import is_dist_avail_and_initialized def unfold_wo_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) # remove the center pixels size = kernel_size ** 2 unfolded_x = torch.cat(( unfolded_x[:, :, :size // 2], unfolded_x[:, :, size // 2 + 1:] ), dim=2) return unfolded_x def unfold_w_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) return unfolded_x def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation): assert mask_logits.dim() == 4 log_fg_prob = F.logsigmoid(mask_logits) log_bg_prob = F.logsigmoid(-mask_logits) log_fg_prob_unfold = unfold_wo_center( log_fg_prob, kernel_size=pairwise_size, dilation=pairwise_dilation ) log_bg_prob_unfold = unfold_wo_center( log_bg_prob, kernel_size=pairwise_size, dilation=pairwise_dilation ) # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j) # we compute the the probability in log space to avoid numerical instability log_same_fg_prob = log_fg_prob[:, :, None] + log_fg_prob_unfold log_same_bg_prob = log_bg_prob[:, :, None] + log_bg_prob_unfold max_ = torch.max(log_same_fg_prob, log_same_bg_prob) log_same_prob = torch.log( torch.exp(log_same_fg_prob - max_) + torch.exp(log_same_bg_prob - max_) ) + max_ # loss = -log(prob) return -log_same_prob[:, 0] def compute_pairwise_term_neighbor(mask_logits, mask_logits_neighbor, pairwise_size, pairwise_dilation): assert mask_logits.dim() == 4 log_fg_prob_neigh = F.logsigmoid(mask_logits_neighbor) log_bg_prob_neigh = F.logsigmoid(-mask_logits_neighbor) log_fg_prob = F.logsigmoid(mask_logits) log_bg_prob = F.logsigmoid(-mask_logits) log_fg_prob_unfold = unfold_w_center( log_fg_prob, kernel_size=pairwise_size, dilation=pairwise_dilation ) log_bg_prob_unfold = unfold_w_center( log_bg_prob, kernel_size=pairwise_size, dilation=pairwise_dilation ) # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j) # we compute the the probability in log space to avoid numerical instability log_same_fg_prob = log_fg_prob_neigh[:, :, None] + log_fg_prob_unfold log_same_bg_prob = log_bg_prob_neigh[:, :, None] + log_bg_prob_unfold max_ = torch.max(log_same_fg_prob, log_same_bg_prob) log_same_prob = torch.log( torch.exp(log_same_fg_prob - max_) + torch.exp(log_same_bg_prob - max_) ) + max_ return -log_same_prob[:, 0] def dice_coefficient(x, target): eps = 1e-5 n_inst = x.size(0) x = x.reshape(n_inst, -1) target = target.reshape(n_inst, -1) intersection = (x * target).sum(dim=1) union = (x ** 2.0).sum(dim=1) + (target ** 2.0).sum(dim=1) + eps loss = 1. 
- (2 * intersection / union) return loss def compute_project_term(mask_scores, gt_bitmasks): mask_losses_y = dice_coefficient( mask_scores.max(dim=2, keepdim=True)[0], gt_bitmasks.max(dim=2, keepdim=True)[0] ) mask_losses_x = dice_coefficient( mask_scores.max(dim=3, keepdim=True)[0], gt_bitmasks.max(dim=3, keepdim=True)[0] ) return (mask_losses_x + mask_losses_y).mean() def dice_loss( inputs: torch.Tensor, targets: torch.Tensor, num_masks: float, ): """ Compute the DICE loss, similar to generalized IOU for masks Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). """ inputs = inputs.sigmoid() inputs = inputs.flatten(1) numerator = 2 * (inputs * targets).sum(-1) denominator = inputs.sum(-1) + targets.sum(-1) loss = 1 - (numerator + 1) / (denominator + 1) return loss.sum() / num_masks dice_loss_jit = torch.jit.script( dice_loss ) # type: torch.jit.ScriptModule def sigmoid_ce_loss( inputs: torch.Tensor, targets: torch.Tensor, num_masks: float, ): """ Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). Returns: Loss tensor """ loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") return loss.mean(1).sum() / num_masks sigmoid_ce_loss_jit = torch.jit.script( sigmoid_ce_loss ) # type: torch.jit.ScriptModule def calculate_uncertainty(logits): """ We estimate uncerainty as L1 distance between 0.0 and the logit prediction in 'logits' for the foreground class in `classes`. Args: logits (Tensor): A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is the number of foreground classes. The values are logits. Returns: scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most uncertain locations having the highest uncertainty score. """ assert logits.shape[1] == 1 gt_class_logits = logits.clone() return -(torch.abs(gt_class_logits)) class VideoSetCriterion(nn.Module): """This class computes the loss for DETR. The process happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) """ def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses, num_points, oversample_ratio, importance_sample_ratio): """Create the criterion. Parameters: num_classes: number of object categories, omitting the special no-object category matcher: module able to compute a matching between targets and proposals weight_dict: dict containing as key the names of the losses and as values their relative weight. eos_coef: relative classification weight applied to the no-object category losses: list of all the losses to be applied. See get_loss for list of available losses. 
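# --- Illustrative sketch, not part of the original file; uses compute_project_term defined above. ---
# compute_project_term() is the box-supervised projection loss: predicted mask
# probabilities and ground-truth box bitmasks are max-projected onto the x and y
# axes and compared with a dice loss, so only box extents are needed as
# supervision.  A prediction that exactly fills its box gives ~zero loss:
import torch
gt_box = torch.zeros(1, 1, 16, 16)
gt_box[:, :, 4:12, 6:10] = 1.0
print(compute_project_term(gt_box.clone(), gt_box))      # ~0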
""" super().__init__() self.num_classes = num_classes self.matcher = matcher self.weight_dict = weight_dict self.eos_coef = eos_coef self.losses = losses empty_weight = torch.ones(self.num_classes + 1) empty_weight[-1] = self.eos_coef self.register_buffer("empty_weight", empty_weight) # pointwise mask loss parameters self.num_points = num_points self.oversample_ratio = oversample_ratio self.importance_sample_ratio = importance_sample_ratio self._warmup_iters = 2000 self.register_buffer("_iter", torch.zeros([1])) def loss_labels(self, outputs, targets, indices, num_masks): """Classification loss (NLL) targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] """ assert "pred_logits" in outputs src_logits = outputs["pred_logits"].float() idx = self._get_src_permutation_idx(indices) target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) target_classes = torch.full( src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device ) target_classes[idx] = target_classes_o loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) losses = {"loss_ce": loss_ce} return losses def loss_masks(self, outputs, targets, indices, num_masks): """Compute the losses related to the masks: the focal loss and the dice loss. targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] """ assert "pred_masks" in outputs src_idx = self._get_src_permutation_idx(indices) src_masks = outputs["pred_masks"] src_masks = src_masks[src_idx] # Modified to handle video target_masks = torch.cat([t['masks'][i] for t, (_, i) in zip(targets, indices)]).to(src_masks) # No need to upsample predictions as we are using normalized coordinates :) # NT x 1 x H x W src_masks = src_masks.flatten(0, 1)[:, None] target_masks = target_masks.flatten(0, 1)[:, None] with torch.no_grad(): # sample point_coords point_coords = get_uncertain_point_coords_with_randomness( src_masks, lambda logits: calculate_uncertainty(logits), self.num_points, self.oversample_ratio, self.importance_sample_ratio, ) # get gt labels point_labels = point_sample( target_masks, point_coords, align_corners=False, ).squeeze(1) point_logits = point_sample( src_masks, point_coords, align_corners=False, ).squeeze(1) losses = { "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks), "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks), } del src_masks del target_masks return losses def topk_mask(self, images_lab_sim, k): images_lab_sim_mask = torch.zeros_like(images_lab_sim) topk, indices = torch.topk(images_lab_sim, k, dim =1) # 1, 3, 5, 7 images_lab_sim_mask = images_lab_sim_mask.scatter(1, indices, topk) return images_lab_sim_mask def loss_masks_proj(self, outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2, images_lab_sim_nei3, images_lab_sim_nei4): """Compute the losses related to the masks: the focal loss and the dice loss. 
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] """ assert "pred_masks" in outputs self._iter += 1 src_idx = self._get_src_permutation_idx(indices) src_masks = outputs["pred_masks"] src_masks = src_masks[src_idx] # Modified to handle video target_masks = torch.cat([t['masks'][i] for t, (_, i) in zip(targets, indices)]).to(src_masks) images_lab_sim = torch.cat(images_lab_sim, dim =0) images_lab_sim_nei = torch.cat(images_lab_sim_nei, dim=0) images_lab_sim_nei1 = torch.cat(images_lab_sim_nei1, dim=0) images_lab_sim_nei2 = torch.cat(images_lab_sim_nei2, dim=0) images_lab_sim_nei3 = torch.cat(images_lab_sim_nei3, dim=0) images_lab_sim_nei4 = torch.cat(images_lab_sim_nei4, dim=0) images_lab_sim = images_lab_sim.view(-1, target_masks.shape[1], images_lab_sim.shape[-3], images_lab_sim.shape[-2], images_lab_sim.shape[-1]) images_lab_sim_nei = images_lab_sim_nei.unsqueeze(1) images_lab_sim_nei1 = images_lab_sim_nei1.unsqueeze(1) images_lab_sim_nei2 = images_lab_sim_nei2.unsqueeze(1) images_lab_sim_nei3 = images_lab_sim_nei3.unsqueeze(1) images_lab_sim_nei4 = images_lab_sim_nei4.unsqueeze(1) if len(src_idx[0].tolist()) > 0: images_lab_sim = torch.cat([images_lab_sim[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1) images_lab_sim_nei = self.topk_mask(torch.cat([images_lab_sim_nei[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1), 5) images_lab_sim_nei1 = self.topk_mask(torch.cat([images_lab_sim_nei1[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1), 5) images_lab_sim_nei2 = self.topk_mask(torch.cat([images_lab_sim_nei2[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1), 5) images_lab_sim_nei3 = self.topk_mask(torch.cat([images_lab_sim_nei3[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1), 5) images_lab_sim_nei4 = self.topk_mask(torch.cat([images_lab_sim_nei4[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1), 5) k_size = 3 if src_masks.shape[0] > 0: pairwise_losses_neighbor = compute_pairwise_term_neighbor( src_masks[:,:1], src_masks[:,1:2], k_size, 3 ) pairwise_losses_neighbor1 = compute_pairwise_term_neighbor( src_masks[:,1:2], src_masks[:,2:3], k_size, 3 ) pairwise_losses_neighbor2 = compute_pairwise_term_neighbor( src_masks[:,2:3], src_masks[:,3:4], k_size, 3 ) pairwise_losses_neighbor3 = compute_pairwise_term_neighbor( src_masks[:,3:4], src_masks[:,4:5], k_size, 3 ) pairwise_losses_neighbor4 = compute_pairwise_term_neighbor( src_masks[:,4:5], src_masks[:,0:1], k_size, 3 ) # print('pairwise_losses_neighbor:', pairwise_losses_neighbor.shape) src_masks = src_masks.flatten(0, 1)[:, None] target_masks = target_masks.flatten(0, 1)[:, None] target_masks = F.interpolate(target_masks, (src_masks.shape[-2], src_masks.shape[-1]), mode='bilinear') # images_lab_sim = F.interpolate(images_lab_sim, (src_masks.shape[-2], src_masks.shape[-1]), mode='bilinear') if src_masks.shape[0] > 0: loss_prj_term = compute_project_term(src_masks.sigmoid(), target_masks) pairwise_losses = compute_pairwise_term( src_masks, k_size, 2 ) weights = (images_lab_sim >= 0.3).float() * target_masks.float() target_masks_sum = target_masks.reshape(pairwise_losses_neighbor.shape[0], 5, target_masks.shape[-2], target_masks.shape[-1]).sum(dim=1, keepdim=True) target_masks_sum = (target_masks_sum >= 1.0).float() # ori is 1.0 weights_neighbor = (images_lab_sim_nei >= 0.05).float() * target_masks_sum # ori is 0.5, 0.01, 0.001, 0.005, 0.0001, 0.02, 0.05, 0.075, 0.1 , dy 0.5 weights_neighbor1 = (images_lab_sim_nei1 >= 0.05).float() * target_masks_sum # 
ori is 0.5, 0.01, 0.001, 0.005, 0.0001, 0.02, 0.05, 0.075, 0.1, dy 0.5 weights_neighbor2 = (images_lab_sim_nei2 >= 0.05).float() * target_masks_sum # ori is 0.5, 0.01, 0.001, 0.005, 0.0001, 0.02, 0.05, 0.075, 0.1, dy 0.5 weights_neighbor3 = (images_lab_sim_nei3 >= 0.05).float() * target_masks_sum weights_neighbor4 = (images_lab_sim_nei4 >= 0.05).float() * target_masks_sum warmup_factor = min(self._iter.item() / float(self._warmup_iters), 1.0) #1.0 loss_pairwise = (pairwise_losses * weights).sum() / weights.sum().clamp(min=1.0) loss_pairwise_neighbor = (pairwise_losses_neighbor * weights_neighbor).sum() / weights_neighbor.sum().clamp(min=1.0) * warmup_factor loss_pairwise_neighbor1 = (pairwise_losses_neighbor1 * weights_neighbor1).sum() / weights_neighbor1.sum().clamp(min=1.0) * warmup_factor loss_pairwise_neighbor2 = (pairwise_losses_neighbor2 * weights_neighbor2).sum() / weights_neighbor2.sum().clamp(min=1.0) * warmup_factor loss_pairwise_neighbor3 = (pairwise_losses_neighbor3 * weights_neighbor3).sum() / weights_neighbor3.sum().clamp(min=1.0) * warmup_factor loss_pairwise_neighbor4 = (pairwise_losses_neighbor4 * weights_neighbor4).sum() / weights_neighbor4.sum().clamp(min=1.0) * warmup_factor else: loss_prj_term = src_masks.sum() * 0. loss_pairwise = src_masks.sum() * 0. loss_pairwise_neighbor = src_masks.sum() * 0. loss_pairwise_neighbor1 = src_masks.sum() * 0. loss_pairwise_neighbor2 = src_masks.sum() * 0. loss_pairwise_neighbor3 = src_masks.sum() * 0. loss_pairwise_neighbor4 = src_masks.sum() * 0. # print('loss_proj term:', loss_prj_term) losses = { "loss_mask": loss_prj_term, "loss_bound": loss_pairwise, "loss_bound_neighbor": (loss_pairwise_neighbor + loss_pairwise_neighbor1 + loss_pairwise_neighbor2 + loss_pairwise_neighbor3 + loss_pairwise_neighbor4) * 0.1, # * 0.33 } del src_masks del target_masks return losses def _get_src_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) src_idx = torch.cat([src for (src, _) in indices]) return batch_idx, src_idx def _get_tgt_permutation_idx(self, indices): # permute targets following indices batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) tgt_idx = torch.cat([tgt for (_, tgt) in indices]) return batch_idx, tgt_idx def get_loss(self, loss, outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2, images_lab_sim_nei3, images_lab_sim_nei4): loss_map = { 'labels': self.loss_labels, 'masks': self.loss_masks_proj, } assert loss in loss_map, f"do you really want to compute {loss} loss?" if loss == 'masks': return loss_map[loss](outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2, images_lab_sim_nei3, images_lab_sim_nei4) else: return loss_map[loss](outputs, targets, indices, num_masks) def forward(self, outputs, targets, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2, images_lab_sim_nei3, images_lab_sim_nei4): """This performs the loss computation. Parameters: outputs: dict of tensors, see the output specification of the model for the format targets: list of dicts, such that len(targets) == batch_size. 
The expected keys in each dict depends on the losses applied, see each loss' doc """ outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) # Compute the average number of target boxes accross all nodes, for normalization purposes num_masks = sum(len(t["labels"]) for t in targets) num_masks = torch.as_tensor( [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device ) if is_dist_avail_and_initialized(): torch.distributed.all_reduce(num_masks) num_masks = torch.clamp(num_masks / get_world_size(), min=1).item() # Compute all the requested losses losses = {} for loss in self.losses: losses.update(self.get_loss(loss, outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2, images_lab_sim_nei3, images_lab_sim_nei4)) # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. if "aux_outputs" in outputs: for i, aux_outputs in enumerate(outputs["aux_outputs"]): indices = self.matcher(aux_outputs, targets) for loss in self.losses: l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2, images_lab_sim_nei3, images_lab_sim_nei4) l_dict = {k + f"_{i}": v for k, v in l_dict.items()} losses.update(l_dict) return losses def __repr__(self): head = "Criterion " + self.__class__.__name__ body = [ "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)), "losses: {}".format(self.losses), "weight_dict: {}".format(self.weight_dict), "num_classes: {}".format(self.num_classes), "eos_coef: {}".format(self.eos_coef), "num_points: {}".format(self.num_points), "oversample_ratio: {}".format(self.oversample_ratio), "importance_sample_ratio: {}".format(self.importance_sample_ratio), ] _repr_indent = 4 lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: mfvis_nococo/mask2former_video/modeling/matcher.py ================================================ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py """ Modules to compute the matching cost and solve the corresponding LSAP. """ import torch import torch.nn.functional as F from scipy.optimize import linear_sum_assignment from torch import nn from torch.cuda.amp import autocast from detectron2.projects.point_rend.point_features import point_sample def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: """ Compute the bounding boxes around the provided masks. Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. Args: masks (Tensor[N, H, W]): masks to transform where N is the number of masks and (H, W) are the spatial dimensions. Returns: Tensor[N, 4]: bounding boxes """ if masks.numel() == 0: return masks n = masks.shape[0] masks = masks.flatten(0, 1) for index, mask in enumerate(masks): y, x = torch.where(mask != 0) if len(x) * len(y) == 0: continue masks[index, torch.min(y):torch.max(y)+1, torch.min(x):torch.max(x)+1] = 1.0 masks = masks.view(n, -1, masks.shape[-2], masks.shape[-1]) return masks def masks_to_boxes_new(masks: torch.Tensor) -> torch.Tensor: """ Compute the bounding boxes around the provided masks. Returns a [N, 4] tensor containing bounding boxes. 
The boxes are in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. Args: masks (Tensor[N, H, W]): masks to transform where N is the number of masks and (H, W) are the spatial dimensions. Returns: Tensor[N, 4]: bounding boxes """ if masks.numel() == 0: return masks n, _, h, w = masks.shape masks = masks.flatten(0, 1) y = torch.arange(0, h, dtype=torch.float).to(masks.device) x = torch.arange(0, w, dtype=torch.float).to(masks.device) y, x = torch.meshgrid(y, x) x_mask = (masks * x.unsqueeze(0)) x_max = x_mask.flatten(1).max(-1)[0] + 1 x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] y_mask = (masks * y.unsqueeze(0)) y_max = y_mask.flatten(1).max(-1)[0] + 1 y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] boxes = torch.stack([x_min, y_min, x_max, y_max], 1) mem_mask = torch.zeros_like(masks) hMask = torch.logical_or(torch.arange(h).unsqueeze(0).to(boxes) < boxes[:, 1, None], torch.arange(h).unsqueeze(0).to(boxes) >= boxes[:, 3, None]) wMask = torch.logical_or(torch.arange(w).unsqueeze(0).to(boxes) < boxes[:, 0, None], torch.arange(w).unsqueeze(0).to(boxes) >= boxes[:, 2, None]) mem_mask = torch.logical_or(hMask.unsqueeze(2), wMask.unsqueeze(1)).float() mem_mask = 1.0 - mem_mask.view(n, -1, masks.shape[-2], masks.shape[-1]) return mem_mask def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): """ Compute the DICE loss, similar to generalized IOU for masks Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). """ inputs = inputs.sigmoid() inputs = inputs.flatten(1) numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] loss = 1 - (numerator + 1) / (denominator + 1) return loss def batch_dice_loss_nosig(inputs: torch.Tensor, targets: torch.Tensor): """ Compute the DICE loss, similar to generalized IOU for masks Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). """ # inputs = inputs.sigmoid() inputs = inputs.flatten(1) numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] loss = 1 - (numerator + 1) / (denominator + 1) return loss batch_dice_loss_jit = torch.jit.script( batch_dice_loss ) # type: torch.jit.ScriptModule batch_dice_loss_jit_nosig = torch.jit.script( batch_dice_loss_nosig ) # type: torch.jit.ScriptModule def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): """ Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class).
Returns: Loss tensor """ hw = inputs.shape[1] pos = F.binary_cross_entropy_with_logits( inputs, torch.ones_like(inputs), reduction="none" ) neg = F.binary_cross_entropy_with_logits( inputs, torch.zeros_like(inputs), reduction="none" ) loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( "nc,mc->nm", neg, (1 - targets) ) return loss / hw batch_sigmoid_ce_loss_jit = torch.jit.script( batch_sigmoid_ce_loss ) # type: torch.jit.ScriptModule class VideoHungarianMatcher(nn.Module): """This class computes an assignment between the targets and the predictions of the network For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are un-matched (and thus treated as non-objects). """ def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0): """Creates the matcher Params: cost_class: This is the relative weight of the classification error in the matching cost cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost """ super().__init__() self.cost_class = cost_class self.cost_mask = cost_mask self.cost_dice = cost_dice assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0" self.num_points = num_points @torch.no_grad() def memory_efficient_forward(self, outputs, targets): """More memory-friendly matching""" bs, num_queries = outputs["pred_logits"].shape[:2] indices = [] # Iterate through batch size for b in range(bs): out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] tgt_ids = targets[b]["labels"] # Compute the classification cost. Contrary to the loss, we don't use the NLL, # but approximate it in 1 - proba[target class]. # The 1 is a constant that doesn't change the matching, it can be ommitted. cost_class = -out_prob[:, tgt_ids] out_mask = outputs["pred_masks"][b] # [num_queries, T, H_pred, W_pred] out_mask = masks_to_boxes_new((out_mask.sigmoid() > 0.5).float()) # gt masks are already padded when preparing target tgt_mask = targets[b]["masks"].to(out_mask) # [num_gts, T, H_pred, W_pred] tgt_mask = masks_to_boxes(tgt_mask) # all masks share the same set of points for efficient matching! 
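# self.num_points random (x, y) locations in [0, 1) are drawn once per sample; both the predicted and
# the ground-truth box masks are evaluated at exactly these points, so the dice cost below compares
# like with like without rasterizing full-resolution masks.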
point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device) # get gt labels tgt_mask = point_sample( tgt_mask, point_coords.repeat(tgt_mask.shape[0], 1, 1), align_corners=False, ).flatten(1) out_mask = point_sample( out_mask, point_coords.repeat(out_mask.shape[0], 1, 1), align_corners=False, ).flatten(1) with autocast(enabled=False): out_mask = out_mask.float() tgt_mask = tgt_mask.float() cost_dice_nosig = batch_dice_loss_jit_nosig(out_mask, tgt_mask) C = ( self.cost_class * cost_class + self.cost_dice * cost_dice_nosig ) C = C.reshape(num_queries, -1).cpu() indices.append(linear_sum_assignment(C)) return [ (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices ] @torch.no_grad() def forward(self, outputs, targets): """Performs the matching Params: outputs: This is a dict that contains at least these entries: "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth objects in the target) containing the class labels "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks Returns: A list of size batch_size, containing tuples of (index_i, index_j) where: - index_i is the indices of the selected predictions (in order) - index_j is the indices of the corresponding selected targets (in order) For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) """ return self.memory_efficient_forward(outputs, targets) def __repr__(self, _repr_indent=4): head = "Matcher " + self.__class__.__name__ body = [ "cost_class: {}".format(self.cost_class), "cost_mask: {}".format(self.cost_mask), "cost_dice: {}".format(self.cost_dice), ] lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: mfvis_nococo/mask2former_video/modeling/transformer_decoder/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder ================================================ FILE: mfvis_nococo/mask2former_video/modeling/transformer_decoder/position_encoding.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py """ Various positional encodings for the transformer. """ import math import torch from torch import nn class PositionEmbeddingSine3D(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. 
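Here it is further generalized to clips: a third sine/cosine embedding over the frame index is added to the usual 2D spatial encoding, and the expected input/output layout is (batch, frames, channels, height, width).

Example (illustrative sizes; the video decoder passes hidden_dim // 2 as num_pos_feats)::

    pe = PositionEmbeddingSine3D(num_pos_feats=128, normalize=True)
    feats = torch.zeros(2, 5, 256, 16, 29)  # (batch, frames, channels, height, width)
    pos = pe(feats)                         # same shape; added to the per-level features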
""" def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): super().__init__() self.num_pos_feats = num_pos_feats self.temperature = temperature self.normalize = normalize if scale is not None and normalize is False: raise ValueError("normalize should be True if scale is passed") if scale is None: scale = 2 * math.pi self.scale = scale def forward(self, x, mask=None): # b, t, c, h, w assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead" if mask is None: mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool) not_mask = ~mask z_embed = not_mask.cumsum(1, dtype=torch.float32) y_embed = not_mask.cumsum(2, dtype=torch.float32) x_embed = not_mask.cumsum(3, dtype=torch.float32) if self.normalize: eps = 1e-6 z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device) dim_t_z = self.temperature ** (2 * (dim_t_z // 2) / (self.num_pos_feats * 2)) pos_x = x_embed[:, :, :, :, None] / dim_t pos_y = y_embed[:, :, :, :, None] / dim_t pos_z = z_embed[:, :, :, :, None] / dim_t_z pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) pos_z = torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) # b, t, c, h, w return pos ================================================ FILE: mfvis_nococo/mask2former_video/modeling/transformer_decoder/video_mask2former_transformer_decoder.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py import logging import fvcore.nn.weight_init as weight_init from typing import Optional import torch from torch import nn, Tensor from torch.nn import functional as F from detectron2.config import configurable from detectron2.layers import Conv2d from mask2former.modeling.transformer_decoder.maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY from .position_encoding import PositionEmbeddingSine3D class SelfAttentionLayer(nn.Module): def __init__(self, d_model, nhead, dropout=0.0, activation="relu", normalize_before=False): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.norm = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): q = k = self.with_pos_embed(tgt, query_pos) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.norm(tgt) q = k = self.with_pos_embed(tgt2, query_pos) tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): if self.normalize_before: return self.forward_pre(tgt, tgt_mask, tgt_key_padding_mask, query_pos) return self.forward_post(tgt, tgt_mask, tgt_key_padding_mask, query_pos) class CrossAttentionLayer(nn.Module): def __init__(self, d_model, nhead, dropout=0.0, activation="relu", normalize_before=False): super().__init__() self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.norm = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.norm(tgt) tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), key=self.with_pos_embed(memory, pos), value=memory, 
attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): if self.normalize_before: return self.forward_pre(tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos) return self.forward_post(tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos) class FFNLayer(nn.Module): def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, activation="relu", normalize_before=False): super().__init__() # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm = nn.LayerNorm(d_model) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt): tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt): tgt2 = self.norm(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt): if self.normalize_before: return self.forward_pre(tgt) return self.forward_post(tgt) def _get_activation_fn(activation): """Return an activation function given a string""" if activation == "relu": return F.relu if activation == "gelu": return F.gelu if activation == "glu": return F.glu raise RuntimeError(F"activation should be relu/gelu, not {activation}.") class MLP(nn.Module): """ Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) def forward(self, x): for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x @TRANSFORMER_DECODER_REGISTRY.register() class VideoMultiScaleMaskedTransformerDecoder(nn.Module): _version = 2 def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k if "static_query" in k: newk = k.replace("static_query", "query_feat") if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, in_channels, mask_classification=True, *, num_classes: int, hidden_dim: int, num_queries: int, nheads: int, dim_feedforward: int, dec_layers: int, pre_norm: bool, mask_dim: int, enforce_input_project: bool, # video related num_frames, ): """ NOTE: this interface is experimental. 
Args: in_channels: channels of the input features mask_classification: whether to add mask classifier or not num_classes: number of classes hidden_dim: Transformer feature dimension num_queries: number of queries nheads: number of heads dim_feedforward: feature dimension in feedforward network enc_layers: number of Transformer encoder layers dec_layers: number of Transformer decoder layers pre_norm: whether to use pre-LayerNorm or not mask_dim: mask feature dimension enforce_input_project: add input project 1x1 conv even if input channels and hidden dim is identical """ super().__init__() assert mask_classification, "Only support mask classification model" self.mask_classification = mask_classification self.num_frames = num_frames # positional encoding N_steps = hidden_dim // 2 self.pe_layer = PositionEmbeddingSine3D(N_steps, normalize=True) # define Transformer decoder here self.num_heads = nheads self.num_layers = dec_layers self.transformer_self_attention_layers = nn.ModuleList() self.transformer_cross_attention_layers = nn.ModuleList() self.transformer_ffn_layers = nn.ModuleList() for _ in range(self.num_layers): self.transformer_self_attention_layers.append( SelfAttentionLayer( d_model=hidden_dim, nhead=nheads, dropout=0.0, normalize_before=pre_norm, ) ) self.transformer_cross_attention_layers.append( CrossAttentionLayer( d_model=hidden_dim, nhead=nheads, dropout=0.0, normalize_before=pre_norm, ) ) self.transformer_ffn_layers.append( FFNLayer( d_model=hidden_dim, dim_feedforward=dim_feedforward, dropout=0.0, normalize_before=pre_norm, ) ) self.decoder_norm = nn.LayerNorm(hidden_dim) self.num_queries = num_queries # learnable query features self.query_feat = nn.Embedding(num_queries, hidden_dim) # learnable query p.e. self.query_embed = nn.Embedding(num_queries, hidden_dim) # level embedding (we always use 3 scales) self.num_feature_levels = 3 self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim) self.input_proj = nn.ModuleList() for _ in range(self.num_feature_levels): if in_channels != hidden_dim or enforce_input_project: self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1)) weight_init.c2_xavier_fill(self.input_proj[-1]) else: self.input_proj.append(nn.Sequential()) # output FFNs if self.mask_classification: self.class_embed = nn.Linear(hidden_dim, num_classes + 1) self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) @classmethod def from_config(cls, cfg, in_channels, mask_classification): ret = {} ret["in_channels"] = in_channels ret["mask_classification"] = mask_classification ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES # Transformer parameters: ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD # NOTE: because we add learnable query features which requires supervision, # we add minus 1 to decoder layers to be consistent with our loss # implementation: that is, number of auxiliary losses is always # equal to number of decoder layers. With learnable query features, the number of # auxiliary losses equals number of decoders plus 1. 
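# For example, DEC_LAYERS = 10 in the baseline configs gives 9 decoder blocks below; together with the
# extra prediction made directly on the learnable queries, 10 outputs are supervised in total.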
assert cfg.MODEL.MASK_FORMER.DEC_LAYERS >= 1 ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS - 1 ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["num_frames"] = cfg.INPUT.SAMPLING_FRAME_NUM return ret def forward(self, x, mask_features, mask = None): bt, c_m, h_m, w_m = mask_features.shape bs = bt // self.num_frames if self.training else 1 t = bt // bs mask_features = mask_features.view(bs, t, c_m, h_m, w_m) # x is a list of multi-scale feature assert len(x) == self.num_feature_levels src = [] pos = [] size_list = [] # disable mask, it does not affect performance del mask for i in range(self.num_feature_levels): size_list.append(x[i].shape[-2:]) pos.append(self.pe_layer(x[i].view(bs, t, -1, size_list[-1][0], size_list[-1][1]), None).flatten(3)) src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None]) # NTxCxHW => NxTxCxHW => (TxHW)xNxC _, c, hw = src[-1].shape pos[-1] = pos[-1].view(bs, t, c, hw).permute(1, 3, 0, 2).flatten(0, 1) src[-1] = src[-1].view(bs, t, c, hw).permute(1, 3, 0, 2).flatten(0, 1) # QxNxC query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1) predictions_class = [] predictions_mask = [] # prediction heads on learnable query features outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0]) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) for i in range(self.num_layers): level_index = i % self.num_feature_levels attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False # attention: cross-attention first output = self.transformer_cross_attention_layers[i]( output, src[level_index], memory_mask=attn_mask, memory_key_padding_mask=None, # here we do not apply masking on padded region pos=pos[level_index], query_pos=query_embed ) output = self.transformer_self_attention_layers[i]( output, tgt_mask=None, tgt_key_padding_mask=None, query_pos=query_embed ) # FFN output = self.transformer_ffn_layers[i]( output ) outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels]) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) assert len(predictions_class) == self.num_layers + 1 out = { 'pred_logits': predictions_class[-1], 'pred_masks': predictions_mask[-1], 'aux_outputs': self._set_aux_loss( predictions_class if self.mask_classification else None, predictions_mask ) } return out def forward_prediction_heads(self, output, mask_features, attn_mask_target_size): decoder_output = self.decoder_norm(output) decoder_output = decoder_output.transpose(0, 1) outputs_class = self.class_embed(decoder_output) mask_embed = self.mask_embed(decoder_output) outputs_mask = torch.einsum("bqc,btchw->bqthw", mask_embed, mask_features) b, q, t, _, _ = outputs_mask.shape # NOTE: prediction is of higher-resolution # [B, Q, T, H, W] -> [B, Q, T*H*W] -> [B, h, Q, T*H*W] -> [B*h, Q, T*HW] attn_mask = F.interpolate(outputs_mask.flatten(0, 1), size=attn_mask_target_size, mode="bilinear", align_corners=False).view( b, q, t, attn_mask_target_size[0], attn_mask_target_size[1]) # must use bool type # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged. 
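# True entries are the positions a query may NOT attend to, i.e. pixels where its current mask
# prediction is below 0.5 after the sigmoid; queries whose mask is empty at a given scale are reset
# to attend everywhere at the top of the decoder loop above.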
attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool() attn_mask = attn_mask.detach() return outputs_class, outputs_mask, attn_mask @torch.jit.unused def _set_aux_loss(self, outputs_class, outputs_seg_masks): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. if self.mask_classification: return [ {"pred_logits": a, "pred_masks": b} for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) ] else: return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] ================================================ FILE: mfvis_nococo/mask2former_video/utils/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: mfvis_nococo/mask2former_video/utils/memory.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging from contextlib import contextmanager from functools import wraps import torch from torch.cuda.amp import autocast __all__ = ["retry_if_cuda_oom"] @contextmanager def _ignore_torch_cuda_oom(): """ A context which ignores CUDA OOM exception from pytorch. """ try: yield except RuntimeError as e: # NOTE: the string may change? if "CUDA out of memory. " in str(e): pass else: raise def retry_if_cuda_oom(func): """ Makes a function retry itself after encountering pytorch's CUDA OOM error. It will first retry after calling `torch.cuda.empty_cache()`. If that still fails, it will then retry by trying to convert inputs to CPUs. In this case, it expects the function to dispatch to CPU implementation. The return values may become CPU tensors as well and it's user's responsibility to convert it back to CUDA tensor if needed. Args: func: a stateless callable that takes tensor-like objects as arguments Returns: a callable which retries `func` if OOM is encountered. Examples: :: output = retry_if_cuda_oom(some_torch_function)(input1, input2) # output may be on CPU even if inputs are on GPU Note: 1. When converting inputs to CPU, it will only look at each argument and check if it has `.device` and `.to` for conversion. Nested structures of tensors are not supported. 2. Since the function might be called more than once, it has to be stateless. """ def maybe_to_cpu(x): try: like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") except AttributeError: like_gpu_tensor = False if like_gpu_tensor: return x.to(device="cpu").to(torch.float32) else: return x @wraps(func) def wrapped(*args, **kwargs): with _ignore_torch_cuda_oom(): return func(*args, **kwargs) # Clear cache and retry torch.cuda.empty_cache() with _ignore_torch_cuda_oom(): return func(*args, **kwargs) # Try on CPU. This slows down the code significantly, therefore print a notice. logger = logging.getLogger(__name__) logger.info("Attempting to copy inputs to CPU due to CUDA OOM") new_args = (maybe_to_cpu(x) for x in args) new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} with autocast(enabled=False): return func(*new_args, **new_kwargs) return wrapped ================================================ FILE: mfvis_nococo/mask2former_video/video_maskformer_model.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
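The model file that follows builds its mask-free supervision from local colour similarity between LAB pixels. Below is a self-contained sketch of that pattern, mirroring `unfold_wo_center` / `get_images_color_similarity` defined in the file; the helper name and the toy input are illustrative and not part of the repository.

```
import torch
import torch.nn.functional as F

def colour_affinity(lab, kernel_size=3, dilation=2):
    # lab: (1, 3, H, W) LAB image; returns (1, k*k - 1, H, W) affinities in (0, 1]
    assert lab.dim() == 4 and kernel_size % 2 == 1
    pad = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    # gather every dilated neighbour of every pixel: (1, 3, k*k, H, W)
    neigh = F.unfold(lab, kernel_size=kernel_size, padding=pad, dilation=dilation)
    neigh = neigh.reshape(lab.size(0), lab.size(1), -1, lab.size(2), lab.size(3))
    centre = kernel_size ** 2 // 2
    neigh = torch.cat((neigh[:, :, :centre], neigh[:, :, centre + 1:]), dim=2)  # drop the centre pixel
    diff = lab[:, :, None] - neigh                    # colour difference to each remaining neighbour
    return torch.exp(-torch.norm(diff, dim=1) * 0.5)  # close colours -> affinity near 1

sim = colour_affinity(torch.randn(1, 3, 90, 160))     # toy "LAB" values, arbitrary
print(sim.shape)                                      # torch.Size([1, 8, 90, 160])
```

Affinities like these are thresholded in the criterion (e.g. `images_lab_sim_nei* >= 0.05`) to decide which neighbouring pixels should share a label under the pairwise losses.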
import logging import math from typing import Tuple import torch from torch import nn from torch.nn import functional as F from detectron2.config import configurable from detectron2.data import MetadataCatalog from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head from detectron2.modeling.backbone import Backbone from detectron2.modeling.postprocessing import sem_seg_postprocess from detectron2.structures import Boxes, ImageList, Instances, BitMasks from .modeling.criterion import VideoSetCriterion from .modeling.matcher import VideoHungarianMatcher from .utils.memory import retry_if_cuda_oom from skimage import color import cv2 import numpy as np def unfold_wo_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) # remove the center pixels size = kernel_size ** 2 unfolded_x = torch.cat(( unfolded_x[:, :, :size // 2], unfolded_x[:, :, size // 2 + 1:] ), dim=2) return unfolded_x def unfold_w_center(x, kernel_size, dilation): assert x.dim() == 4 assert kernel_size % 2 == 1 # using SAME padding padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 unfolded_x = F.unfold( x, kernel_size=kernel_size, padding=padding, dilation=dilation ) unfolded_x = unfolded_x.reshape( x.size(0), x.size(1), -1, x.size(2), x.size(3) ) return unfolded_x def get_images_color_similarity(images, kernel_size, dilation): assert images.dim() == 4 assert images.size(0) == 1 unfolded_images = unfold_wo_center( images, kernel_size=kernel_size, dilation=dilation ) diff = images[:, :, None] - unfolded_images similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5) return similarity def get_neighbor_images_color_similarity(images, images_neighbor, kernel_size, dilation): assert images.dim() == 4 assert images.size(0) == 1 unfolded_images = unfold_w_center( images, kernel_size=kernel_size, dilation=dilation ) diff = images_neighbor[:, :, None] - unfolded_images similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5) return similarity def get_neighbor_images_patch_color_similarity(images, images_neighbor, kernel_size, dilation): assert images.dim() == 4 assert images.size(0) == 1 unfolded_images = unfold_w_center( images, kernel_size=kernel_size, dilation= 1 #dilation ) unfolded_images_neighbor = unfold_w_center( images_neighbor, kernel_size=kernel_size, dilation= 1 #dilation ) unfolded_images = unfolded_images.flatten(1,2) unfolded_images_neighbor = unfolded_images_neighbor.flatten(1,2) similarity = get_neighbor_images_color_similarity(unfolded_images, unfolded_images_neighbor, 3, 3) return similarity logger = logging.getLogger(__name__) @META_ARCH_REGISTRY.register() class VideoMaskFormer(nn.Module): """ Main class for mask classification semantic segmentation architectures. 
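In this MaskFreeVIS variant the mask supervision comes from a box-projection term (loss_mask) plus spatial and temporal pairwise colour-similarity terms (loss_bound, loss_bound_neighbor) computed from the LAB frames prepared in forward, rather than from per-pixel mask annotations.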
""" @configurable def __init__( self, *, backbone: Backbone, sem_seg_head: nn.Module, criterion: nn.Module, num_queries: int, object_mask_threshold: float, overlap_threshold: float, metadata, size_divisibility: int, sem_seg_postprocess_before_inference: bool, pixel_mean: Tuple[float], pixel_std: Tuple[float], # video num_frames, ): """ Args: backbone: a backbone module, must follow detectron2's backbone interface sem_seg_head: a module that predicts semantic segmentation from backbone features criterion: a module that defines the loss num_queries: int, number of queries object_mask_threshold: float, threshold to filter query based on classification score for panoptic segmentation inference overlap_threshold: overlap threshold used in general inference for panoptic segmentation metadata: dataset meta, get `thing` and `stuff` category names for panoptic segmentation inference size_divisibility: Some backbones require the input height and width to be divisible by a specific integer. We can use this to override such requirement. sem_seg_postprocess_before_inference: whether to resize the prediction back to original input size before semantic segmentation inference or after. For high-resolution dataset like Mapillary, resizing predictions before inference will cause OOM error. pixel_mean, pixel_std: list or tuple with #channels element, representing the per-channel mean and std to be used to normalize the input image semantic_on: bool, whether to output semantic segmentation prediction instance_on: bool, whether to output instance segmentation prediction panoptic_on: bool, whether to output panoptic segmentation prediction test_topk_per_image: int, instance segmentation parameter, keep topk instances per image """ super().__init__() self.backbone = backbone self.sem_seg_head = sem_seg_head self.criterion = criterion self.num_queries = num_queries self.overlap_threshold = overlap_threshold self.object_mask_threshold = object_mask_threshold self.metadata = metadata if size_divisibility < 0: # use backbone size_divisibility if not set size_divisibility = self.backbone.size_divisibility self.size_divisibility = size_divisibility self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) self.num_frames = num_frames #self.structure_fc = nn.Conv2d(27, 256, 1) @classmethod def from_config(cls, cfg): backbone = build_backbone(cfg) sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape()) # Loss parameters: deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT # loss weights class_weight = cfg.MODEL.MASK_FORMER.CLASS_WEIGHT dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT # building criterion matcher = VideoHungarianMatcher( cost_class=class_weight, cost_mask=mask_weight, cost_dice=dice_weight, num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS, ) weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, "loss_dice": dice_weight, "loss_bound": mask_weight, "loss_bound_neighbor": mask_weight} if deep_supervision: dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS aux_weight_dict = {} for i in range(dec_layers - 1): aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) losses = ["labels", "masks"] criterion = VideoSetCriterion( sem_seg_head.num_classes, 
matcher=matcher, weight_dict=weight_dict, eos_coef=no_object_weight, losses=losses, num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS, oversample_ratio=cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO, importance_sample_ratio=cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO, ) return { "backbone": backbone, "sem_seg_head": sem_seg_head, "criterion": criterion, "num_queries": cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES, "object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD, "overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD, "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY, "sem_seg_postprocess_before_inference": True, "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, # video "num_frames": cfg.INPUT.SAMPLING_FRAME_NUM, } @property def device(self): return self.pixel_mean.device def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper`. Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * "image": Tensor, image in (C, H, W) format. * "instances": per-region ground truth * Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model (may be different from input resolution), used in inference. Returns: list[dict]: each dict has the results for one image. The dict contains the following keys: * "sem_seg": A Tensor that represents the per-pixel segmentation prediced by the head. The prediction has shape KxHxW that represents the logits of each class for each pixel. * "panoptic_seg": A tuple that represent panoptic output panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. segments_info (list[dict]): Describe each segment in `panoptic_seg`. Each dict contains keys "id", "category_id", "isthing". 
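For this video model, inference actually returns a single dict per input clip with keys "image_size", "pred_scores", "pred_labels" and "pred_masks", produced by inference_video below; the "sem_seg" / "panoptic_seg" entries described above come from the image-level model and are not emitted here.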
""" images = [] for video in batched_inputs: for frame in video["image"]: images.append(frame.to(self.device)) if self.training: k_size = 3 #3 rs_images = ImageList.from_tensors(images, self.size_divisibility) downsampled_images = F.avg_pool2d(rs_images.tensor.float(), kernel_size=4, stride=4, padding=0) #for img in images] images_lab = [torch.as_tensor(color.rgb2lab(ds_image[[2, 1, 0]].byte().permute(1, 2, 0).cpu().numpy()), device=ds_image.device, dtype=torch.float32).permute(2, 0, 1) for ds_image in downsampled_images] images_lab_sim = [get_images_color_similarity(img_lab.unsqueeze(0), k_size, 2) for img_lab in images_lab] # ori is 0.3, 0.5, 0.7 images_lab_sim_nei = [get_neighbor_images_patch_color_similarity(images_lab[ii].unsqueeze(0), images_lab[ii+1].unsqueeze(0), 3, 3) for ii in range(0, len(images_lab), 5)] # ori dilation is 3 images_lab_sim_nei1 = [get_neighbor_images_patch_color_similarity(images_lab[ii+1].unsqueeze(0), images_lab[ii+2].unsqueeze(0), 3, 3) for ii in range(0, len(images_lab), 5)] images_lab_sim_nei2 = [get_neighbor_images_patch_color_similarity(images_lab[ii+2].unsqueeze(0), images_lab[ii+3].unsqueeze(0), 3, 3) for ii in range(0, len(images_lab), 5)] images_lab_sim_nei3 = [get_neighbor_images_patch_color_similarity(images_lab[ii+3].unsqueeze(0), images_lab[ii+4].unsqueeze(0), 3, 3) for ii in range(0, len(images_lab), 5)] images_lab_sim_nei4 = [get_neighbor_images_patch_color_similarity(images_lab[ii+4].unsqueeze(0), images_lab[ii].unsqueeze(0), 3, 3) for ii in range(0, len(images_lab), 5)] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.size_divisibility) features = self.backbone(images.tensor) outputs = self.sem_seg_head(features) if self.training: # mask classification target targets = self.prepare_targets(batched_inputs, images) # bipartite matching-based loss losses = self.criterion(outputs, targets, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2, images_lab_sim_nei3, images_lab_sim_nei4) for k in list(losses.keys()): if k in self.criterion.weight_dict: losses[k] *= self.criterion.weight_dict[k] else: # remove this loss if not specified in `weight_dict` losses.pop(k) return losses else: mask_cls_results = outputs["pred_logits"] mask_pred_results = outputs["pred_masks"] mask_cls_result = mask_cls_results[0] # upsample masks mask_pred_result = retry_if_cuda_oom(F.interpolate)( mask_pred_results[0], size=(images.tensor.shape[-2], images.tensor.shape[-1]), mode="bilinear", align_corners=False, ) del outputs input_per_image = batched_inputs[0] image_size = images.image_sizes[0] # image size without padding after data augmentation height = input_per_image.get("height", image_size[0]) # raw image size before data augmentation width = input_per_image.get("width", image_size[1]) return retry_if_cuda_oom(self.inference_video)(mask_cls_result, mask_pred_result, image_size, height, width) def prepare_targets(self, targets, images): h_pad, w_pad = images.tensor.shape[-2:] gt_instances = [] for targets_per_video in targets: _num_instance = len(targets_per_video["instances"][0]) mask_shape = [_num_instance, self.num_frames, h_pad, w_pad] gt_masks_per_video = torch.zeros(mask_shape, dtype=torch.bool, device=self.device) gt_ids_per_video = [] for f_i, targets_per_frame in enumerate(targets_per_video["instances"]): targets_per_frame = targets_per_frame.to(self.device) h, w = targets_per_frame.image_size gt_ids_per_video.append(targets_per_frame.gt_ids[:, None]) gt_masks_per_video[:, 
f_i, :h, :w] = targets_per_frame.gt_masks.tensor gt_ids_per_video = torch.cat(gt_ids_per_video, dim=1) valid_idx = (gt_ids_per_video != -1).any(dim=-1) gt_classes_per_video = targets_per_frame.gt_classes[valid_idx] # N, gt_ids_per_video = gt_ids_per_video[valid_idx] # N, num_frames gt_instances.append({"labels": gt_classes_per_video, "ids": gt_ids_per_video}) gt_masks_per_video = gt_masks_per_video[valid_idx].float() # N, num_frames, H, W gt_instances[-1].update({"masks": gt_masks_per_video}) return gt_instances def inference_video(self, pred_cls, pred_masks, img_size, output_height, output_width): if len(pred_cls) > 0: scores = F.softmax(pred_cls, dim=-1)[:, :-1] labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1) # keep top-10 predictions scores_per_image, topk_indices = scores.flatten(0, 1).topk(10, sorted=False) labels_per_image = labels[topk_indices] topk_indices = topk_indices // self.sem_seg_head.num_classes pred_masks = pred_masks[topk_indices] pred_masks = pred_masks[:, :, : img_size[0], : img_size[1]] pred_masks = F.interpolate( pred_masks, size=(output_height, output_width), mode="bilinear", align_corners=False ) masks = pred_masks > 0. out_scores = scores_per_image.tolist() out_labels = labels_per_image.tolist() out_masks = [m for m in masks.cpu()] else: out_scores = [] out_labels = [] out_masks = [] video_output = { "image_size": (output_height, output_width), "pred_scores": out_scores, "pred_labels": out_labels, "pred_masks": out_masks, } return video_output ================================================ FILE: mfvis_nococo/scripts/eval_8gpu_mask2former_r101_video.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` ID=159 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus 8 --resume --dist-url tcp://0.0.0.0:12349\ --config-file configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml\ --eval-only MODEL.WEIGHTS ../mfvis_models/model_final_r101_0473.pth ================================================ FILE: mfvis_nococo/scripts/train_8gpu_mask2former_r101_video_coco.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` ID=159 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus 8 --resume --dist-url tcp://0.0.0.0:12349\ --config-file configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep_coco.yaml ================================================ FILE: mfvis_nococo/scripts/train_8gpu_mask2former_r50_video.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` ID=159 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus 8 --resume --dist-url tcp://0.0.0.0:12349\ --config-file configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml ================================================ FILE: mfvis_nococo/scripts/train_8gpu_mask2former_r50_video_coco.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` ID=159 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus 8 --resume --dist-url tcp://0.0.0.0:12349\ --config-file configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep_coco.yaml ================================================ FILE: mfvis_nococo/scripts/visual_video_r101.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` CUDA_VISIBLE_DEVICES=0 python3 demo_video/demo.py --config-file 
configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml --save-frames True \ --input './datasets/ytvis_2019/valid/JPEGImages/' \ --output 'box_patch_newknn_r101_vis/' \ --opts MODEL.WEIGHTS ../mfvis_models/model_final_r101_0473.pth ================================================ FILE: mfvis_nococo/scripts/visual_video_r50.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` CUDA_VISIBLE_DEVICES=0 python3 demo_video/demo.py --config-file configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml --save-frames True \ --input './datasets/ytvis_2019/valid/JPEGImages/' \ --output 'box_patch_newknn_r50_vis/' \ --opts MODEL.WEIGHTS ./mfvis_models/model_final_r50_0438.pth ================================================ FILE: mfvis_nococo/train_net_video.py ================================================ """ This script is a simplified version of the training script in detectron2/tools. """ try: # ignore ShapelyDeprecationWarning from fvcore from shapely.errors import ShapelyDeprecationWarning import warnings warnings.filterwarnings('ignore', category=ShapelyDeprecationWarning) except: pass import copy import itertools import logging import os from collections import OrderedDict from typing import Any, Dict, List, Set import torch import detectron2.utils.comm as comm from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg from detectron2.data import MetadataCatalog from detectron2.engine import ( DefaultTrainer, default_argument_parser, default_setup, launch, ) from detectron2.evaluation import ( DatasetEvaluator, inference_on_dataset, print_csv_format, verify_results, ) from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler from detectron2.solver.build import maybe_add_gradient_clipping from detectron2.utils.logger import setup_logger # MaskFormer from mask2former import add_maskformer2_config from mask2former_video import ( YTVISDatasetMapper, YTVISEvaluator, add_maskformer2_video_config, build_detection_train_loader, build_detection_test_loader, get_detection_dataset_dicts, ) class Trainer(DefaultTrainer): """ Extension of the Trainer class adapted to MaskFormer. """ @classmethod def build_evaluator(cls, cfg, dataset_name, output_folder=None): """ Create evaluator(s) for a given dataset. This uses the special metadata "evaluator_type" associated with each builtin dataset. For your own dataset, you can simply create an evaluator manually in your script and do not have to worry about the hacky if-else logic here. """ if output_folder is None: output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") os.makedirs(output_folder, exist_ok=True) return YTVISEvaluator(dataset_name, cfg, True, output_folder) @classmethod def build_train_loader(cls, cfg): dataset_name = cfg.DATASETS.TRAIN[0] mapper = YTVISDatasetMapper(cfg, is_train=True) dataset_dict = get_detection_dataset_dicts( dataset_name, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) return build_detection_train_loader(cfg, mapper=mapper, dataset=dataset_dict) @classmethod def build_test_loader(cls, cfg, dataset_name): dataset_name = cfg.DATASETS.TEST[0] mapper = YTVISDatasetMapper(cfg, is_train=False) return build_detection_test_loader(cfg, dataset_name, mapper=mapper) @classmethod def build_lr_scheduler(cls, cfg, optimizer): """ It now calls :func:`detectron2.solver.build_lr_scheduler`. Overwrite it if you'd like a different scheduler. 
""" return build_lr_scheduler(cfg, optimizer) @classmethod def build_optimizer(cls, cfg, model): weight_decay_norm = cfg.SOLVER.WEIGHT_DECAY_NORM weight_decay_embed = cfg.SOLVER.WEIGHT_DECAY_EMBED defaults = {} defaults["lr"] = cfg.SOLVER.BASE_LR defaults["weight_decay"] = cfg.SOLVER.WEIGHT_DECAY norm_module_types = ( torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d, torch.nn.SyncBatchNorm, # NaiveSyncBatchNorm inherits from BatchNorm2d torch.nn.GroupNorm, torch.nn.InstanceNorm1d, torch.nn.InstanceNorm2d, torch.nn.InstanceNorm3d, torch.nn.LayerNorm, torch.nn.LocalResponseNorm, ) params: List[Dict[str, Any]] = [] memo: Set[torch.nn.parameter.Parameter] = set() for module_name, module in model.named_modules(): for module_param_name, value in module.named_parameters(recurse=False): if not value.requires_grad: continue # Avoid duplicating parameters if value in memo: continue memo.add(value) hyperparams = copy.copy(defaults) if "backbone" in module_name: hyperparams["lr"] = hyperparams["lr"] * cfg.SOLVER.BACKBONE_MULTIPLIER if ( "relative_position_bias_table" in module_param_name or "absolute_pos_embed" in module_param_name ): print(module_param_name) hyperparams["weight_decay"] = 0.0 if isinstance(module, norm_module_types): hyperparams["weight_decay"] = weight_decay_norm if isinstance(module, torch.nn.Embedding): hyperparams["weight_decay"] = weight_decay_embed params.append({"params": [value], **hyperparams}) def maybe_add_full_model_gradient_clipping(optim): # detectron2 doesn't have full model gradient clipping now clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE enable = ( cfg.SOLVER.CLIP_GRADIENTS.ENABLED and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model" and clip_norm_val > 0.0 ) class FullModelGradientClippingOptimizer(optim): def step(self, closure=None): all_params = itertools.chain(*[x["params"] for x in self.param_groups]) torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val) super().step(closure=closure) return FullModelGradientClippingOptimizer if enable else optim optimizer_type = cfg.SOLVER.OPTIMIZER if optimizer_type == "SGD": optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)( params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM ) elif optimizer_type == "ADAMW": optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)( params, cfg.SOLVER.BASE_LR ) else: raise NotImplementedError(f"no optimizer type {optimizer_type}") if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model": optimizer = maybe_add_gradient_clipping(cfg, optimizer) return optimizer @classmethod def test(cls, cfg, model, evaluators=None): """ Evaluate the given model. The given model is expected to already contain weights to evaluate. Args: cfg (CfgNode): model (nn.Module): evaluators (list[DatasetEvaluator] or None): if None, will call :meth:`build_evaluator`. Otherwise, must have the same length as ``cfg.DATASETS.TEST``. Returns: dict: a dict of result metrics """ from torch.cuda.amp import autocast logger = logging.getLogger(__name__) if isinstance(evaluators, DatasetEvaluator): evaluators = [evaluators] if evaluators is not None: assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format( len(cfg.DATASETS.TEST), len(evaluators) ) results = OrderedDict() for idx, dataset_name in enumerate(cfg.DATASETS.TEST): data_loader = cls.build_test_loader(cfg, dataset_name) # When evaluators are passed in as arguments, # implicitly assume that evaluators can be created before data_loader. 
if evaluators is not None: evaluator = evaluators[idx] else: try: evaluator = cls.build_evaluator(cfg, dataset_name) except NotImplementedError: logger.warn( "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, " "or implement its `build_evaluator` method." ) results[dataset_name] = {} continue with autocast(): results_i = inference_on_dataset(model, data_loader, evaluator) results[dataset_name] = results_i if comm.is_main_process(): assert isinstance( results_i, dict ), "Evaluator must return a dict on the main process. Got {} instead.".format( results_i ) logger.info("Evaluation results for {} in csv format:".format(dataset_name)) print_csv_format(results_i) if len(results) == 1: results = list(results.values())[0] return results def setup(args): """ Create configs and perform basic setups. """ cfg = get_cfg() # for poly lr schedule add_deeplab_config(cfg) add_maskformer2_config(cfg) add_maskformer2_video_config(cfg) cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() default_setup(cfg, args) # Setup logger for "mask_former" module setup_logger(name="mask2former") setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="mask2former_video") return cfg def main(args): cfg = setup(args) if args.eval_only: model = Trainer.build_model(cfg) DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( cfg.MODEL.WEIGHTS, resume=args.resume ) res = Trainer.test(cfg, model) if cfg.TEST.AUG.ENABLED: raise NotImplementedError if comm.is_main_process(): verify_results(cfg, res) return res trainer = Trainer(cfg) trainer.resume_or_load(resume=args.resume) return trainer.train() if __name__ == "__main__": args = default_argument_parser().parse_args() print("Command Line Args:", args) launch( main, args.num_gpus, num_machines=args.num_machines, machine_rank=args.machine_rank, dist_url=args.dist_url, args=(args,), ) ================================================ FILE: requirements.txt ================================================ cython scipy shapely timm h5py submitit scikit-image ================================================ FILE: scripts/eval_8gpu_mask2former_r101_video.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` ID=159 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus 8 --resume --dist-url tcp://0.0.0.0:12349\ --config-file configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml\ --eval-only MODEL.WEIGHTS ./mfvis_models/model_final_r101_0491.pth ================================================ FILE: scripts/eval_8gpu_mask2former_r50_video.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` ID=159 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus 8 --resume --dist-url tcp://0.0.0.0:12349\ --config-file configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml\ --eval-only MODEL.WEIGHTS ./mfvis_models/model_final_r50_0466.pth ================================================ FILE: scripts/eval_8gpu_mask2former_swinl_video.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` ID=159 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus 8 --resume --dist-url tcp://0.0.0.0:12349\ --config-file configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml\ --eval-only MODEL.WEIGHTS ./mfvis_models/model_final_swinl_0560.pth ================================================ FILE: scripts/train_8gpu_mask2former_r101_video.sh 
================================================ export PYTHONPATH=$PYTHONPATH:`pwd` ID=159 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus 8 --resume --dist-url tcp://0.0.0.0:12349\ --config-file configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml ================================================ FILE: scripts/train_8gpu_mask2former_r50_video.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` #export CUDA_LAUNCH_BLOCKING=1 # for debug ID=159 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus 8 --resume --dist-url tcp://0.0.0.0:12349\ --config-file configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml ================================================ FILE: scripts/train_8gpu_mask2former_swinl_video.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` ID=159 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus 8 --resume --dist-url tcp://0.0.0.0:12349\ --config-file configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml ================================================ FILE: scripts/visual_video.sh ================================================ export PYTHONPATH=$PYTHONPATH:`pwd` CUDA_VISIBLE_DEVICES=0 python3 demo_video/demo.py --config-file configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml --save-frames True \ --input './datasets/ytvis_2019/valid/JPEGImages/' \ --output 'r101_vis/' \ --opts MODEL.WEIGHTS ./mfvis_models/model_final_r101_0491.pth ================================================ FILE: tools/README.md ================================================ This directory contains few tools for MaskFormer. * `convert-torchvision-to-d2.py` Tool to convert torchvision pre-trained weights for D2. ``` wget https://download.pytorch.org/models/resnet101-63fe2227.pth python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl ``` * `convert-pretrained-swin-model-to-d2.py` Tool to convert Swin Transformer pre-trained weights for D2. ``` pip install timm wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl ``` * `evaluate_pq_for_semantic_segmentation.py` Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions. Usage: ``` python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json ``` where `OUTPUT_DIR` is set in the config file. * `evaluate_coco_boundary_ap.py` Tool to evaluate Boundary AP for instance segmentation predictions. 
Usage:

```
python tools/evaluate_coco_boundary_ap.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON
```

To install the Boundary IoU API, run:

```
pip install git+https://github.com/bowenc0221/boundary-iou-api.git
```

* `analyze_model.py`

Tool to analyze model parameters and FLOPs.

Usage for semantic segmentation (ADE20K only, use with caution!):

```
python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE
```

Note that for semantic segmentation (ADE20K only), we use a dummy image with a fixed size equal to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes!

Usage for panoptic and instance segmentation:

```
python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE
```

Note that for panoptic and instance segmentation, we compute the average FLOPs over 100 real validation images.

================================================
FILE: tools/analyze_model.py
================================================
# -*- coding: utf-8 -*-
# Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py

import logging
import numpy as np
from collections import Counter
import tqdm

from fvcore.nn import flop_count_table  # can also try flop_count_str

from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate
from detectron2.data import build_detection_test_loader
from detectron2.engine import default_argument_parser
from detectron2.modeling import build_model
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.utils.analysis import (
    FlopCountAnalysis,
    activation_count_operators,
    parameter_count_table,
)
from detectron2.utils.logger import setup_logger

# fmt: off
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# fmt: on
from mask2former import add_maskformer2_config

logger = logging.getLogger("detectron2")


def setup(args):
    if args.config_file.endswith(".yaml"):
        cfg = get_cfg()
        add_deeplab_config(cfg)
        add_maskformer2_config(cfg)
        cfg.merge_from_file(args.config_file)
        cfg.DATALOADER.NUM_WORKERS = 0
        cfg.merge_from_list(args.opts)
        cfg.freeze()
    else:
        cfg = LazyConfig.load(args.config_file)
        cfg = LazyConfig.apply_overrides(cfg, args.opts)
    setup_logger(name="fvcore")
    setup_logger()
    return cfg


def do_flop(cfg):
    if isinstance(cfg, CfgNode):
        data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
        model = build_model(cfg)
        DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
    else:
        data_loader = instantiate(cfg.dataloader.test)
        model = instantiate(cfg.model)
        model.to(cfg.train.device)
        DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
    model.eval()

    counts = Counter()
    total_flops = []
    for idx, data in zip(tqdm.trange(args.num_inputs), data_loader):  # noqa
        if args.use_fixed_input_size and isinstance(cfg, CfgNode):
            import torch
            crop_size = cfg.INPUT.CROP.SIZE[0]
            data[0]["image"] = torch.zeros((3, crop_size, crop_size))
        flops = FlopCountAnalysis(model, data)
        if idx > 0:
            flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False)
        counts += flops.by_operator()
        total_flops.append(flops.total())
    logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops))
    logger.info(
        "Average GFlops for each type of operators:\n"
        + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()])
    )
    logger.info(
        "Total GFlops: 
{:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) ) def do_activation(cfg): if isinstance(cfg, CfgNode): data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) model = build_model(cfg) DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) else: data_loader = instantiate(cfg.dataloader.test) model = instantiate(cfg.model) model.to(cfg.train.device) DetectionCheckpointer(model).load(cfg.train.init_checkpoint) model.eval() counts = Counter() total_activations = [] for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa count = activation_count_operators(model, data) counts += count total_activations.append(sum(count.values())) logger.info( "(Million) Activations for Each Type of Operators:\n" + str([(k, v / idx) for k, v in counts.items()]) ) logger.info( "Total (Million) Activations: {}±{}".format( np.mean(total_activations), np.std(total_activations) ) ) def do_parameter(cfg): if isinstance(cfg, CfgNode): model = build_model(cfg) else: model = instantiate(cfg.model) logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5)) def do_structure(cfg): if isinstance(cfg, CfgNode): model = build_model(cfg) else: model = instantiate(cfg.model) logger.info("Model Structure:\n" + str(model)) if __name__ == "__main__": parser = default_argument_parser( epilog=""" Examples: To show parameters of a model: $ ./analyze_model.py --tasks parameter \\ --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml Flops and activations are data-dependent, therefore inputs and model weights are needed to count them: $ ./analyze_model.py --num-inputs 100 --tasks flop \\ --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\ MODEL.WEIGHTS /path/to/model.pkl """ ) parser.add_argument( "--tasks", choices=["flop", "activation", "parameter", "structure"], required=True, nargs="+", ) parser.add_argument( "-n", "--num-inputs", default=100, type=int, help="number of inputs used to compute statistics for flops/activations, " "both are data dependent.", ) parser.add_argument( "--use-fixed-input-size", action="store_true", help="use fixed input size when calculating flops", ) args = parser.parse_args() assert not args.eval_only assert args.num_gpus == 1 cfg = setup(args) for task in args.tasks: { "flop": do_flop, "activation": do_activation, "parameter": do_parameter, "structure": do_structure, }[task](cfg) ================================================ FILE: tools/convert-pretrained-swin-model-to-d2.py ================================================ #!/usr/bin/env python import pickle as pkl import sys import torch """ Usage: # download pretrained swin model: wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth # run the conversion ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: MODEL: WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" INPUT: FORMAT: "RGB" """ if __name__ == "__main__": input = sys.argv[1] obj = torch.load(input, map_location="cpu")["model"] res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} with open(sys.argv[2], "wb") as f: pkl.dump(res, f) ================================================ FILE: tools/convert-torchvision-to-d2.py ================================================ #!/usr/bin/env python import pickle as pkl import sys import torch """ Usage: # download one 
of the ResNet{18,34,50,101,152} models from torchvision: wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth # run the conversion ./convert-torchvision-to-d2.py r50.pth r50.pkl # Then, use r50.pkl with the following changes in config: MODEL: WEIGHTS: "/path/to/r50.pkl" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] RESNETS: DEPTH: 50 STRIDE_IN_1X1: False INPUT: FORMAT: "RGB" """ if __name__ == "__main__": input = sys.argv[1] obj = torch.load(input, map_location="cpu") newmodel = {} for k in list(obj.keys()): old_k = k if "layer" not in k: k = "stem." + k for t in [1, 2, 3, 4]: k = k.replace("layer{}".format(t), "res{}".format(t + 1)) for t in [1, 2, 3]: k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) k = k.replace("downsample.0", "shortcut") k = k.replace("downsample.1", "shortcut.norm") print(old_k, "->", k) newmodel[k] = obj.pop(old_k).detach().numpy() res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} with open(sys.argv[2], "wb") as f: pkl.dump(res, f) if obj: print("Unconverted keys:", obj.keys()) ================================================ FILE: tools/evaluate_coco_boundary_ap.py ================================================ #!/usr/bin/env python # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py """ Evaluation for COCO val2017: python ./tools/coco_instance_evaluation.py \ --gt-json-file COCO_GT_JSON \ --dt-json-file COCO_DT_JSON """ import argparse import json from boundary_iou.coco_instance_api.coco import COCO from boundary_iou.coco_instance_api.cocoeval import COCOeval def main(): parser = argparse.ArgumentParser() parser.add_argument("--gt-json-file", default="") parser.add_argument("--dt-json-file", default="") parser.add_argument("--iou-type", default="boundary") parser.add_argument("--dilation-ratio", default="0.020", type=float) args = parser.parse_args() print(args) annFile = args.gt_json_file resFile = args.dt_json_file dilation_ratio = args.dilation_ratio if args.iou_type == "boundary": get_boundary = True else: get_boundary = False cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) # remove box predictions resFile = json.load(open(resFile)) for c in resFile: c.pop("bbox", None) cocoDt = cocoGt.loadRes(resFile) cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() if __name__ == '__main__': main() ================================================ FILE: tools/evaluate_pq_for_semantic_segmentation.py ================================================ #!/usr/bin/env python import argparse import json import os from collections import defaultdict from tqdm import tqdm import numpy as np import torch from detectron2.data import MetadataCatalog from detectron2.data.detection_utils import read_image from detectron2.utils.file_io import PathManager from pycocotools import mask as maskUtils from panopticapi.evaluation import PQStat def default_argument_parser(): """ Creates a parser with some common arguments used by analysis tools. Returns: argparse.ArgumentParser: """ parser = argparse.ArgumentParser(description="Evaluate PQ metric for semantic segmentation.") # NOTE: currently does not support Cityscapes, you need to convert # Cityscapes prediction format to Detectron2 prediction format. 
parser.add_argument( "--dataset-name", default="ade20k_sem_seg_val", choices=["ade20k_sem_seg_val", "coco_2017_test_stuff_10k_sem_seg", "ade20k_full_sem_seg_val"], help="dataset name you want to evaluate") parser.add_argument("--json-file", default="", help="path to detection json file") return parser # Modified from the official panoptic api: https://github.com/cocodataset/panopticapi/blob/master/panopticapi/evaluation.py def pq_compute_single_image(segm_gt, segm_dt, categories, ignore_label): pq_stat = PQStat() VOID = ignore_label OFFSET = 256 * 256 * 256 pan_gt = segm_gt pan_pred = segm_dt gt_ann = {'segments_info': []} labels, labels_cnt = np.unique(segm_gt, return_counts=True) for cat_id, cnt in zip(labels, labels_cnt): if cat_id == VOID: continue gt_ann['segments_info'].append( {"id": cat_id, "category_id": cat_id, "area": cnt, "iscrowd": 0} ) pred_ann = {'segments_info': []} for cat_id in np.unique(segm_dt): pred_ann['segments_info'].append({"id": cat_id, "category_id": cat_id}) gt_segms = {el['id']: el for el in gt_ann['segments_info']} pred_segms = {el['id']: el for el in pred_ann['segments_info']} # predicted segments area calculation + prediction sanity checks pred_labels_set = set(el['id'] for el in pred_ann['segments_info']) labels, labels_cnt = np.unique(pan_pred, return_counts=True) for label, label_cnt in zip(labels, labels_cnt): if label not in pred_segms: if label == VOID: continue raise KeyError('In the image with ID {} segment with ID {} is presented in PNG and not presented in JSON.'.format(image_id, label)) pred_segms[label]['area'] = label_cnt pred_labels_set.remove(label) if pred_segms[label]['category_id'] not in categories: raise KeyError('In the image with ID {} segment with ID {} has unknown category_id {}.'.format(image_id, label, pred_segms[label]['category_id'])) if len(pred_labels_set) != 0: raise KeyError('In the image with ID {} the following segment IDs {} are presented in JSON and not presented in PNG.'.format(image_id, list(pred_labels_set))) # confusion matrix calculation pan_gt_pred = pan_gt.astype(np.uint64) * OFFSET + pan_pred.astype(np.uint64) gt_pred_map = {} labels, labels_cnt = np.unique(pan_gt_pred, return_counts=True) for label, intersection in zip(labels, labels_cnt): gt_id = label // OFFSET pred_id = label % OFFSET gt_pred_map[(gt_id, pred_id)] = intersection # count all matched pairs gt_matched = set() pred_matched = set() for label_tuple, intersection in gt_pred_map.items(): gt_label, pred_label = label_tuple if gt_label not in gt_segms: continue if pred_label not in pred_segms: continue if gt_segms[gt_label]['iscrowd'] == 1: continue if gt_segms[gt_label]['category_id'] != pred_segms[pred_label]['category_id']: continue union = pred_segms[pred_label]['area'] + gt_segms[gt_label]['area'] - intersection - gt_pred_map.get((VOID, pred_label), 0) iou = intersection / union if iou > 0.5: pq_stat[gt_segms[gt_label]['category_id']].tp += 1 pq_stat[gt_segms[gt_label]['category_id']].iou += iou gt_matched.add(gt_label) pred_matched.add(pred_label) # count false positives crowd_labels_dict = {} for gt_label, gt_info in gt_segms.items(): if gt_label in gt_matched: continue # crowd segments are ignored if gt_info['iscrowd'] == 1: crowd_labels_dict[gt_info['category_id']] = gt_label continue pq_stat[gt_info['category_id']].fn += 1 # count false positives for pred_label, pred_info in pred_segms.items(): if pred_label in pred_matched: continue # intersection of the segment with VOID intersection = gt_pred_map.get((VOID, pred_label), 0) # plus 
intersection with corresponding CROWD region if it exists if pred_info['category_id'] in crowd_labels_dict: intersection += gt_pred_map.get((crowd_labels_dict[pred_info['category_id']], pred_label), 0) # predicted segment is ignored if more than half of the segment correspond to VOID and CROWD regions if intersection / pred_info['area'] > 0.5: continue pq_stat[pred_info['category_id']].fp += 1 return pq_stat def main(): parser = default_argument_parser() args = parser.parse_args() _root = os.getenv("DETECTRON2_DATASETS", "datasets") json_file = args.json_file with open(json_file) as f: predictions = json.load(f) imgToAnns = defaultdict(list) for pred in predictions: image_id = os.path.basename(pred["file_name"]).split(".")[0] imgToAnns[image_id].append( {"category_id" : pred["category_id"], "segmentation" : pred["segmentation"]} ) image_ids = list(imgToAnns.keys()) meta = MetadataCatalog.get(args.dataset_name) class_names = meta.stuff_classes num_classes = len(meta.stuff_classes) ignore_label = meta.ignore_label conf_matrix = np.zeros((num_classes + 1, num_classes + 1), dtype=np.int64) categories = {} for i in range(num_classes): categories[i] = {"id": i, "name": class_names[i], "isthing": 0} pq_stat = PQStat() for image_id in tqdm(image_ids): if args.dataset_name == "ade20k_sem_seg_val": gt_dir = os.path.join(_root, "ADEChallengeData2016", "annotations_detectron2", "validation") segm_gt = read_image(os.path.join(gt_dir, image_id + ".png")).copy().astype(np.int64) elif args.dataset_name == "coco_2017_test_stuff_10k_sem_seg": gt_dir = os.path.join(_root, "coco", "coco_stuff_10k", "annotations_detectron2", "test") segm_gt = read_image(os.path.join(gt_dir, image_id + ".png")).copy().astype(np.int64) elif args.dataset_name == "ade20k_full_sem_seg_val": gt_dir = os.path.join(_root, "ADE20K_2021_17_01", "annotations_detectron2", "validation") segm_gt = read_image(os.path.join(gt_dir, image_id + ".tif")).copy().astype(np.int64) else: raise ValueError(f"Unsupported dataset {args.dataset_name}") # get predictions segm_dt = np.zeros_like(segm_gt) anns = imgToAnns[image_id] for ann in anns: # map back category_id if hasattr(meta, "stuff_dataset_id_to_contiguous_id"): if ann["category_id"] in meta.stuff_dataset_id_to_contiguous_id: category_id = meta.stuff_dataset_id_to_contiguous_id[ann["category_id"]] else: category_id = ann["category_id"] mask = maskUtils.decode(ann["segmentation"]) segm_dt[mask > 0] = category_id # miou gt = segm_gt.copy() pred = segm_dt.copy() gt[gt == ignore_label] = num_classes conf_matrix += np.bincount( (num_classes + 1) * pred.reshape(-1) + gt.reshape(-1), minlength=conf_matrix.size, ).reshape(conf_matrix.shape) # pq pq_stat_single = pq_compute_single_image(segm_gt, segm_dt, categories, meta.ignore_label) pq_stat += pq_stat_single metrics = [("All", None), ("Stuff", False)] results = {} for name, isthing in metrics: results[name], per_class_results = pq_stat.pq_average(categories, isthing=isthing) if name == 'All': results['per_class'] = per_class_results print("{:10s}| {:>5s} {:>5s} {:>5s} {:>5s}".format("", "PQ", "SQ", "RQ", "N")) print("-" * (10 + 7 * 4)) for name, _isthing in metrics: print("{:10s}| {:5.1f} {:5.1f} {:5.1f} {:5d}".format( name, 100 * results[name]['pq'], 100 * results[name]['sq'], 100 * results[name]['rq'], results[name]['n']) ) # calculate miou acc = np.full(num_classes, np.nan, dtype=np.float64) iou = np.full(num_classes, np.nan, dtype=np.float64) tp = conf_matrix.diagonal()[:-1].astype(np.float64) pos_gt = np.sum(conf_matrix[:-1, :-1], 
axis=0).astype(np.float64) pos_pred = np.sum(conf_matrix[:-1, :-1], axis=1).astype(np.float64) acc_valid = pos_gt > 0 acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid] iou_valid = (pos_gt + pos_pred) > 0 union = pos_gt + pos_pred - tp iou[acc_valid] = tp[acc_valid] / union[acc_valid] miou = np.sum(iou[acc_valid]) / np.sum(iou_valid) print("") print(f"mIoU: {miou}") if __name__ == '__main__': main() ================================================ FILE: train_net.py ================================================ """ MaskFormer Training Script. This script is a simplified version of the training script in detectron2/tools. """ try: # ignore ShapelyDeprecationWarning from fvcore from shapely.errors import ShapelyDeprecationWarning import warnings warnings.filterwarnings('ignore', category=ShapelyDeprecationWarning) except: pass import copy import itertools import logging import os from collections import OrderedDict from typing import Any, Dict, List, Set import torch import detectron2.utils.comm as comm from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg from detectron2.data import MetadataCatalog, build_detection_train_loader from detectron2.engine import ( DefaultTrainer, default_argument_parser, default_setup, launch, ) from detectron2.evaluation import ( CityscapesInstanceEvaluator, CityscapesSemSegEvaluator, COCOEvaluator, COCOPanopticEvaluator, DatasetEvaluators, LVISEvaluator, SemSegEvaluator, verify_results, ) from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler from detectron2.solver.build import maybe_add_gradient_clipping from detectron2.utils.logger import setup_logger # MaskFormer from mask2former import ( COCOInstanceNewBaselineDatasetMapper, COCOPanopticNewBaselineDatasetMapper, InstanceSegEvaluator, MaskFormerInstanceDatasetMapper, MaskFormerPanopticDatasetMapper, MaskFormerSemanticDatasetMapper, SemanticSegmentorWithTTA, add_maskformer2_config, ) class Trainer(DefaultTrainer): """ Extension of the Trainer class adapted to MaskFormer. """ @classmethod def build_evaluator(cls, cfg, dataset_name, output_folder=None): """ Create evaluator(s) for a given dataset. This uses the special metadata "evaluator_type" associated with each builtin dataset. For your own dataset, you can simply create an evaluator manually in your script and do not have to worry about the hacky if-else logic here. 
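For example (illustrative only; "my_coco_dataset_val" is a placeholder name, not a dataset registered by this repo), a COCO-format dataset could be evaluated by constructing `COCOEvaluator("my_coco_dataset_val", output_dir="./output")` yourself and passing it via `Trainer.test(cfg, model, evaluators=[evaluator])`.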
""" if output_folder is None: output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") evaluator_list = [] evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type # semantic segmentation if evaluator_type in ["sem_seg", "ade20k_panoptic_seg"]: evaluator_list.append( SemSegEvaluator( dataset_name, distributed=True, output_dir=output_folder, ) ) # instance segmentation if evaluator_type == "coco": evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder)) # panoptic segmentation if evaluator_type in [ "coco_panoptic_seg", "ade20k_panoptic_seg", "cityscapes_panoptic_seg", "mapillary_vistas_panoptic_seg", ]: if cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON: evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) # COCO if evaluator_type == "coco_panoptic_seg" and cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON: evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder)) if evaluator_type == "coco_panoptic_seg" and cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON: evaluator_list.append(SemSegEvaluator(dataset_name, distributed=True, output_dir=output_folder)) # Mapillary Vistas if evaluator_type == "mapillary_vistas_panoptic_seg" and cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON: evaluator_list.append(InstanceSegEvaluator(dataset_name, output_dir=output_folder)) if evaluator_type == "mapillary_vistas_panoptic_seg" and cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON: evaluator_list.append(SemSegEvaluator(dataset_name, distributed=True, output_dir=output_folder)) # Cityscapes if evaluator_type == "cityscapes_instance": assert ( torch.cuda.device_count() > comm.get_rank() ), "CityscapesEvaluator currently do not work with multiple machines." return CityscapesInstanceEvaluator(dataset_name) if evaluator_type == "cityscapes_sem_seg": assert ( torch.cuda.device_count() > comm.get_rank() ), "CityscapesEvaluator currently do not work with multiple machines." return CityscapesSemSegEvaluator(dataset_name) if evaluator_type == "cityscapes_panoptic_seg": if cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON: assert ( torch.cuda.device_count() > comm.get_rank() ), "CityscapesEvaluator currently do not work with multiple machines." evaluator_list.append(CityscapesSemSegEvaluator(dataset_name)) if cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON: assert ( torch.cuda.device_count() > comm.get_rank() ), "CityscapesEvaluator currently do not work with multiple machines." 
evaluator_list.append(CityscapesInstanceEvaluator(dataset_name)) # ADE20K if evaluator_type == "ade20k_panoptic_seg" and cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON: evaluator_list.append(InstanceSegEvaluator(dataset_name, output_dir=output_folder)) # LVIS if evaluator_type == "lvis": return LVISEvaluator(dataset_name, output_dir=output_folder) if len(evaluator_list) == 0: raise NotImplementedError( "no Evaluator for the dataset {} with the type {}".format( dataset_name, evaluator_type ) ) elif len(evaluator_list) == 1: return evaluator_list[0] return DatasetEvaluators(evaluator_list) @classmethod def build_train_loader(cls, cfg): # Semantic segmentation dataset mapper if cfg.INPUT.DATASET_MAPPER_NAME == "mask_former_semantic": mapper = MaskFormerSemanticDatasetMapper(cfg, True) return build_detection_train_loader(cfg, mapper=mapper) # Panoptic segmentation dataset mapper elif cfg.INPUT.DATASET_MAPPER_NAME == "mask_former_panoptic": mapper = MaskFormerPanopticDatasetMapper(cfg, True) return build_detection_train_loader(cfg, mapper=mapper) # Instance segmentation dataset mapper elif cfg.INPUT.DATASET_MAPPER_NAME == "mask_former_instance": mapper = MaskFormerInstanceDatasetMapper(cfg, True) return build_detection_train_loader(cfg, mapper=mapper) # coco instance segmentation lsj new baseline elif cfg.INPUT.DATASET_MAPPER_NAME == "coco_instance_lsj": mapper = COCOInstanceNewBaselineDatasetMapper(cfg, True) return build_detection_train_loader(cfg, mapper=mapper) # coco panoptic segmentation lsj new baseline elif cfg.INPUT.DATASET_MAPPER_NAME == "coco_panoptic_lsj": mapper = COCOPanopticNewBaselineDatasetMapper(cfg, True) return build_detection_train_loader(cfg, mapper=mapper) else: mapper = None return build_detection_train_loader(cfg, mapper=mapper) @classmethod def build_lr_scheduler(cls, cfg, optimizer): """ It now calls :func:`detectron2.solver.build_lr_scheduler`. Overwrite it if you'd like a different scheduler. 
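For example (an illustrative sketch, not what this repository ships), an override could return `torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(cfg.SOLVER.STEPS), gamma=cfg.SOLVER.GAMMA)` instead.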
""" return build_lr_scheduler(cfg, optimizer) @classmethod def build_optimizer(cls, cfg, model): weight_decay_norm = cfg.SOLVER.WEIGHT_DECAY_NORM weight_decay_embed = cfg.SOLVER.WEIGHT_DECAY_EMBED defaults = {} defaults["lr"] = cfg.SOLVER.BASE_LR defaults["weight_decay"] = cfg.SOLVER.WEIGHT_DECAY norm_module_types = ( torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d, torch.nn.SyncBatchNorm, # NaiveSyncBatchNorm inherits from BatchNorm2d torch.nn.GroupNorm, torch.nn.InstanceNorm1d, torch.nn.InstanceNorm2d, torch.nn.InstanceNorm3d, torch.nn.LayerNorm, torch.nn.LocalResponseNorm, ) params: List[Dict[str, Any]] = [] memo: Set[torch.nn.parameter.Parameter] = set() for module_name, module in model.named_modules(): for module_param_name, value in module.named_parameters(recurse=False): if not value.requires_grad: continue # Avoid duplicating parameters if value in memo: continue memo.add(value) hyperparams = copy.copy(defaults) if "backbone" in module_name: hyperparams["lr"] = hyperparams["lr"] * cfg.SOLVER.BACKBONE_MULTIPLIER if ( "relative_position_bias_table" in module_param_name or "absolute_pos_embed" in module_param_name ): print(module_param_name) hyperparams["weight_decay"] = 0.0 if isinstance(module, norm_module_types): hyperparams["weight_decay"] = weight_decay_norm if isinstance(module, torch.nn.Embedding): hyperparams["weight_decay"] = weight_decay_embed params.append({"params": [value], **hyperparams}) def maybe_add_full_model_gradient_clipping(optim): # detectron2 doesn't have full model gradient clipping now clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE enable = ( cfg.SOLVER.CLIP_GRADIENTS.ENABLED and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model" and clip_norm_val > 0.0 ) class FullModelGradientClippingOptimizer(optim): def step(self, closure=None): all_params = itertools.chain(*[x["params"] for x in self.param_groups]) torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val) super().step(closure=closure) return FullModelGradientClippingOptimizer if enable else optim optimizer_type = cfg.SOLVER.OPTIMIZER if optimizer_type == "SGD": optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)( params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM ) elif optimizer_type == "ADAMW": optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)( params, cfg.SOLVER.BASE_LR ) else: raise NotImplementedError(f"no optimizer type {optimizer_type}") if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model": optimizer = maybe_add_gradient_clipping(cfg, optimizer) return optimizer @classmethod def test_with_TTA(cls, cfg, model): logger = logging.getLogger("detectron2.trainer") # In the end of training, run an evaluation with TTA. logger.info("Running inference with test-time augmentation ...") model = SemanticSegmentorWithTTA(cfg, model) evaluators = [ cls.build_evaluator( cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") ) for name in cfg.DATASETS.TEST ] res = cls.test(cfg, model, evaluators) res = OrderedDict({k + "_TTA": v for k, v in res.items()}) return res def setup(args): """ Create configs and perform basic setups. 
""" cfg = get_cfg() # for poly lr schedule add_deeplab_config(cfg) add_maskformer2_config(cfg) cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() default_setup(cfg, args) # Setup logger for "mask_former" module setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="mask2former") return cfg def main(args): cfg = setup(args) if args.eval_only: model = Trainer.build_model(cfg) DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( cfg.MODEL.WEIGHTS, resume=args.resume ) res = Trainer.test(cfg, model) if cfg.TEST.AUG.ENABLED: res.update(Trainer.test_with_TTA(cfg, model)) if comm.is_main_process(): verify_results(cfg, res) return res trainer = Trainer(cfg) trainer.resume_or_load(resume=args.resume) return trainer.train() if __name__ == "__main__": args = default_argument_parser().parse_args() print("Command Line Args:", args) launch( main, args.num_gpus, num_machines=args.num_machines, machine_rank=args.machine_rank, dist_url=args.dist_url, args=(args,), ) ================================================ FILE: train_net_video.py ================================================ """ This script is a simplified version of the training script in detectron2/tools. """ try: # ignore ShapelyDeprecationWarning from fvcore from shapely.errors import ShapelyDeprecationWarning import warnings warnings.filterwarnings('ignore', category=ShapelyDeprecationWarning) except: pass import copy import itertools import logging import os from collections import OrderedDict from typing import Any, Dict, List, Set import torch import detectron2.utils.comm as comm from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg from detectron2.data import MetadataCatalog, build_detection_train_loader from detectron2.engine import ( DefaultTrainer, default_argument_parser, default_setup, launch, ) from detectron2.evaluation import ( DatasetEvaluator, inference_on_dataset, print_csv_format, verify_results, ) from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler from detectron2.solver.build import maybe_add_gradient_clipping from detectron2.utils.logger import setup_logger # MaskFormer from mask2former import add_maskformer2_config from mask2former_video import ( YTVISDatasetMapper, CocoClipDatasetMapper, build_combined_loader, YTVISEvaluator, add_maskformer2_video_config, build_detection_train_loader, build_detection_test_loader, get_detection_dataset_dicts, ) from torch.utils.data import Dataset, ConcatDataset class Trainer(DefaultTrainer): """ Extension of the Trainer class adapted to MaskFormer. """ @classmethod def build_evaluator(cls, cfg, dataset_name, output_folder=None): """ Create evaluator(s) for a given dataset. This uses the special metadata "evaluator_type" associated with each builtin dataset. For your own dataset, you can simply create an evaluator manually in your script and do not have to worry about the hacky if-else logic here. 
""" if output_folder is None: output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") os.makedirs(output_folder, exist_ok=True) return YTVISEvaluator(dataset_name, cfg, True, output_folder) @classmethod def build_train_loader(cls, cfg): mappers = [] for d_i, dataset_name in enumerate(cfg.DATASETS.TRAIN): if dataset_name.startswith('coco'): mappers.append( CocoClipDatasetMapper( cfg, is_train=True, is_tgt=(d_i==len(cfg.DATASETS.TRAIN)-1), src_dataset_name=dataset_name ) ) elif dataset_name.startswith('ytvis') or dataset_name.startswith('ovis'): mappers.append( YTVISDatasetMapper(cfg, is_train=True, is_tgt=(d_i==len(cfg.DATASETS.TRAIN)-1), src_dataset_name=dataset_name) ) loaders = [ build_detection_train_loader(cfg, mapper=mapper, dataset_name=dataset_name) for mapper, dataset_name in zip(mappers, cfg.DATASETS.TRAIN) ] DATASET_RATIO = [1.0, 0.75] combined_data_loader = build_combined_loader(cfg, loaders, DATASET_RATIO) return combined_data_loader @classmethod def build_test_loader(cls, cfg, dataset_name): dataset_name = cfg.DATASETS.TEST[0] mapper = YTVISDatasetMapper(cfg, is_train=False) return build_detection_test_loader(cfg, dataset_name, mapper=mapper) @classmethod def build_lr_scheduler(cls, cfg, optimizer): """ It now calls :func:`detectron2.solver.build_lr_scheduler`. Overwrite it if you'd like a different scheduler. """ return build_lr_scheduler(cfg, optimizer) @classmethod def build_optimizer(cls, cfg, model): weight_decay_norm = cfg.SOLVER.WEIGHT_DECAY_NORM weight_decay_embed = cfg.SOLVER.WEIGHT_DECAY_EMBED defaults = {} defaults["lr"] = cfg.SOLVER.BASE_LR defaults["weight_decay"] = cfg.SOLVER.WEIGHT_DECAY norm_module_types = ( torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d, torch.nn.SyncBatchNorm, # NaiveSyncBatchNorm inherits from BatchNorm2d torch.nn.GroupNorm, torch.nn.InstanceNorm1d, torch.nn.InstanceNorm2d, torch.nn.InstanceNorm3d, torch.nn.LayerNorm, torch.nn.LocalResponseNorm, ) params: List[Dict[str, Any]] = [] memo: Set[torch.nn.parameter.Parameter] = set() for module_name, module in model.named_modules(): for module_param_name, value in module.named_parameters(recurse=False): if not value.requires_grad: continue # Avoid duplicating parameters if value in memo: continue memo.add(value) hyperparams = copy.copy(defaults) if "backbone" in module_name: hyperparams["lr"] = hyperparams["lr"] * cfg.SOLVER.BACKBONE_MULTIPLIER if ( "relative_position_bias_table" in module_param_name or "absolute_pos_embed" in module_param_name ): print(module_param_name) hyperparams["weight_decay"] = 0.0 if isinstance(module, norm_module_types): hyperparams["weight_decay"] = weight_decay_norm if isinstance(module, torch.nn.Embedding): hyperparams["weight_decay"] = weight_decay_embed params.append({"params": [value], **hyperparams}) def maybe_add_full_model_gradient_clipping(optim): # detectron2 doesn't have full model gradient clipping now clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE enable = ( cfg.SOLVER.CLIP_GRADIENTS.ENABLED and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model" and clip_norm_val > 0.0 ) class FullModelGradientClippingOptimizer(optim): def step(self, closure=None): all_params = itertools.chain(*[x["params"] for x in self.param_groups]) torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val) super().step(closure=closure) return FullModelGradientClippingOptimizer if enable else optim optimizer_type = cfg.SOLVER.OPTIMIZER if optimizer_type == "SGD": optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)( params, cfg.SOLVER.BASE_LR, 
momentum=cfg.SOLVER.MOMENTUM ) elif optimizer_type == "ADAMW": optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)( params, cfg.SOLVER.BASE_LR ) else: raise NotImplementedError(f"no optimizer type {optimizer_type}") if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model": optimizer = maybe_add_gradient_clipping(cfg, optimizer) return optimizer @classmethod def test(cls, cfg, model, evaluators=None): """ Evaluate the given model. The given model is expected to already contain weights to evaluate. Args: cfg (CfgNode): model (nn.Module): evaluators (list[DatasetEvaluator] or None): if None, will call :meth:`build_evaluator`. Otherwise, must have the same length as ``cfg.DATASETS.TEST``. Returns: dict: a dict of result metrics """ from torch.cuda.amp import autocast logger = logging.getLogger(__name__) if isinstance(evaluators, DatasetEvaluator): evaluators = [evaluators] if evaluators is not None: assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format( len(cfg.DATASETS.TEST), len(evaluators) ) results = OrderedDict() for idx, dataset_name in enumerate(cfg.DATASETS.TEST): data_loader = cls.build_test_loader(cfg, dataset_name) # When evaluators are passed in as arguments, # implicitly assume that evaluators can be created before data_loader. if evaluators is not None: evaluator = evaluators[idx] else: try: evaluator = cls.build_evaluator(cfg, dataset_name) except NotImplementedError: logger.warn( "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, " "or implement its `build_evaluator` method." ) results[dataset_name] = {} continue with autocast(): results_i = inference_on_dataset(model, data_loader, evaluator) results[dataset_name] = results_i if comm.is_main_process(): assert isinstance( results_i, dict ), "Evaluator must return a dict on the main process. Got {} instead.".format( results_i ) logger.info("Evaluation results for {} in csv format:".format(dataset_name)) print_csv_format(results_i) if len(results) == 1: results = list(results.values())[0] return results def setup(args): """ Create configs and perform basic setups. 
""" cfg = get_cfg() # for poly lr schedule add_deeplab_config(cfg) add_maskformer2_config(cfg) add_maskformer2_video_config(cfg) cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() default_setup(cfg, args) # Setup logger for "mask_former" module setup_logger(name="mask2former") setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="mask2former_video") return cfg def main(args): cfg = setup(args) if args.eval_only: model = Trainer.build_model(cfg) DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( cfg.MODEL.WEIGHTS, resume=args.resume ) res = Trainer.test(cfg, model) if cfg.TEST.AUG.ENABLED: raise NotImplementedError if comm.is_main_process(): verify_results(cfg, res) return res trainer = Trainer(cfg) trainer.resume_or_load(resume=args.resume) return trainer.train() if __name__ == "__main__": args = default_argument_parser().parse_args() print("Command Line Args:", args) launch( main, args.num_gpus, num_machines=args.num_machines, machine_rank=args.machine_rank, dist_url=args.dist_url, args=(args,), ) ================================================ FILE: util/__init__.py ================================================ # ------------------------------------------------------------------------ # SeqFormer # ------------------------------------------------------------------------ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # ------------------------------------------------------------------------ ================================================ FILE: util/box_ops.py ================================================ # ------------------------------------------------------------------------ # SeqFormer # ------------------------------------------------------------------------ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # ------------------------------------------------------------------------ """ Utilities for bounding box manipulation and GIoU. 
""" import torch from torchvision.ops.boxes import box_area def box_cxcywh_to_xyxy(x): # print('box:\n', x) x_c, y_c, w, h = x.unbind(-1) b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return torch.stack(b, dim=-1) def box_xyxy_to_cxcywh(x): x0, y0, x1, y1 = x.unbind(-1) b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] return torch.stack(b, dim=-1) # modified from torchvision to also return the union def box_iou(boxes1, boxes2): area1 = box_area(boxes1) area2 = box_area(boxes2) lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] wh = (rb - lt).clamp(min=0) # [N,M,2] inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] union = area1[:, None] + area2 - inter iou = inter / (union + 1e-7) return iou, union def multi_box_iou(boxes1, boxes2): area1 = box_area(boxes1.flatten(0,1)).reshape(boxes1.shape[0], boxes1.shape[1]) area2 = box_area(boxes2.flatten(0,1)).reshape(boxes2.shape[0], boxes2.shape[1]) lt = torch.max(boxes1[:, :, None, :2], boxes2[:, None, :, :2]) # [nf,N,M,2] rb = torch.min(boxes1[:, :, None, 2:], boxes2[:, None, :, 2:]) # [nf,N,M,2] wh = (rb - lt).clamp(min=0) # [nf,N,M,2] inter = wh[:, :, :, 0] * wh[:, :, :, 1] # [nf,N,M] union = area1[:, :, None] + area2[:, None, :] - inter iou = inter / (union + 1e-7) return iou, union def generalized_box_iou(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/ The boxes should be in [x0, y0, x1, y1] format Returns a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) """ # degenerate boxes gives inf / nan results # so do an early check assert (boxes1[:, 2:] >= boxes1[:, :2]).all() assert (boxes2[:, 2:] >= boxes2[:, :2]).all() iou, union = box_iou(boxes1, boxes2) lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) wh = (rb - lt).clamp(min=0) # [N,M,2] area = wh[:, :, 0] * wh[:, :, 1] # return iou - (area - union) / area return iou - (area - union) / (area + 1e-7) def generalized_multi_box_iou(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/ The boxes should be in [x0, y0, x1, y1] format boxes1.shape = [nf, N, 4] boxes2.shape = [nf, M, 4] Returns a [nf, N, M] pairwise matrix, where N = boxes1.shape[1] and M = boxes2.shape[1] """ # degenerate boxes gives inf / nan results # so do an early check assert (boxes1[:, :, 2:] >= boxes1[:, :, :2]).all() assert (boxes2[:, :, 2:] >= boxes2[:, :, :2]).all() iou, union = multi_box_iou(boxes1, boxes2) lt = torch.min(boxes1[:, :, None, :2], boxes2[:, None, :, :2]) rb = torch.max(boxes1[:, :, None, 2:], boxes2[:, None, :, 2:]) wh = (rb - lt).clamp(min=0) # [nf,N,M,2] area = wh[:, :, :, 0] * wh[:, :, :, 1] return iou - (area - union) / (area + 1e-7) def masks_to_boxes(masks): """Compute the bounding boxes around the provided masks The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
Returns a [N, 4] tensors, with the boxes in xyxy format """ if masks.numel() == 0: return torch.zeros((0, 4), device=masks.device) h, w = masks.shape[-2:] y = torch.arange(0, h, dtype=torch.float, device=masks.device) x = torch.arange(0, w, dtype=torch.float, device=masks.device) y, x = torch.meshgrid(y, x) x_mask = (masks * x.unsqueeze(0)) x_max = x_mask.flatten(1).max(-1)[0] x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] y_mask = (masks * y.unsqueeze(0)) y_max = y_mask.flatten(1).max(-1)[0] y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] return torch.stack([x_min, y_min, x_max, y_max], 1) ================================================ FILE: util/misc.py ================================================ # ------------------------------------------------------------------------ # SeqFormer # ------------------------------------------------------------------------ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # ------------------------------------------------------------------------ """ Misc functions, including distributed helpers. Mostly copy-paste from torchvision references. """ import os import subprocess import time from collections import defaultdict, deque import datetime import pickle from typing import Optional, List import torch import torch.nn as nn import torch.distributed as dist from torch import Tensor # needed due to empty tensor bug in pytorch and torchvision 0.5 import torchvision if float(torchvision.__version__[:3]) < 0.5: import math from torchvision.ops.misc import _NewEmptyTensorOp def _check_size_scale_factor(dim, size, scale_factor): # type: (int, Optional[List[int]], Optional[float]) -> None if size is None and scale_factor is None: raise ValueError("either size or scale_factor should be defined") if size is not None and scale_factor is not None: raise ValueError("only one of size or scale_factor should be defined") if not (scale_factor is not None and len(scale_factor) != dim): raise ValueError( "scale_factor shape must match input shape. " "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor)) ) def _output_size(dim, input, size, scale_factor): # type: (int, Tensor, Optional[List[int]], Optional[float]) -> List[int] assert dim == 2 _check_size_scale_factor(dim, size, scale_factor) if size is not None: return size # if dim is not 2 or scale_factor is iterable use _ntuple instead of concat assert scale_factor is not None and isinstance(scale_factor, (int, float)) scale_factors = [scale_factor, scale_factor] # math.floor might return float in py2.7 return [ int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim) ] elif float(torchvision.__version__[:3]) < 0.7: from torchvision.ops import _new_empty_tensor from torchvision.ops.misc import _output_size class SmoothedValue(object): """Track a series of values and provide access to smoothed values over a window or the global series average. """ def __init__(self, window_size=20, fmt=None): if fmt is None: fmt = "{median:.4f} ({global_avg:.4f})" self.deque = deque(maxlen=window_size) self.total = 0.0 self.count = 0 self.fmt = fmt def update(self, value, n=1): self.deque.append(value) self.count += n self.total += value * n def synchronize_between_processes(self): """ Warning: does not synchronize the deque! 
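Only `count` and `total` (and therefore `global_avg`) are all-reduced across processes; the windowed statistics (`median`, `avg`, `max`, `value`) stay local to each process.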
""" if not is_dist_avail_and_initialized(): return t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') dist.barrier() dist.all_reduce(t) t = t.tolist() self.count = int(t[0]) self.total = t[1] @property def median(self): d = torch.tensor(list(self.deque)) return d.median().item() @property def avg(self): d = torch.tensor(list(self.deque), dtype=torch.float32) return d.mean().item() @property def global_avg(self): return self.total / self.count @property def max(self): return max(self.deque) @property def value(self): return self.deque[-1] def __str__(self): return self.fmt.format( median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value) def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.tensor([tensor.numel()], device="cuda") size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) if local_size != max_size: padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list def reduce_dict(input_dict, average=True): """ Args: input_dict (dict): all the values will be reduced average (bool): whether to do average or sum Reduce the values in the dictionary from all processes so that all processes have the averaged results. Returns a dict with the same fields as input_dict, after reduction. 
""" world_size = get_world_size() if world_size < 2: return input_dict with torch.no_grad(): names = [] values = [] # sort the keys so that they are consistent across processes for k in sorted(input_dict.keys()): names.append(k) values.append(input_dict[k]) values = torch.stack(values, dim=0) dist.all_reduce(values) if average: values /= world_size reduced_dict = {k: v for k, v in zip(names, values)} return reduced_dict class MetricLogger(object): def __init__(self, delimiter="\t"): self.meters = defaultdict(SmoothedValue) self.delimiter = delimiter def update(self, **kwargs): for k, v in kwargs.items(): if isinstance(v, torch.Tensor): v = v.item() assert isinstance(v, (float, int)) self.meters[k].update(v) def __getattr__(self, attr): if attr in self.meters: return self.meters[attr] if attr in self.__dict__: return self.__dict__[attr] raise AttributeError("'{}' object has no attribute '{}'".format( type(self).__name__, attr)) def __str__(self): loss_str = [] for name, meter in self.meters.items(): loss_str.append( "{}: {}".format(name, str(meter)) ) return self.delimiter.join(loss_str) def synchronize_between_processes(self): for meter in self.meters.values(): meter.synchronize_between_processes() def add_meter(self, name, meter): self.meters[name] = meter def log_every(self, iterable, print_freq, header=None): i = 0 if not header: header = '' start_time = time.time() end = time.time() iter_time = SmoothedValue(fmt='{avg:.4f}') data_time = SmoothedValue(fmt='{avg:.4f}') space_fmt = ':' + str(len(str(len(iterable)))) + 'd' if torch.cuda.is_available(): log_msg = self.delimiter.join([ header, '[{0' + space_fmt + '}/{1}]', 'eta: {eta}', '{meters}', 'time: {time}', 'data: {data}', 'max mem: {memory:.0f}' ]) else: log_msg = self.delimiter.join([ header, '[{0' + space_fmt + '}/{1}]', 'eta: {eta}', '{meters}', 'time: {time}', 'data: {data}' ]) MB = 1024.0 * 1024.0 for obj in iterable: data_time.update(time.time() - end) yield obj iter_time.update(time.time() - end) if i % print_freq == 0 or i == len(iterable) - 1: eta_seconds = iter_time.global_avg * (len(iterable) - i) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if torch.cuda.is_available(): print(log_msg.format( i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time), memory=torch.cuda.max_memory_allocated() / MB)) else: print(log_msg.format( i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time))) i += 1 end = time.time() total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('{} Total time: {} ({:.4f} s / it)'.format( header, total_time_str, total_time / len(iterable))) def get_sha(): cwd = os.path.dirname(os.path.abspath(__file__)) def _run(command): return subprocess.check_output(command, cwd=cwd).decode('ascii').strip() sha = 'N/A' diff = "clean" branch = 'N/A' try: sha = _run(['git', 'rev-parse', 'HEAD']) subprocess.check_output(['git', 'diff'], cwd=cwd) diff = _run(['git', 'diff-index', 'HEAD']) diff = "has uncommited changes" if diff else "clean" branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD']) except Exception: pass message = f"sha: {sha}, status: {diff}, branch: {branch}" return message def collate_fn(batch): batch = list(zip(*batch)) batch[0] = nested_tensor_from_tensor_list(batch[0], size_divisibility=32) return tuple(batch) def _max_by_axis(the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] for sublist in the_list[1:]: for index, item in 
enumerate(sublist): maxes[index] = max(maxes[index], item) return maxes def nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisibility=1, split=True): if split: tensor_list = [tensor.split(3,dim=0) for tensor in tensor_list] tensor_list = [item for sublist in tensor_list for item in sublist] # TODO make this more general if tensor_list[0].ndim == 3: # TODO make it support different-sized images max_size = _max_by_axis([list(img.shape) for img in tensor_list]) if size_divisibility > 1: stride = size_divisibility # the last two dims are H,W, both subject to divisibility requirement max_size[-2] = (max_size[-2] + (stride - 1)) // stride * stride max_size[-1] = (max_size[-1] + (stride - 1)) // stride * stride # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) batch_shape = [len(tensor_list)] + max_size b, c, h, w = batch_shape dtype = tensor_list[0].dtype device = tensor_list[0].device tensor = torch.zeros(batch_shape, dtype=dtype, device=device) mask = torch.ones((b, h, w), dtype=torch.bool, device=device) for img, pad_img, m in zip(tensor_list, tensor, mask): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) m[: img.shape[1], :img.shape[2]] = False else: raise ValueError('not supported') return NestedTensor(tensor, mask) class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors self.mask = mask def to(self, device, non_blocking=False): # type: (Device) -> NestedTensor # noqa cast_tensor = self.tensors.to(device, non_blocking=non_blocking) mask = self.mask if mask is not None: assert mask is not None cast_mask = mask.to(device, non_blocking=non_blocking) else: cast_mask = None return NestedTensor(cast_tensor, cast_mask) def record_stream(self, *args, **kwargs): self.tensors.record_stream(*args, **kwargs) if self.mask is not None: self.mask.record_stream(*args, **kwargs) def decompose(self): return self.tensors, self.mask def __repr__(self): return str(self.tensors) def setup_for_distributed(is_master): """ This function disables printing when not in master process """ import builtins as __builtin__ builtin_print = __builtin__.print def print(*args, **kwargs): force = kwargs.pop('force', False) if is_master or force: builtin_print(*args, **kwargs) __builtin__.print = print def is_dist_avail_and_initialized(): if not dist.is_available(): return False if not dist.is_initialized(): return False return True def get_world_size(): if not is_dist_avail_and_initialized(): return 1 return dist.get_world_size() def get_rank(): if not is_dist_avail_and_initialized(): return 0 return dist.get_rank() def get_local_size(): if not is_dist_avail_and_initialized(): return 1 return int(os.environ['LOCAL_SIZE']) def get_local_rank(): if not is_dist_avail_and_initialized(): return 0 return int(os.environ['LOCAL_RANK']) def is_main_process(): return get_rank() == 0 def save_on_master(*args, **kwargs): if is_main_process(): torch.save(*args, **kwargs) def init_distributed_mode(args): if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: args.rank = int(os.environ["RANK"]) args.world_size = int(os.environ['WORLD_SIZE']) args.gpu = int(os.environ['LOCAL_RANK']) args.dist_url = 'env://' os.environ['LOCAL_SIZE'] = str(torch.cuda.device_count()) elif 'SLURM_PROCID' in os.environ: proc_id = int(os.environ['SLURM_PROCID']) ntasks = int(os.environ['SLURM_NTASKS']) node_list = os.environ['SLURM_NODELIST'] num_gpus = torch.cuda.device_count() addr = subprocess.getoutput( 'scontrol show hostname {} | head 
-n1'.format(node_list)) os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', '29500') os.environ['MASTER_ADDR'] = addr os.environ['WORLD_SIZE'] = str(ntasks) os.environ['RANK'] = str(proc_id) os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) os.environ['LOCAL_SIZE'] = str(num_gpus) args.dist_url = 'env://' args.world_size = ntasks args.rank = proc_id args.gpu = proc_id % num_gpus else: print('Not using distributed mode') args.distributed = False return args.distributed = True torch.cuda.set_device(args.gpu) args.dist_backend = 'nccl' print('| distributed init (rank {}): {}'.format( args.rank, args.dist_url), flush=True) torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) torch.distributed.barrier() setup_for_distributed(args.rank == 0) @torch.no_grad() def accuracy(output, target, topk=(1,)): """Computes the precision@k for the specified values of k""" if target.numel() == 0: return [torch.zeros([], device=output.device)] maxk = max(topk) batch_size = target.size(0) _, pred = output.topk(maxk, 1, True, True) pred = pred.t() correct = pred.eq(target.view(1, -1).expand_as(pred)) res = [] for k in topk: correct_k = correct[:k].view(-1).float().sum(0) res.append(correct_k.mul_(100.0 / batch_size)) return res def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor """ Equivalent to nn.functional.interpolate, but with support for empty batch sizes. This will eventually be supported natively by PyTorch, and this class can go away. """ if float(torchvision.__version__[:3]) < 0.7: if input.numel() > 0: return torch.nn.functional.interpolate( input, size, scale_factor, mode, align_corners ) output_shape = _output_size(2, input, size, scale_factor) output_shape = list(input.shape[:-2]) + list(output_shape) if float(torchvision.__version__[:3]) < 0.5: return _NewEmptyTensorOp.apply(input, output_shape) return _new_empty_tensor(input, output_shape) else: return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) def get_total_grad_norm(parameters, norm_type=2): parameters = list(filter(lambda p: p.grad is not None, parameters)) norm_type = float(norm_type) device = parameters[0].grad.device total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) return total_norm def inverse_sigmoid(x, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) return torch.log(x1/x2) ================================================ FILE: util/plot_utils.py ================================================ # ------------------------------------------------------------------------ # SeqFormer # ------------------------------------------------------------------------ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # ------------------------------------------------------------------------ """ Plotting utilities to visualize training logs. """ import torch import pandas as pd import seaborn as sns import matplotlib.pyplot as plt from pathlib import Path, PurePath def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): ''' Function to plot specific fields from training log(s). Plots both training and test results. 
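An illustrative usage sketch (with placeholder paths) is appended as comments at the end of this file.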
:: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file - fields = which results to plot from each log file - plots both training and test for each field. - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots - log_name = optional, name of log file if different than default 'log.txt'. :: Outputs - matplotlib plots of results in fields, color coded for each log file. - solid lines are training results, dashed lines are test results. ''' func_name = "plot_utils.py::plot_logs" # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, # convert single Path to list to avoid 'not iterable' error if not isinstance(logs, list): if isinstance(logs, PurePath): logs = [logs] print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") else: raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ Expect list[Path] or single Path obj, received {type(logs)}") # verify valid dir(s) and that every item in list is Path object for i, dir in enumerate(logs): if not isinstance(dir, PurePath): raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") if dir.exists(): continue raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") # load log file(s) and plot dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): for j, field in enumerate(fields): if field == 'mAP': coco_eval = pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean() axs[j].plot(coco_eval, c=color) else: df.interpolate().ewm(com=ewm_col).mean().plot( y=[f'train_{field}', f'test_{field}'], ax=axs[j], color=[color] * 2, style=['-', '--'] ) for ax, field in zip(axs, fields): ax.legend([Path(p).name for p in logs]) ax.set_title(field) def plot_precision_recall(files, naming_scheme='iter'): if naming_scheme == 'exp_id': # name becomes exp_id names = [f.parts[-3] for f in files] elif naming_scheme == 'iter': names = [f.stem for f in files] else: raise ValueError(f'not supported {naming_scheme}') fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): data = torch.load(f) # precision is n_iou, n_points, n_cat, n_area, max_det precision = data['precision'] recall = data['params'].recThrs scores = data['scores'] # take precision for all classes, all areas and 100 detections precision = precision[0, :, :, 0, -1].mean(1) scores = scores[0, :, :, 0, -1].mean(1) prec = precision.mean() rec = data['recall'][0, :, 0, -1].mean() print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + f'score={scores.mean():0.3f}, ' + f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' ) axs[0].plot(recall, precision, c=color) axs[1].plot(recall, scores, c=color) axs[0].set_title('Precision / Recall') axs[0].legend(names) axs[1].set_title('Scores / Recall') axs[1].legend(names) return fig, axs
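# Illustrative usage sketch (added comments, not part of the original file).
# The experiment directories and files below are hypothetical placeholders; each
# log directory is expected to contain a DETR/detectron2-style `log.txt` with one
# JSON dict per line, and each .pth file a saved COCO evaluation dump with
# 'precision', 'recall', 'scores' and 'params' entries, as read by
# plot_precision_recall above.
#
# from pathlib import Path
#
# log_dirs = [Path("output/exp_r50"), Path("output/exp_r101")]
# plot_logs(log_dirs, fields=("class_error", "loss_bbox_unscaled", "mAP"), ewm_col=0)
# plt.show()
#
# eval_files = [Path("output/exp_r50/eval/latest.pth")]
# fig, axs = plot_precision_recall(eval_files, naming_scheme="iter")
# plt.show()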