Repository: SysCV/MaskFreeVIS
Branch: main
Commit: 0e7018b7fe61
Files: 215
Total size: 1.3 MB
Directory structure:
gitextract_tlc1nw96/
├── DATASET_prepare.md
├── LICENSE
├── README.md
├── configs/
│ ├── coco/
│ │ └── instance-segmentation/
│ │ ├── Base-COCO-InstanceSegmentation.yaml
│ │ └── maskformer2_R50_bs16_50ep.yaml
│ └── youtubevis_2019/
│ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml
│ ├── Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml
│ ├── Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml
│ ├── swin/
│ │ └── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml
│ ├── video_maskformer2_R101_bs16_8ep.yaml
│ ├── video_maskformer2_R50_bs16_8ep.yaml
│ └── video_maskformer2_R50_bs16_8ep_swin.yaml
├── demo/
│ ├── README.md
│ ├── demo.py
│ └── predictor.py
├── demo_video/
│ ├── README.md
│ ├── demo.py
│ ├── predictor.py
│ └── visualizer.py
├── mask2former/
│ ├── __init__.py
│ ├── config.py
│ ├── data/
│ │ ├── __init__.py
│ │ ├── dataset_mappers/
│ │ │ ├── __init__.py
│ │ │ ├── __init__.py.new
│ │ │ ├── coco_instance_new_baseline_dataset_mapper.py
│ │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py
│ │ │ ├── mask_former_instance_dataset_mapper.py
│ │ │ ├── mask_former_panoptic_dataset_mapper.py
│ │ │ └── mask_former_semantic_dataset_mapper.py
│ │ └── datasets/
│ │ ├── __init__.py
│ │ ├── register_ade20k_full.py
│ │ ├── register_ade20k_instance.py
│ │ ├── register_ade20k_panoptic.py
│ │ ├── register_coco_panoptic_annos_semseg.py
│ │ ├── register_coco_stuff_10k.py
│ │ ├── register_mapillary_vistas.py
│ │ └── register_mapillary_vistas_panoptic.py
│ ├── evaluation/
│ │ ├── __init__.py
│ │ ├── __init__.py.new
│ │ └── instance_evaluation.py
│ ├── maskformer_model.py
│ ├── modeling/
│ │ ├── __init__.py
│ │ ├── backbone/
│ │ │ ├── __init__.py
│ │ │ ├── __init__.py.new
│ │ │ └── swin.py
│ │ ├── criterion.py
│ │ ├── matcher.py
│ │ ├── meta_arch/
│ │ │ ├── __init__.py
│ │ │ ├── __init__.py.new
│ │ │ ├── mask_former_head.py
│ │ │ └── per_pixel_baseline.py
│ │ ├── pixel_decoder/
│ │ │ ├── __init__.py
│ │ │ ├── __init__.py.new
│ │ │ ├── fpn.py
│ │ │ ├── msdeformattn.py
│ │ │ └── ops/
│ │ │ ├── functions/
│ │ │ │ ├── __init__.py
│ │ │ │ └── ms_deform_attn_func.py
│ │ │ ├── make.sh
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ └── ms_deform_attn.py
│ │ │ ├── setup.py
│ │ │ ├── src/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── ms_deform_attn_cpu.cpp
│ │ │ │ │ └── ms_deform_attn_cpu.h
│ │ │ │ ├── cuda/
│ │ │ │ │ ├── ms_deform_attn_cuda.cu
│ │ │ │ │ ├── ms_deform_attn_cuda.h
│ │ │ │ │ └── ms_deform_im2col_cuda.cuh
│ │ │ │ ├── ms_deform_attn.h
│ │ │ │ └── vision.cpp
│ │ │ └── test.py
│ │ └── transformer_decoder/
│ │ ├── __init__.py
│ │ ├── mask2former_transformer_decoder.py
│ │ ├── maskformer_transformer_decoder.py
│ │ ├── position_encoding.py
│ │ └── transformer.py
│ ├── test_time_augmentation.py
│ └── utils/
│ ├── __init__.py
│ ├── __init__.py.new
│ └── misc.py
├── mask2former_video/
│ ├── __init__.py
│ ├── config.py
│ ├── data_video/
│ │ ├── __init__.py
│ │ ├── augmentation.py
│ │ ├── build.py
│ │ ├── combined_loader.py
│ │ ├── dataset_mapper.py
│ │ ├── datasets/
│ │ │ ├── __init__.py
│ │ │ ├── builtin.py
│ │ │ ├── ytvis.py
│ │ │ └── ytvis_api/
│ │ │ ├── __init__.py
│ │ │ ├── ytvos.py
│ │ │ └── ytvoseval.py
│ │ └── ytvis_eval.py
│ ├── modeling/
│ │ ├── __init__.py
│ │ ├── criterion.py
│ │ ├── matcher.py
│ │ └── transformer_decoder/
│ │ ├── __init__.py
│ │ ├── position_encoding.py
│ │ └── video_mask2former_transformer_decoder.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── __init__.py.new
│ │ └── memory.py
│ └── video_maskformer_model.py
├── mfvis_nococo/
│ ├── __init__.py
│ ├── configs/
│ │ └── youtubevis_2019/
│ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml
│ │ ├── video_maskformer2_R101_bs16_8ep_coco.yaml
│ │ ├── video_maskformer2_R50_bs16_8ep.yaml
│ │ └── video_maskformer2_R50_bs16_8ep_coco.yaml
│ ├── mask2former/
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── dataset_mappers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __init__.py.new
│ │ │ │ ├── coco_instance_new_baseline_dataset_mapper.py
│ │ │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py
│ │ │ │ ├── mask_former_instance_dataset_mapper.py
│ │ │ │ ├── mask_former_panoptic_dataset_mapper.py
│ │ │ │ └── mask_former_semantic_dataset_mapper.py
│ │ │ └── datasets/
│ │ │ ├── __init__.py
│ │ │ ├── register_ade20k_full.py
│ │ │ ├── register_ade20k_instance.py
│ │ │ ├── register_ade20k_panoptic.py
│ │ │ ├── register_coco_panoptic_annos_semseg.py
│ │ │ ├── register_coco_stuff_10k.py
│ │ │ ├── register_mapillary_vistas.py
│ │ │ └── register_mapillary_vistas_panoptic.py
│ │ ├── evaluation/
│ │ │ ├── __init__.py
│ │ │ ├── __init__.py.new
│ │ │ └── instance_evaluation.py
│ │ ├── maskformer_model.py
│ │ ├── modeling/
│ │ │ ├── __init__.py
│ │ │ ├── backbone/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __init__.py.new
│ │ │ │ └── swin.py
│ │ │ ├── criterion.py
│ │ │ ├── matcher.py
│ │ │ ├── meta_arch/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __init__.py.new
│ │ │ │ ├── mask_former_head.py
│ │ │ │ └── per_pixel_baseline.py
│ │ │ ├── pixel_decoder/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __init__.py.new
│ │ │ │ ├── fpn.py
│ │ │ │ ├── msdeformattn.py
│ │ │ │ └── ops/
│ │ │ │ ├── functions/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── ms_deform_attn_func.py
│ │ │ │ ├── make.sh
│ │ │ │ ├── modules/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── ms_deform_attn.py
│ │ │ │ ├── setup.py
│ │ │ │ ├── src/
│ │ │ │ │ ├── cpu/
│ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp
│ │ │ │ │ │ └── ms_deform_attn_cpu.h
│ │ │ │ │ ├── cuda/
│ │ │ │ │ │ ├── ms_deform_attn_cuda.cu
│ │ │ │ │ │ ├── ms_deform_attn_cuda.h
│ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh
│ │ │ │ │ ├── ms_deform_attn.h
│ │ │ │ │ └── vision.cpp
│ │ │ │ └── test.py
│ │ │ └── transformer_decoder/
│ │ │ ├── __init__.py
│ │ │ ├── mask2former_transformer_decoder.py
│ │ │ ├── maskformer_transformer_decoder.py
│ │ │ ├── position_encoding.py
│ │ │ └── transformer.py
│ │ ├── test_time_augmentation.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── __init__.py.new
│ │ └── misc.py
│ ├── mask2former_video/
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_video/
│ │ │ ├── __init__.py
│ │ │ ├── augmentation.py
│ │ │ ├── build.py
│ │ │ ├── dataset_mapper.py
│ │ │ ├── datasets/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── builtin.py
│ │ │ │ ├── ytvis.py
│ │ │ │ └── ytvis_api/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── ytvos.py
│ │ │ │ └── ytvoseval.py
│ │ │ └── ytvis_eval.py
│ │ ├── modeling/
│ │ │ ├── __init__.py
│ │ │ ├── criterion.py
│ │ │ ├── matcher.py
│ │ │ └── transformer_decoder/
│ │ │ ├── __init__.py
│ │ │ ├── position_encoding.py
│ │ │ └── video_mask2former_transformer_decoder.py
│ │ ├── utils/
│ │ │ ├── __init__.py
│ │ │ └── memory.py
│ │ └── video_maskformer_model.py
│ ├── scripts/
│ │ ├── eval_8gpu_mask2former_r101_video.sh
│ │ ├── train_8gpu_mask2former_r101_video_coco.sh
│ │ ├── train_8gpu_mask2former_r50_video.sh
│ │ ├── train_8gpu_mask2former_r50_video_coco.sh
│ │ ├── visual_video_r101.sh
│ │ └── visual_video_r50.sh
│ └── train_net_video.py
├── requirements.txt
├── scripts/
│ ├── eval_8gpu_mask2former_r101_video.sh
│ ├── eval_8gpu_mask2former_r50_video.sh
│ ├── eval_8gpu_mask2former_swinl_video.sh
│ ├── train_8gpu_mask2former_r101_video.sh
│ ├── train_8gpu_mask2former_r50_video.sh
│ ├── train_8gpu_mask2former_swinl_video.sh
│ └── visual_video.sh
├── tools/
│ ├── README.md
│ ├── analyze_model.py
│ ├── convert-pretrained-swin-model-to-d2.py
│ ├── convert-torchvision-to-d2.py
│ ├── evaluate_coco_boundary_ap.py
│ └── evaluate_pq_for_semantic_segmentation.py
├── train_net.py
├── train_net_video.py
└── util/
├── __init__.py
├── box_ops.py
├── misc.py
└── plot_utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: DATASET_prepare.md
================================================
# Prepare Datasets for MaskFreeVIS
A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog)
for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc).
This document explains how to setup the builtin datasets so they can be used by the above APIs.
[Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`,
and how to add new datasets to them.
MaskFreeVIS has builtin support for a few datasets.
The datasets are assumed to exist in a directory specified by the environment variable
`DETECTRON2_DATASETS`.
You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
If left unset, the default is `./datasets` relative to your current working directory.
The model zoo contains configs and models that use these builtin datasets. We convert each object mask to a bounding box after reading the corresponding instance annotation.
## Expected dataset structure for [COCO](https://cocodataset.org/#download):
```
coco/
annotations/
instances_{train,val}2017.json
panoptic_{train,val}2017.json
{train,val}2017/
# image files that are mentioned in the corresponding json
panoptic_{train,val}2017/ # png annotations
panoptic_semseg_{train,val}2017/ # generated by the script mentioned below
```
Install panopticapi by:
```
pip install git+https://github.com/cocodataset/panopticapi.git
```
Then, run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py`, to extract semantic annotations from panoptic annotations (only used for evaluation).
## Expected dataset structure for [YouTubeVIS 2019](https://competitions.codalab.org/competitions/20128):
```
ytvis_2019/
{train,valid,test}.json
{train,valid,test}/
Annotations/
JPEGImages/
```
## Expected dataset structure for [YouTubeVIS 2021](https://competitions.codalab.org/competitions/28988):
```
ytvis_2021/
{train,valid,test}.json
{train,valid,test}/
Annotations/
JPEGImages/
```
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# MaskFreeVIS
Mask-Free Video Instance Segmentation [CVPR 2023].
This is the official pytorch implementation of [MaskFreeVIS](https://github.com/SysCV/MaskFreeVis/) built on the open-source detectron2. We aim to **remove the necessity for expensive video masks and even image masks** for training VIS models. Our project website contains more information, including the visual video comparison: [vis.xyz/pub/maskfreevis](https://www.vis.xyz/pub/maskfreevis/).
> [**Mask-Free Video Instance Segmentation**](https://arxiv.org/abs/2303.15904)
> Lei Ke, Martin Danelljan, Henghui Ding, Yu-Wing Tai, Chi-Keung Tang, Fisher Yu \
> CVPR 2023
Highlights
-----------------
- **High-performing** video instance segmentation **without using any video mask or even image mask** labels. Using SwinL and built on Mask2Former, MaskFreeVIS achieves 56.0 AP on YTVIS without using any video mask labels. Using ResNet-101, MaskFreeVIS achieves 49.1 AP without using video masks, and 47.3 AP using only a COCO-mask-initialized model.
- **Novelty:** a new **parameter-free** Temporal KNN-patch Loss (TK-Loss), which leverages temporal mask consistency using unsupervised one-to-k patch correspondence.
- **Simple:** TK-Loss can be flexibly integrated with state-of-the-art transformer-based VIS models, and has no trainable parameters.
Visualization results of MaskFreeVIS
-----------------
Introduction
-----------------
The recent advancement in Video Instance Segmentation (VIS) has largely been driven by the use of deeper and increasingly data-hungry transformer-based models. However, video masks are tedious and expensive to annotate, limiting the scale and diversity of existing VIS datasets. In this work, we aim to remove the mask-annotation requirement. We propose MaskFreeVIS, achieving highly competitive VIS performance, while only using bounding box annotations for the object state. We leverage the rich temporal mask consistency constraints in videos by introducing the Temporal KNN-patch Loss (TK-Loss), providing strong mask supervision without any labels. Our TK-Loss finds one-to-many matches across frames, through an efficient patch-matching step followed by a K-nearest neighbor selection. A consistency loss is then enforced on the found matches. Our mask-free objective is simple to implement, has no trainable parameters, is computationally efficient, yet outperforms baselines employing, e.g., state-of-the-art optical flow to enforce temporal mask consistency. We validate MaskFreeVIS on the YouTube-VIS 2019/2021, OVIS and BDD100K MOTS benchmarks. The results clearly demonstrate the efficacy of our method by drastically narrowing the gap between fully and weakly-supervised VIS performance.
Methods
-----------------
### **Installation**
Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage.
### Requirements
- Linux or macOS with Python ≥ 3.6
- PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that the
PyTorch version matches the one required by Detectron2.
- Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html).
- OpenCV is optional but needed by demo and visualization
- `pip install -r requirements.txt`
### CUDA kernel for MSDeformAttn
After preparing the required environment, run the following command to compile CUDA kernel for MSDeformAttn:
`CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit.
```bash
cd mask2former/modeling/pixel_decoder/ops
sh make.sh
```
#### Building on another system
To build on a system that does not have a GPU device but provides the drivers:
```bash
TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install
```
### Example conda environment setup
```bash
conda create --name maskfreevis python=3.8 -y
conda activate maskfreevis
conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia
pip install -U opencv-python
# under your working directory
git clone git@github.com:facebookresearch/detectron2.git
cd detectron2
pip install -e .
cd ..
git clone https://github.com/SysCV/MaskFreeVIS.git
cd MaskFreeVIS
pip install -r requirements.txt
cd mask2former/modeling/pixel_decoder/ops
sh make.sh
```
### **Dataset preparation**
Please see the document [here](DATASET_prepare.md).
### **Model Zoo**
## Video Instance Segmentation (YouTubeVIS)
Using COCO image masks **without YTVIS video masks** during training:
**For the two training settings below, which do not use pseudo COCO image masks** for joint video training, please change the folder to:
```
cd mfvis_nococo
```
1) Only using **COCO mask initialized model without YTVIS video masks** during training:
2) Only using **COCO box initialized model without YTVIS video masks** during training:
Please see our script folder.
## Inference & Evaluation
First, download the provided trained models from our model zoo table and put them into the `mfvis_models` folder.
```
mkdir mfvis_models
```
Refer to our [scripts folder](./scripts) for more commands:
Example evaluation scripts:
```
bash scripts/eval_8gpu_mask2former_r50_video.sh
bash scripts/eval_8gpu_mask2former_r101_video.sh
bash scripts/eval_8gpu_mask2former_swinl_video.sh
```
## Results Visualization
Example visualization script:
```
bash scripts/visual_video.sh
```
Citation
---------------
If you find MaskFreeVIS useful in your research or refer to the provided baseline results, please star :star: this repository and consider citing :pencil::
```
@inproceedings{maskfreevis,
author={Ke, Lei and Danelljan, Martin and Ding, Henghui and Tai, Yu-Wing and Tang, Chi-Keung and Yu, Fisher},
title={Mask-Free Video Instance Segmentation},
booktitle = {CVPR},
year = {2023}
}
```
## Acknowledgments
- Thanks to [BoxInst](https://github.com/aim-uofa/AdelaiDet/blob/master/configs/BoxInst/README.md) for the image-based instance segmentation losses.
- Thanks to [Mask2Former](https://github.com/facebookresearch/Mask2Former) and [VMT](https://github.com/SysCV/vmt) for providing useful inference and evaluation toolkits.
================================================
FILE: configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml
================================================
MODEL:
BACKBONE:
FREEZE_AT: 0
NAME: "build_resnet_backbone"
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
RESNETS:
DEPTH: 50
STEM_TYPE: "basic" # not used
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: False
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
# NORM: "SyncBN"
RES5_MULTI_GRID: [1, 1, 1] # not used
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.0001
STEPS: (327778, 355092)
MAX_ITER: 368750
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
BACKBONE_MULTIPLIER: 0.1
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
INPUT:
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
FORMAT: "RGB"
DATASET_MAPPER_NAME: "coco_instance_lsj"
TEST:
EVAL_PERIOD: 5000
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
NUM_WORKERS: 4
VERSION: 2
================================================
FILE: configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml
================================================
_BASE_: Base-COCO-InstanceSegmentation.yaml
OUTPUT_DIR: './output/'
MODEL:
META_ARCHITECTURE: "MaskFormer"
SEM_SEG_HEAD:
NAME: "MaskFormerHead"
IGNORE_VALUE: 255
NUM_CLASSES: 80
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
# pixel decoder
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
IN_FEATURES: ["res2", "res3", "res4", "res5"]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
MASK_FORMER:
TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 2.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 100
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TEST:
SEMANTIC_ON: False
INSTANCE_ON: True
PANOPTIC_ON: False
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.8
================================================
FILE: configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml
================================================
MODEL:
BACKBONE:
FREEZE_AT: 0
NAME: "build_resnet_backbone"
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MASK_ON: True
RESNETS:
DEPTH: 50
STEM_TYPE: "basic" # not used
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: False
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
# NORM: "SyncBN"
RES5_MULTI_GRID: [1, 1, 1] # not used
DATASETS:
TRAIN: ("ytvis_2019_train", "coco_2017_train_fake",)
TEST: ("ytvis_2019_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.0001
STEPS: (4000,)
MAX_ITER: 6000
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
BACKBONE_MULTIPLIER: 0.1
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
INPUT:
MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
RANDOM_FLIP: "flip_by_clip"
AUGMENTATIONS: []
MIN_SIZE_TRAIN: (360, 480)
MIN_SIZE_TEST: 360
CROP:
ENABLED: False
TYPE: "absolute_range"
SIZE: (600, 720)
FORMAT: "RGB"
TEST:
EVAL_PERIOD: 0
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
VERSION: 2
================================================
FILE: configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml
================================================
MODEL:
BACKBONE:
FREEZE_AT: 0
NAME: "build_resnet_backbone"
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MASK_ON: True
RESNETS:
DEPTH: 50
STEM_TYPE: "basic" # not used
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: False
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
# NORM: "SyncBN"
RES5_MULTI_GRID: [1, 1, 1] # not used
DATASETS:
TRAIN: ("coco_2017_train_fake", "ytvis_2019_train",)
TEST: ("ytvis_2019_val",)
SOLVER:
IMS_PER_BATCH: 8
BASE_LR: 0.00005
STEPS: (75000,)
MAX_ITER: 140000
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
BACKBONE_MULTIPLIER: 0.1
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
INPUT:
MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
RANDOM_FLIP: "flip_by_clip"
AUGMENTATIONS: []
MIN_SIZE_TRAIN: (360, 480)
MIN_SIZE_TEST: 360
CROP:
ENABLED: False
TYPE: "absolute_range"
SIZE: (600, 720)
FORMAT: "RGB"
TEST:
EVAL_PERIOD: 0
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
VERSION: 2
================================================
FILE: configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml
================================================
MODEL:
BACKBONE:
FREEZE_AT: 0
NAME: "build_resnet_backbone"
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MASK_ON: True
RESNETS:
DEPTH: 50
STEM_TYPE: "basic" # not used
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: False
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
# NORM: "SyncBN"
RES5_MULTI_GRID: [1, 1, 1] # not used
DATASETS:
TRAIN: ("coco_2017_train_fake", "ytvis_2019_train",)
TEST: ("ytvis_2019_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.0001
STEPS: (37500,)
MAX_ITER: 70000
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
BACKBONE_MULTIPLIER: 0.1
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
INPUT:
MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
RANDOM_FLIP: "flip_by_clip"
AUGMENTATIONS: []
MIN_SIZE_TRAIN: (360, 480)
MIN_SIZE_TEST: 360
CROP:
ENABLED: False
TYPE: "absolute_range"
SIZE: (600, 720)
FORMAT: "RGB"
TEST:
EVAL_PERIOD: 0
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
VERSION: 2
================================================
FILE: configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml
================================================
# Swin backbone variant: inherits ../video_maskformer2_R50_bs16_8ep_swin.yaml and
# overrides the output directory, initial weights, backbone, query count and test size.
_BASE_: ../video_maskformer2_R50_bs16_8ep_swin.yaml
OUTPUT_DIR: 'swinl_joint_withcoco'
MODEL:
# Initial checkpoint, expected under ./pretrained_model/.
WEIGHTS: "./pretrained_model/model_final_e5f453.pkl"
BACKBONE:
NAME: "D2SwinTransformer"
# Swin hyper-parameters (four stages: DEPTHS and NUM_HEADS each list four values).
SWIN:
EMBED_DIM: 192
DEPTHS: [2, 2, 18, 2]
NUM_HEADS: [6, 12, 24, 48]
WINDOW_SIZE: 12
APE: False
DROP_PATH_RATE: 0.3
PATCH_NORM: True
PRETRAIN_IMG_SIZE: 384
# Alternative weights path kept for reference (same file name, no directory prefix).
#WEIGHTS: "model_final_e5f453.pkl"
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MASK_FORMER:
# The inherited config sets 100 queries; this variant uses 200.
NUM_OBJECT_QUERIES: 200
INPUT:
MIN_SIZE_TEST: 480
================================================
FILE: configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml
================================================
# ResNet-101 variant: inherits video_maskformer2_R50_bs16_8ep.yaml and overrides only
# the output directory, the initial weights and the RESNETS section (DEPTH: 101).
_BASE_: video_maskformer2_R50_bs16_8ep.yaml
OUTPUT_DIR: './r101_coco_joint/'
MODEL:
# Initial checkpoint, expected under pretrained_model/.
WEIGHTS: "pretrained_model/model_final_eba159.pkl"
RESNETS:
DEPTH: 101
STEM_TYPE: "basic" # not used
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: False
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
# NORM: "SyncBN"
RES5_MULTI_GRID: [1, 1, 1] # not used
================================================
FILE: configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml
================================================
# ResNet-50 video config: inherits Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml
# and adds the VideoMaskFormer head, decoder, test-time and input settings.
_BASE_: Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml
OUTPUT_DIR: './r50_coco_joint/'
# Fixed seed.
SEED: 29118357
MODEL:
# Initial checkpoint, expected under ./pretrained_model/.
WEIGHTS: "./pretrained_model/model_final_3c8ec9.pkl"
META_ARCHITECTURE: "VideoMaskFormer"
# Segmentation head and pixel decoder.
SEM_SEG_HEAD:
NAME: "MaskFormerHead"
IGNORE_VALUE: 255
NUM_CLASSES: 40
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
# pixel decoder
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
IN_FEATURES: ["res2", "res3", "res4", "res5"]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
# Transformer decoder and loss weights.
MASK_FORMER:
TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder"
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 2.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 100
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
# The trailing values are earlier settings left in place by the authors.
TRAIN_NUM_POINTS: 20000 #20000 #12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
# Only instance output is switched on.
TEST:
SEMANTIC_ON: False
INSTANCE_ON: True
PANOPTIC_ON: False
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.8
# Training input: frame sampling, augmentation and crop settings.
INPUT:
MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
# NOTE(review): PSEUDO presumably configures clips built from still images -- confirm in the mapper.
PSEUDO:
SAMPLING_FRAME_NUM: 4
SAMPLING_FRAME_RANGE: 20
AUGMENTATIONS: ['rotation']
MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
MAX_SIZE_TRAIN: 768
CROP:
ENABLED: True
TYPE: "absolute_range"
SIZE: (384, 600)
# Large-scale-jitter block is present but disabled.
LSJ_AUG:
ENABLED: False
IMAGE_SIZE: 768
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
# NUM_WORKERS: 8
================================================
FILE: configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep_swin.yaml
================================================
# Shared settings for the Swin configs: inherits Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml
# and is itself the _BASE_ of swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml,
# which overrides the backbone, weights and query count set here.
_BASE_: Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml
OUTPUT_DIR: './swinl_joint_withcoco/'
# Fixed seed.
SEED: 29118357
MODEL:
# Initial checkpoint, expected under ./pretrained_model/.
WEIGHTS: "./pretrained_model/model_final_3c8ec9.pkl"
META_ARCHITECTURE: "VideoMaskFormer"
# Segmentation head and pixel decoder.
SEM_SEG_HEAD:
NAME: "MaskFormerHead"
IGNORE_VALUE: 255
NUM_CLASSES: 40
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
# pixel decoder
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
IN_FEATURES: ["res2", "res3", "res4", "res5"]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
# Transformer decoder and loss weights.
MASK_FORMER:
TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder"
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 2.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 100
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
# The trailing values are earlier settings left in place by the authors.
TRAIN_NUM_POINTS: 20000 #20000 #12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
# Only instance output is switched on.
TEST:
SEMANTIC_ON: False
INSTANCE_ON: True
PANOPTIC_ON: False
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.8
# Training input: frame sampling, augmentation and crop settings.
INPUT:
MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
# NOTE(review): PSEUDO presumably configures clips built from still images -- confirm in the mapper.
PSEUDO:
SAMPLING_FRAME_NUM: 4
SAMPLING_FRAME_RANGE: 20
AUGMENTATIONS: ['rotation']
MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
MAX_SIZE_TRAIN: 768
CROP:
ENABLED: True
TYPE: "absolute_range"
SIZE: (384, 600)
# Large-scale-jitter block is present but disabled.
LSJ_AUG:
ENABLED: False
IMAGE_SIZE: 768
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
# NUM_WORKERS: 8
================================================
FILE: demo/README.md
================================================
## Mask2Former Demo
We provide a command line tool to run a simple demo of builtin configs.
The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md).
================================================
FILE: demo/demo.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py
import argparse
import glob
import multiprocessing as mp
import os
# fmt: off
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# fmt: on
import tempfile
import time
import warnings
import cv2
import numpy as np
import tqdm
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.utils.logger import setup_logger
from mask2former import add_maskformer2_config
from predictor import VisualizationDemo
# constants
WINDOW_NAME = "mask2former demo"
def setup_cfg(args):
    """Build the frozen config for the image demo.

    Args:
        args: parsed command-line arguments; ``args.config_file`` and
            ``args.opts`` are merged into the config, in that order.

    Returns:
        The config node, frozen after merging.
    """
    config = get_cfg()
    # Register the DeepLab and Mask2Former keys before merging the file.
    for register_keys in (add_deeplab_config, add_maskformer2_config):
        register_keys(config)
    config.merge_from_file(args.config_file)
    config.merge_from_list(args.opts)
    config.freeze()
    return config
def get_parser():
    """Build the command-line parser for the image / webcam / video demo.

    Returns:
        argparse.ArgumentParser: parser exposing ``--config-file``, ``--webcam``,
        ``--video-input``, ``--input``, ``--output``, ``--confidence-threshold``
        and ``--opts``.
    """
    parser = argparse.ArgumentParser(description="maskformer2 demo for builtin configs")
    parser.add_argument(
        "--config-file",
        # The repository ships configs/coco/instance-segmentation/ only, so the
        # default must point there; the former panoptic path does not exist.
        default="configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
    parser.add_argument("--video-input", help="Path to video file.")
    parser.add_argument(
        "--input",
        nargs="+",
        help="A list of space separated input images; "
        "or a single glob pattern such as 'directory/*.jpg'",
    )
    parser.add_argument(
        "--output",
        help="A file or directory to save output visualizations. "
        "If not given, will show output in an OpenCV window.",
    )
    parser.add_argument(
        "--confidence-threshold",
        type=float,
        default=0.5,
        help="Minimum score for instance predictions to be shown",
    )
    parser.add_argument(
        "--opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=[],
        # Everything after --opts is collected verbatim as config overrides.
        nargs=argparse.REMAINDER,
    )
    return parser
def test_opencv_video_format(codec, file_ext):
    """Probe whether OpenCV can write a video with the given codec and extension.

    Writes 30 black 10x10 frames into a temporary directory.

    Args:
        codec (str): four-character codec code, e.g. ``"x264"``.
        file_ext (str): file extension including the dot, e.g. ``".mkv"``.

    Returns:
        bool: True if the output file exists after the writer is released.
    """
    with tempfile.TemporaryDirectory(prefix="video_format_test") as tmp_dir:
        probe_path = os.path.join(tmp_dir, "test_file" + file_ext)
        probe_writer = cv2.VideoWriter(
            filename=probe_path,
            fourcc=cv2.VideoWriter_fourcc(*codec),
            fps=float(30),
            frameSize=(10, 10),
            isColor=True,
        )
        for _ in range(30):
            probe_writer.write(np.zeros((10, 10, 3), np.uint8))
        probe_writer.release()
        # Must be checked before the temporary directory is removed.
        return os.path.isfile(probe_path)
if __name__ == "__main__":
    # Entry point: run the demo on images (--input), a webcam (--webcam) or a
    # video file (--video-input), then show or save the visualizations.
    mp.set_start_method("spawn", force=True)
    args = get_parser().parse_args()
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    demo = VisualizationDemo(cfg)

    if args.input:
        if len(args.input) == 1:
            # A single value may be a glob pattern (optionally starting with "~").
            args.input = glob.glob(os.path.expanduser(args.input[0]))
            assert args.input, "The input path(s) was not found"
        if args.output:
            # In image mode --output is always used as a directory; create it once.
            os.makedirs(args.output, exist_ok=True)
        for path in tqdm.tqdm(args.input, disable=not args.output):
            # use PIL, to be consistent with evaluation
            img = read_image(path, format="BGR")
            start_time = time.time()
            predictions, visualized_output = demo.run_on_image(img, args.confidence_threshold)
            logger.info(
                "{}: {} in {:.2f}s".format(
                    path,
                    "detected {} instances".format(len(predictions["instances"]))
                    if "instances" in predictions
                    else "finished",
                    time.time() - start_time,
                )
            )

            if args.output:
                out_filename = os.path.join(args.output, os.path.basename(path))
                visualized_output.save(out_filename)
            else:
                cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
                cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
                if cv2.waitKey(0) == 27:
                    break  # esc to quit
    elif args.webcam:
        assert args.input is None, "Cannot have both --input and --webcam!"
        assert args.output is None, "output not yet supported with --webcam!"
        cam = cv2.VideoCapture(0)
        for vis in tqdm.tqdm(demo.run_on_video(cam)):
            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
            cv2.imshow(WINDOW_NAME, vis)
            if cv2.waitKey(1) == 27:
                break  # esc to quit
        cam.release()
        cv2.destroyAllWindows()
    elif args.video_input:
        video = cv2.VideoCapture(args.video_input)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames_per_second = video.get(cv2.CAP_PROP_FPS)
        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        basename = os.path.basename(args.video_input)
        # Prefer x264/.mkv; fall back to mp4v/.mp4 when the probe file cannot be written.
        codec, file_ext = (
            ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
        )
        # The fallback codec is "mp4v" (no leading dot); the old comparison with
        # ".mp4v" could never match, so this warning was never emitted.
        if codec == "mp4v":
            warnings.warn("x264 codec not available, switching to mp4v")
        if args.output:
            if os.path.isdir(args.output):
                output_fname = os.path.join(args.output, basename)
                output_fname = os.path.splitext(output_fname)[0] + file_ext
            else:
                output_fname = args.output
            # Refuse to overwrite an existing file.
            assert not os.path.isfile(output_fname), output_fname
            output_file = cv2.VideoWriter(
                filename=output_fname,
                # some installation of opencv may not support x264 (due to its license),
                # you can try other format (e.g. MPEG)
                fourcc=cv2.VideoWriter_fourcc(*codec),
                fps=float(frames_per_second),
                frameSize=(width, height),
                isColor=True,
            )
        assert os.path.isfile(args.video_input)
        for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
            if args.output:
                output_file.write(vis_frame)
            else:
                cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
                cv2.imshow(basename, vis_frame)
                if cv2.waitKey(1) == 27:
                    break  # esc to quit
        video.release()
        if args.output:
            output_file.release()
        else:
            cv2.destroyAllWindows()
================================================
FILE: demo/predictor.py
================================================
# Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
import atexit
import bisect
import multiprocessing as mp
from collections import deque
import cv2
import torch
import numpy as np
from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer
import matplotlib.pyplot as plt
class VisualizationDemo(object):
    """Runs a predictor on images or video frames and renders its predictions."""

    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        """
        Args:
            cfg (CfgNode): config used to build the predictor. The first entry
                of ``cfg.DATASETS.TEST`` (or ``"__unused"`` when it is empty)
                selects the metadata.
            instance_mode (ColorMode): forwarded to the visualizers.
            parallel (bool): whether to run the model in different processes from
                visualization. Useful since the visualization logic can be slow.
        """
        test_sets = cfg.DATASETS.TEST
        self.metadata = MetadataCatalog.get(test_sets[0] if len(test_sets) else "__unused")
        self.cpu_device = torch.device("cpu")
        self.instance_mode = instance_mode
        self.parallel = parallel
        self.cfg_vis = cfg
        if not parallel:
            self.predictor = DefaultPredictor(cfg)
        else:
            self.predictor = AsyncPredictor(cfg, num_gpus=torch.cuda.device_count())

    def run_on_image(self, image, conf_thre):
        """
        Args:
            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
                This is the format used by OpenCV.
            conf_thre (float): instances scoring below this value are not drawn.

        Returns:
            predictions (dict): the unfiltered output of the model.
            vis_output (VisImage): the visualized image output, or None when the
                predictions contain none of the handled keys.
        """
        predictions = self.predictor(image)
        # The visualizer works in RGB, so reverse the channel axis of the BGR input.
        rgb_image = image[:, :, ::-1]
        visualizer = Visualizer(rgb_image, self.metadata, instance_mode=self.instance_mode)

        # Panoptic output takes precedence over everything else.
        if "panoptic_seg" in predictions:
            panoptic_seg, segments_info = predictions["panoptic_seg"]
            rendered = visualizer.draw_panoptic_seg_predictions(
                panoptic_seg.to(self.cpu_device), segments_info
            )
            return predictions, rendered

        rendered = None
        if "sem_seg" in predictions:
            class_map = predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
            rendered = visualizer.draw_sem_seg(class_map)
        if "instances" in predictions:
            kept = predictions["instances"].to(self.cpu_device)
            kept = kept[kept.scores >= conf_thre]
            # When both are present, the instance rendering is the one returned.
            rendered = visualizer.draw_instance_predictions(predictions=kept)
        return predictions, rendered

    def _frame_from_video(self, video):
        """Yield frames from ``video`` until it closes or a read fails."""
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            yield frame

    def run_on_video(self, video):
        """
        Visualizes predictions on frames of the input video.

        Args:
            video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
                either a webcam or a video file.

        Yields:
            ndarray: BGR visualizations of each video frame.
        """
        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)

        def process_predictions(frame, predictions):
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if "panoptic_seg" in predictions:
                panoptic_seg, segments_info = predictions["panoptic_seg"]
                vis_frame = video_visualizer.draw_panoptic_seg_predictions(
                    rgb_frame, panoptic_seg.to(self.cpu_device), segments_info
                )
            elif "instances" in predictions:
                instances = predictions["instances"].to(self.cpu_device)
                vis_frame = video_visualizer.draw_instance_predictions(rgb_frame, instances)
            elif "sem_seg" in predictions:
                class_map = predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
                vis_frame = video_visualizer.draw_sem_seg(rgb_frame, class_map)

            # Converts Matplotlib RGB format to OpenCV BGR format
            return cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)

        frame_gen = self._frame_from_video(video)
        if not self.parallel:
            for frame in frame_gen:
                yield process_predictions(frame, self.predictor(frame))
            return

        # Parallel mode: keep submitted frames queued so each one can be paired
        # with its result once the predictor hands it back.
        buffer_size = self.predictor.default_buffer_size
        pending = deque()

        def render_oldest():
            oldest = pending.popleft()
            return process_predictions(oldest, self.predictor.get())

        for cnt, frame in enumerate(frame_gen):
            pending.append(frame)
            self.predictor.put(frame)
            if cnt >= buffer_size:
                yield render_oldest()

        # Drain whatever is still in flight after the video ends.
        while len(pending):
            yield render_oldest()
class AsyncPredictor:
    """
    A predictor that runs the model asynchronously, possibly on >1 GPUs.
    Because rendering the visualization takes a considerable amount of time,
    this helps improve throughput a little bit when rendering videos.
    """

    class _StopToken:
        """Marker put on the task queue to tell a worker to exit."""

    class _PredictWorker(mp.Process):
        """Process that builds its own predictor and serves tasks from a queue."""

        def __init__(self, cfg, task_queue, result_queue):
            self.cfg = cfg
            self.task_queue = task_queue
            self.result_queue = result_queue
            super().__init__()

        def run(self):
            predictor = DefaultPredictor(self.cfg)
            task = self.task_queue.get()
            # Serve (idx, data) tasks until a stop token arrives.
            while not isinstance(task, AsyncPredictor._StopToken):
                idx, data = task
                self.result_queue.put((idx, predictor(data)))
                task = self.task_queue.get()

    def __init__(self, cfg, num_gpus: int = 1):
        """
        Args:
            cfg (CfgNode):
            num_gpus (int): if 0, will run on CPU
        """
        num_workers = max(num_gpus, 1)
        queue_capacity = num_workers * 3
        self.task_queue = mp.Queue(maxsize=queue_capacity)
        self.result_queue = mp.Queue(maxsize=queue_capacity)
        self.procs = []
        for gpuid in range(num_workers):
            # Each worker gets its own unfrozen copy with its device filled in.
            cfg = cfg.clone()
            cfg.defrost()
            cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
            worker = AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
            self.procs.append(worker)

        # Counters of submitted / requested items, plus a sorted buffer of
        # results that arrived ahead of their turn (ranks and data in lockstep).
        self.put_idx = 0
        self.get_idx = 0
        self.result_rank = []
        self.result_data = []

        for worker in self.procs:
            worker.start()
        atexit.register(self.shutdown)

    def put(self, image):
        """Submit ``image`` for prediction, tagged with the next sequence index."""
        self.put_idx += 1
        self.task_queue.put((self.put_idx, image))

    def get(self):
        """Return the next result in submission order, blocking until it arrives."""
        self.get_idx += 1  # the index needed for this request
        wanted = self.get_idx
        if self.result_rank and self.result_rank[0] == wanted:
            self.result_rank.pop(0)
            return self.result_data.pop(0)

        while True:
            # make sure the results are returned in the correct order
            idx, res = self.result_queue.get()
            if idx == wanted:
                return res
            slot = bisect.bisect(self.result_rank, idx)
            self.result_rank.insert(slot, idx)
            self.result_data.insert(slot, res)

    def __len__(self):
        """Number of submitted items whose result has not been requested yet."""
        return self.put_idx - self.get_idx

    def __call__(self, image):
        """Submit ``image`` and return the next result in submission order."""
        self.put(image)
        return self.get()

    def shutdown(self):
        """Queue one stop token per worker."""
        for _ in self.procs:
            self.task_queue.put(AsyncPredictor._StopToken())

    @property
    def default_buffer_size(self):
        """Five frames per worker process."""
        return len(self.procs) * 5
================================================
FILE: demo_video/README.md
================================================
## Video Mask2Former Demo
We provide a command line tool to run a simple demo of builtin configs.
The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md).
================================================
FILE: demo_video/demo.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py
import argparse
import glob
import multiprocessing as mp
import os
# fmt: off
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# fmt: on
import tempfile
import time
import warnings
import cv2
import numpy as np
import tqdm
from torch.cuda.amp import autocast
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.utils.logger import setup_logger
from mask2former import add_maskformer2_config
from mask2former_video import add_maskformer2_video_config
from predictor import VisualizationDemo
import imageio
# constants
WINDOW_NAME = "mask2former video demo"
def setup_cfg(args):
    """Build the frozen config for the video demo.

    Args:
        args: parsed command-line arguments; ``args.config_file`` is merged
            first, then the ``args.opts`` overrides.

    Returns:
        The config node, frozen after merging.
    """
    # Config keys must be registered before the file is merged.
    registrars = (
        add_deeplab_config,
        add_maskformer2_config,
        add_maskformer2_video_config,
    )
    video_cfg = get_cfg()
    for register in registrars:
        register(video_cfg)
    video_cfg.merge_from_file(args.config_file)
    video_cfg.merge_from_list(args.opts)
    video_cfg.freeze()
    return video_cfg
def get_parser():
    """Build the command-line parser for the video demo.

    Returns:
        argparse.ArgumentParser: parser exposing ``--config-file``,
        ``--video-input``, ``--input``, ``--output``, ``--save-frames``,
        ``--confidence-threshold`` and ``--opts``.
    """
    parser = argparse.ArgumentParser(description="maskformer2 demo for builtin configs")
    parser.add_argument(
        "--config-file",
        default="configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--video-input", help="Path to video file.")
    parser.add_argument(
        "--input",
        nargs="+",
        # The script reads only the first value and treats it as a directory of
        # per-video sub-directories, so the help now says so.
        help="Path to a directory whose sub-directories each contain the frames "
        "of one video; only the first value given is used.",
    )
    parser.add_argument(
        "--output",
        help="A file or directory to save output visualizations. "
        "If not given, will show output in an OpenCV window.",
    )
    parser.add_argument(
        "--save-frames",
        # Without a type, "--save-frames False" used to yield the truthy string
        # "False". Accept a bare flag or an explicit boolean word instead.
        nargs="?",
        const=True,
        default=False,
        type=_parse_bool_flag,
        help="Save frame level image outputs.",
    )
    parser.add_argument(
        "--confidence-threshold",
        type=float,
        default=0.5,
        help="Minimum score for instance predictions to be shown",
    )
    parser.add_argument(
        "--opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=[],
        # Everything after --opts is collected verbatim as config overrides.
        nargs=argparse.REMAINDER,
    )
    return parser


def _parse_bool_flag(value):
    """Convert a command-line word such as ``"true"`` or ``"0"`` to a bool.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean word.
    """
    if isinstance(value, bool):
        return value
    word = str(value).strip().lower()
    if word in ("1", "true", "t", "yes", "y", "on"):
        return True
    if word in ("0", "false", "f", "no", "n", "off", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(value))
def test_opencv_video_format(codec, file_ext):
    """Check whether OpenCV produces a file for the given codec/extension pair.

    Thirty black 10x10 frames are written to a file inside a temporary directory.

    Args:
        codec (str): four-character codec code passed to ``VideoWriter_fourcc``.
        file_ext (str): extension (with leading dot) appended to the probe file name.

    Returns:
        bool: whether the probe file exists once the writer has been released.
    """
    num_probe_frames = 30
    with tempfile.TemporaryDirectory(prefix="video_format_test") as scratch_dir:
        target = os.path.join(scratch_dir, "test_file" + file_ext)
        fourcc = cv2.VideoWriter_fourcc(*codec)
        writer = cv2.VideoWriter(
            filename=target,
            fourcc=fourcc,
            fps=float(30),
            frameSize=(10, 10),
            isColor=True,
        )
        for _ in range(num_probe_frames):
            writer.write(np.zeros((10, 10, 3), np.uint8))
        writer.release()
        # Evaluate while the temporary directory still exists.
        written = os.path.isfile(target)
        return written
if __name__ == "__main__":
    # Entry point of the video demo. --input is a directory holding one
    # sub-directory of frames per video; --video-input is a single video file.
    mp.set_start_method("spawn", force=True)
    args = get_parser().parse_args()
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    demo = VisualizationDemo(cfg)

    if args.output:
        os.makedirs(args.output, exist_ok=True)

    if args.input:
        print('args input:', args.input)
        # Only the first --input value is used.
        args.input = args.input[0]
        for file_name in os.listdir(args.input):
            # Build paths with os.path.join so the root needs no trailing "/".
            video_dir = os.path.join(args.input, file_name)
            if not os.path.isdir(video_dir):
                # Stray files next to the video folders are ignored, not fatal.
                continue
            input_path_list = sorted(os.path.join(video_dir, f) for f in os.listdir(video_dir))
            print('input path list:', input_path_list)
            if len(input_path_list) == 0:
                continue

            vid_frames = [read_image(path, format="BGR") for path in input_path_list]

            start_time = time.time()
            with autocast():
                predictions, visualized_output = demo.run_on_video(
                    vid_frames, args.confidence_threshold
                )
            logger.info(
                "detected {} instances per frame in {:.2f}s".format(
                    len(predictions["pred_scores"]), time.time() - start_time
                )
            )

            if args.output:
                if args.save_frames:
                    # Create the same directory the frames are saved into below.
                    frames_dir = os.path.join(args.output, file_name)
                    os.makedirs(frames_dir, exist_ok=True)
                    print('save frames')
                    for path, _vis_output in zip(input_path_list, visualized_output):
                        out_filename = os.path.join(frames_dir, os.path.basename(path))
                        _vis_output.save(out_filename)

                # One GIF per video, written inside the output directory.
                images = [_vis_output.get_image() for _vis_output in visualized_output]
                imageio.mimsave(os.path.join(args.output, file_name + ".gif"), images, fps=5)

    elif args.video_input:
        video = cv2.VideoCapture(args.video_input)
        vid_frames = []
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            vid_frames.append(frame)
        # All frames are in memory now; release the capture handle.
        video.release()
        if not vid_frames:
            raise RuntimeError("Could not read any frames from {}".format(args.video_input))

        start_time = time.time()
        with autocast():
            # run_on_video requires the threshold; it was missing in this branch.
            predictions, visualized_output = demo.run_on_video(
                vid_frames, args.confidence_threshold
            )
        logger.info(
            "detected {} instances per frame in {:.2f}s".format(
                len(predictions["pred_scores"]), time.time() - start_time
            )
        )

        if args.output:
            if args.save_frames:
                for idx, _vis_output in enumerate(visualized_output):
                    out_filename = os.path.join(args.output, f"{idx}.jpg")
                    _vis_output.save(out_filename)

            H, W = visualized_output[0].height, visualized_output[0].width

            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            out = cv2.VideoWriter(
                os.path.join(args.output, "visualization.mp4"), fourcc, 10.0, (W, H), True
            )
            for _vis_output in visualized_output:
                # Rendered images are RGB; the video writer expects BGR.
                frame = _vis_output.get_image()[:, :, ::-1]
                out.write(frame)
            out.release()
================================================
FILE: demo_video/predictor.py
================================================
# reference: https://github.com/sukjunhwang/IFC/blob/master/projects/IFC/demo/predictor.py
import atexit
import bisect
import multiprocessing as mp
from collections import deque
import cv2
import torch
from visualizer import TrackVisualizer
from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.structures import Instances
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode
class VisualizationDemo(object):
    """Runs a video predictor on a clip and renders the tracked instances per frame."""

    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        """
        Args:
            cfg (CfgNode): config used to build the predictor. The first entry
                of ``cfg.DATASETS.TEST`` (or ``"__unused"`` when it is empty)
                selects the metadata.
            instance_mode (ColorMode): forwarded to the visualizer.
            parallel (bool): whether to run the model in different processes from
                visualization. Useful since the visualization logic can be slow.
        """
        self.metadata = MetadataCatalog.get(
            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
        )
        self.cpu_device = torch.device("cpu")
        self.instance_mode = instance_mode

        self.parallel = parallel
        if parallel:
            num_gpu = torch.cuda.device_count()
            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
        else:
            self.predictor = VideoPredictor(cfg)

    def run_on_video(self, frames, conf_thre=0.5):
        """
        Args:
            frames (List[np.ndarray]): a list of images of shape (H, W, C) (in BGR order).
                This is the format used by OpenCV.
            conf_thre (float): instances scoring below this value are not drawn.
                Defaults to 0.5 so that callers which omit it keep working.

        Returns:
            predictions (dict): the unfiltered output of the model.
            total_vis_output (List[VisImage]): one visualized image per input frame.
        """
        predictions = self.predictor(frames)

        image_size = predictions["image_size"]
        scores = predictions["pred_scores"]
        labels = predictions["pred_labels"]
        masks = predictions["pred_masks"]

        # Keep only the instances that reach the confidence threshold.
        keep = [idx for idx, score in enumerate(scores) if score >= conf_thre]
        pred_scores = [scores[idx] for idx in keep]
        pred_labels = [labels[idx] for idx in keep]
        pred_masks = [masks[idx] for idx in keep]

        # pred_masks is indexed [instance][frame]; transpose it to [frame][instance].
        frame_masks = list(zip(*pred_masks))

        total_vis_output = []
        for frame_idx, bgr_frame in enumerate(frames):
            # The visualizer works in RGB, so reverse the channel axis.
            frame = bgr_frame[:, :, ::-1]
            visualizer = TrackVisualizer(frame, self.metadata, instance_mode=self.instance_mode)
            ins = Instances(image_size)
            if len(pred_scores) > 0:
                print('pred scores:', pred_scores)
                ins.scores = pred_scores
                ins.pred_classes = pred_labels
                ins.pred_masks = torch.stack(frame_masks[frame_idx], dim=0)

            vis_output = visualizer.draw_instance_predictions(predictions=ins)
            total_vis_output.append(vis_output)

        return predictions, total_vis_output
class VideoPredictor(DefaultPredictor):
    """
    End-to-end predictor for a whole clip: takes a list of frames and runs the
    model once on all of them.

    It relies on ``self.model``, ``self.aug`` and ``self.input_format``, which
    are expected to be set up by ``DefaultPredictor.__init__``.

    Examples:
    ::
        pred = VideoPredictor(cfg)
        frames = [cv2.imread(p) for p in frame_paths]
        outputs = pred(frames)
    """

    def __call__(self, frames):
        """
        Args:
            frames (List[np.ndarray]): images of shape (H, W, C) (in BGR order),
                all belonging to the same video.

        Returns:
            predictions: the output of the model for the whole clip.

        Raises:
            ValueError: if ``frames`` is empty.
        """
        if len(frames) == 0:
            # Without this the code below failed on unbound height/width.
            raise ValueError("frames must contain at least one image")

        with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
            input_frames = []
            for original_image in frames:
                # Apply pre-processing to image.
                if self.input_format == "RGB":
                    # whether the model expects BGR inputs or RGB
                    original_image = original_image[:, :, ::-1]
                # The output size is taken from the frames; the last frame's
                # value is the one passed to the model.
                height, width = original_image.shape[:2]
                image = self.aug.get_transform(original_image).apply_image(original_image)
                # HWC -> CHW float32 tensor.
                image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
                input_frames.append(image)

            inputs = {"image": input_frames, "height": height, "width": width}
            predictions = self.model([inputs])
            return predictions
class AsyncPredictor:
    """
    A predictor that runs the model asynchronously, possibly on >1 GPUs.
    Because rendering the visualization takes a considerable amount of time,
    this helps improve throughput when rendering videos.
    """

    class _StopToken:
        """Marker object that asks a worker process to leave its loop."""

    class _PredictWorker(mp.Process):
        """Process that owns one ``VideoPredictor`` and answers queued tasks."""

        def __init__(self, cfg, task_queue, result_queue):
            self.cfg = cfg
            self.task_queue = task_queue
            self.result_queue = result_queue
            super().__init__()

        def run(self):
            predictor = VideoPredictor(self.cfg)
            stop_type = AsyncPredictor._StopToken
            while True:
                task = self.task_queue.get()
                if isinstance(task, stop_type):
                    return
                # Tasks are (sequence index, payload); echo the index back.
                idx, data = task
                self.result_queue.put((idx, predictor(data)))

    def __init__(self, cfg, num_gpus: int = 1):
        """
        Args:
            cfg (CfgNode):
            num_gpus (int): if 0, will run on CPU
        """
        num_workers = max(num_gpus, 1)
        self.task_queue = mp.Queue(maxsize=num_workers * 3)
        self.result_queue = mp.Queue(maxsize=num_workers * 3)

        use_cuda = num_gpus > 0
        self.procs = []
        for gpuid in range(num_workers):
            # Give every worker its own editable config with its device set.
            cfg = cfg.clone()
            cfg.defrost()
            cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if use_cuda else "cpu"
            self.procs.append(
                AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
            )

        # put_idx / get_idx count submissions and requests. result_rank and
        # result_data hold early results, sorted by index and kept in lockstep.
        self.put_idx = 0
        self.get_idx = 0
        self.result_rank = []
        self.result_data = []

        for proc in self.procs:
            proc.start()
        atexit.register(self.shutdown)

    def put(self, image):
        """Queue ``image`` for prediction under the next sequence index."""
        self.put_idx += 1
        self.task_queue.put((self.put_idx, image))

    def get(self):
        """Block until the next result in submission order is available and return it."""
        self.get_idx += 1  # the index needed for this request
        if len(self.result_rank) > 0 and self.result_rank[0] == self.get_idx:
            # The wanted result arrived earlier and is at the head of the buffer.
            buffered = self.result_data[0]
            del self.result_data[0], self.result_rank[0]
            return buffered

        while True:
            # make sure the results are returned in the correct order
            idx, res = self.result_queue.get()
            if idx == self.get_idx:
                return res
            # Not the one we need yet: file it in sorted position for later.
            position = bisect.bisect(self.result_rank, idx)
            self.result_rank.insert(position, idx)
            self.result_data.insert(position, res)

    def __len__(self):
        """Count of submitted items whose result has not been requested yet."""
        return self.put_idx - self.get_idx

    def __call__(self, image):
        """Submit ``image`` and return the next result in submission order."""
        self.put(image)
        return self.get()

    def shutdown(self):
        """Put one stop token on the task queue for each worker."""
        for _ in self.procs:
            self.task_queue.put(AsyncPredictor._StopToken())

    @property
    def default_buffer_size(self):
        """Five items per worker process."""
        return len(self.procs) * 5
================================================
FILE: demo_video/visualizer.py
================================================
# reference: https://github.com/sukjunhwang/IFC/blob/master/projects/IFC/demo/visualizer.py
import torch
import numpy as np
import matplotlib.colors as mplc
from detectron2.utils.visualizer import ColorMode, GenericMask, Visualizer, _create_text_labels
_ID_JITTERS = [[0.9047944201469568, 0.3241718265806123, 0.33443746665210006], [0.4590171386127151, 0.9095038146383864, 0.3143840671974788], [0.4769356899795538, 0.5044406738441948, 0.5354530846360839], [0.00820945625670777, 0.24099210193126785, 0.15471834055332978], [0.6195684374237388, 0.4020380013509799, 0.26100266066404676], [0.08281237756545068, 0.05900744492710419, 0.06106221202154216], [0.2264886829978755, 0.04925271007292076, 0.10214429345996079], [0.1888247470009874, 0.11275000298612425, 0.46112894830685514], [0.37415767691880975, 0.844284596118331, 0.950471611180866], [0.3817344218157631, 0.3483259270707101, 0.6572989333690541], [0.2403115731054466, 0.03078280287279167, 0.5385975692534737], [0.7035076951650824, 0.12352084932325424, 0.12873080308790197], [0.12607434914489934, 0.111244793010015, 0.09333334699716023], [0.6551607300342269, 0.7003064103554443, 0.4131794512286162], [0.13592107365596595, 0.5390702818232149, 0.004540643174930525], [0.38286244894454347, 0.709142545393449, 0.529074791609835], [0.4279376583651734, 0.5634708596431771, 0.8505569717104301], [0.3460488523902999, 0.464769595519293, 0.6676839675477276], [0.8544063246675081, 0.5041190233407755, 0.9081217697141578], [0.9207009090747208, 0.2403865944739051, 0.05375410999863772], [0.6515786136947107, 0.6299918449948327, 0.45292029442034387], [0.986174217295693, 0.2424849846977214, 0.3981993323108266], [0.22101915872994693, 0.3408589198278038, 0.006381420347677524], [0.3159785813515982, 0.1145748921741011, 0.595754317197274], [0.10263421488052715, 0.5864139253490858, 0.23908000741142432], [0.8272999391532938, 0.6123527260897751, 0.3365197327803193], [0.5269583712937912, 0.25668929554516506, 0.7888411215078127], [0.2433880265410031, 0.7240751234287827, 0.8483215810528648], [0.7254601709704898, 0.8316525547295984, 0.9325253855921963], [0.5574483824856672, 0.2935331727879944, 0.6594839453793155], [0.6209642371433579, 0.054030693198821256, 0.5080873988178534], [0.9055507077365624, 
0.12865888619203514, 0.9309191861440005], [0.9914469722960537, 0.3074114506206205, 0.8762107657323488], [0.4812682518247371, 0.15055826298548158, 0.9656340505308308], [0.6459219454316445, 0.9144794010251625, 0.751338812155106], [0.860840174209798, 0.8844626353077639, 0.3604624506769899], [0.8194991672032272, 0.926399617787601, 0.8059222327343247], [0.6540413175393658, 0.04579445254618297, 0.26891917826531275], [0.37778835833987046, 0.36247927666109536, 0.7989799305827889], [0.22738304978177726, 0.9038018263773739, 0.6970838854138303], [0.6362015495896184, 0.527680794236961, 0.5570915425178721], [0.6436401915860954, 0.6316925317144524, 0.9137151236993912], [0.04161828388587163, 0.3832413349082706, 0.6880829921949752], [0.7768167825719299, 0.8933821497682587, 0.7221278391266809], [0.8632760876301346, 0.3278628094906323, 0.8421587587114462], [0.8556499133262127, 0.6497385872901932, 0.5436895688477963], [0.9861940318610894, 0.03562313777386272, 0.9183454677106616], [0.8042586091176366, 0.6167222703170994, 0.24181981557207644], [0.9504247117633057, 0.3454233714011461, 0.6883727005547743], [0.9611909135491202, 0.46384154263898114, 0.32700443315058914], [0.523542176970206, 0.446222414615845, 0.9067402987747814], [0.7536954008682911, 0.6675512338797588, 0.22538238957839196], [0.1554052265688285, 0.05746097492966129, 0.8580358872587424], [0.8540838640971405, 0.9165504335482566, 0.6806982829158964], [0.7065090319405029, 0.8683059983962002, 0.05167128320624026], [0.39134812961899124, 0.8910075505622979, 0.7639815712623922], [0.1578117311479783, 0.20047326898284668, 0.9220177338840568], [0.2017488993096358, 0.6949259970936679, 0.8729196864798128], [0.5591089340651949, 0.15576770423813258, 0.1469857469387812], [0.14510398622626974, 0.24451497734532168, 0.46574271993578786], [0.13286397822351492, 0.4178244533944635, 0.03728728952131943], [0.556463206310225, 0.14027595183361663, 0.2731537988657907], [0.4093837966398032, 0.8015225687789814, 0.8033567296903834], [0.527442563956637, 
0.902232617214431, 0.7066626674362227], [0.9058355503297827, 0.34983989180213004, 0.8353262183839384], [0.7108382186953104, 0.08591307895133471, 0.21434688012521974], [0.22757345065207668, 0.7943075496583976, 0.2992305547627421], [0.20454109788173636, 0.8251670332103687, 0.012981987094547232], [0.7672562637297392, 0.005429019973062554, 0.022163616037108702], [0.37487345910117564, 0.5086240194440863, 0.9061216063654387], [0.9878004014101087, 0.006345852772772331, 0.17499753379350858], [0.030061528704491303, 0.1409704315546606, 0.3337131835834506], [0.5022506782611504, 0.5448435505388706, 0.40584238936140726], [0.39560774627423445, 0.8905943695833262, 0.5850815030921116], [0.058615671926786406, 0.5365713844300387, 0.1620457551256279], [0.41843842882069693, 0.1536005983609976, 0.3127878501592438], [0.05947621790155899, 0.5412421167331932, 0.2611322146455659], [0.5196159938235607, 0.7066461551682705, 0.970261497412556], [0.30443031606149007, 0.45158581060034975, 0.4331841153149706], [0.8848298403933996, 0.7241791700943656, 0.8917110054596072], [0.5720260591898779, 0.3072801598203052, 0.8891066705989902], [0.13964015336177327, 0.2531778096760302, 0.5703756837403124], [0.2156307542329836, 0.4139947500641685, 0.87051676884144], [0.10800455881891169, 0.05554646035458266, 0.2947027428551443], [0.35198009410633857, 0.365849666213808, 0.06525787683513773], [0.5223264108118847, 0.9032195574351178, 0.28579084943315025], [0.7607724246546966, 0.3087194381828555, 0.6253235528354899], [0.5060485442077824, 0.19173600467625274, 0.9931175692203702], [0.5131805830323746, 0.07719515392040577, 0.923212006754969], [0.3629762141280106, 0.02429179642710888, 0.6963754952399983], [0.7542592485456767, 0.6478893299494212, 0.3424965345400731], [0.49944574453364454, 0.6775665366832825, 0.33758796076989583], [0.010621818120767679, 0.8221571611173205, 0.5186257457566332], [0.5857910304290109, 0.7178133992025467, 0.9729243483606071], [0.16987399482717613, 0.9942570210657463, 0.18120758122552927], 
[0.016362572521240848, 0.17582788603087263, 0.7255176922640298], [0.10981764283706419, 0.9078582203470377, 0.7638063718334003], [0.9252097840441119, 0.3330197086990039, 0.27888705301420136], [0.12769972651171546, 0.11121470804891687, 0.12710743734391716], [0.5753520518360334, 0.2763862879599456, 0.6115636613363361]]
# Off-white RGB color given as floats (red and green at 1.0, blue at 240/255).
_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
class TrackVisualizer(Visualizer):
    """
    Visualizer subclass that overlays instance masks with a stable, per-instance color.

    Colors are derived from the class color in ``metadata.thing_colors`` shifted by a
    fixed offset looked up in ``_ID_JITTERS`` by instance index. Only masks (and
    keypoints, if given) are drawn: box and text-label drawing is intentionally disabled.
    """

    def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
        """
        Args:
            img_rgb, metadata, scale, instance_mode: forwarded unchanged to
                :class:`detectron2.utils.visualizer.Visualizer`.
        """
        super().__init__(
            img_rgb, metadata=metadata, scale=scale, instance_mode=instance_mode
        )
        self.cpu_device = torch.device("cpu")

    def _jitter(self, color, id):
        """
        Shift ``color`` by a fixed offset selected by ``id``, so that the same id always
        yields the same slightly different color.

        Args:
            color: any color accepted by ``matplotlib.colors.to_rgb``.
            id (int): instance index used to select the offset from ``_ID_JITTERS``.
                Indices beyond the table length wrap around.

        Returns:
            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
                color after being jittered. The values are clipped to the [0.0, 1.0] range.
        """
        color = mplc.to_rgb(color)
        # Wrap the index so large ids reuse offsets instead of raising IndexError.
        vec = np.asarray(_ID_JITTERS[id % len(_ID_JITTERS)])
        # better to do it in another color space
        vec = vec / np.linalg.norm(vec) * 0.5
        res = np.clip(vec + color, 0, 1)
        return tuple(res)

    def overlay_instances(
        self,
        *,
        boxes=None,
        labels=None,
        masks=None,
        keypoints=None,
        assigned_colors=None,
        alpha=0.5
    ):
        """
        Draw instance masks (and keypoints) on the image, largest instance first.

        Args:
            boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
                or a :class:`RotatedBoxes`,
                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
                for the N objects in a single image. Boxes are used for counting and ordering
                only; they are not drawn (Nx5 input is delegated to
                ``overlay_rotated_instances``).
            labels (list[str]): one text per instance. Validated and re-ordered, but not
                drawn, because label drawing is disabled in this visualizer.
            masks: a sequence of N binary masks. Each element must support ``.sum()`` and
                ``.astype(np.uint8)`` (e.g. a numpy array); no conversion from polygons or
                RLE is performed here.
            keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
                where the N is the number of instances and K is the number of keypoints.
                The last dimension corresponds to (x, y, visibility or score).
            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
                for full list of formats that the colors are accepted in. When omitted, a
                deterministic color per instance index is taken from ``_ID_JITTERS``.
            alpha (float): opacity used when drawing each mask.

        Returns:
            output (VisImage): image object with visualizations.
        """
        num_instances = 0
        if boxes is not None:
            boxes = self._convert_boxes(boxes)
            num_instances = len(boxes)
        if masks is not None:
            # Masks are consumed as given (binary arrays); they are not wrapped in GenericMask.
            if num_instances:
                assert len(masks) == num_instances
            else:
                num_instances = len(masks)
        if keypoints is not None:
            if num_instances:
                assert len(keypoints) == num_instances
            else:
                num_instances = len(keypoints)
            keypoints = self._convert_keypoints(keypoints)
        if labels is not None:
            assert len(labels) == num_instances
        if assigned_colors is None:
            # The previous fallback called `random_color`, which is not imported in this
            # module and raised NameError. Use an index-keyed color from the module's own
            # table instead (RGB floats in [0, 1]).
            assigned_colors = [
                tuple(_ID_JITTERS[ii % len(_ID_JITTERS)]) for ii in range(num_instances)
            ]
        if num_instances == 0:
            return self.output
        if boxes is not None and boxes.shape[1] == 5:
            return self.overlay_rotated_instances(
                boxes=boxes, labels=labels, assigned_colors=assigned_colors
            )

        # Display in largest to smallest order to reduce occlusion.
        areas = None
        if boxes is not None:
            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
        elif masks is not None:
            areas = np.asarray([x.sum() for x in masks])

        if areas is not None:
            sorted_idxs = np.argsort(-areas).tolist()
            # Re-order overlapped instances in descending order.
            boxes = boxes[sorted_idxs] if boxes is not None else None
            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
            masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
            keypoints = keypoints[sorted_idxs] if keypoints is not None else None

        # Only masks are rendered per instance; box and label drawing is disabled.
        if masks is not None:
            for i in range(num_instances):
                self.draw_binary_mask(
                    masks[i].astype(np.uint8),
                    color=assigned_colors[i],
                    edge_color=None,
                    alpha=alpha,
                )

        # draw keypoints
        if keypoints is not None:
            for keypoints_per_instance in keypoints:
                self.draw_and_connect_keypoints(keypoints_per_instance)

        return self.output

    def draw_instance_predictions(self, predictions):
        """
        Draw instance-level prediction results on an image.

        Args:
            predictions (Instances): the output of an instance detection/segmentation
                model. Following fields will be used to draw:
                "pred_boxes", "pred_classes", "scores", "pred_masks".

        Returns:
            output (VisImage): image object with visualizations. Returned unchanged when
                the predictions carry no "pred_classes" field.
        """
        preds = predictions.to(self.cpu_device)

        boxes = preds.pred_boxes if preds.has("pred_boxes") else None
        scores = preds.scores if preds.has("scores") else None
        classes = preds.pred_classes if preds.has("pred_classes") else None
        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
        if labels is not None:
            labels = ["[{}] ".format(_id) + text for _id, text in enumerate(labels)]

        # Masks are passed on as a plain array; overlay_instances expects binary arrays.
        masks = np.asarray(preds.pred_masks) if preds.has("pred_masks") else None

        if classes is None:
            return self.output

        # Class color (0-255 in metadata) scaled to [0, 1], then shifted per instance index.
        colors = [
            self._jitter([x / 255 for x in self.metadata.thing_colors[c]], instance_idx)
            for instance_idx, c in enumerate(classes)
        ]
        alpha = 0.5

        if self._instance_mode == ColorMode.IMAGE_BW:
            self.output.img = self._create_grayscale_image(
                (preds.pred_masks.any(dim=0) > 0).numpy()
                if preds.has("pred_masks")
                else None
            )
            alpha = 0.3

        self.overlay_instances(
            masks=masks,
            boxes=boxes,
            labels=labels,
            assigned_colors=colors,
            alpha=alpha,
        )

        return self.output
================================================
FILE: mask2former/__init__.py
================================================
from . import data # register all new datasets
from . import modeling
# config
from .config import add_maskformer2_config
# dataset loading
from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
from .data.dataset_mappers.mask_former_instance_dataset_mapper import (
MaskFormerInstanceDatasetMapper,
)
from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
MaskFormerPanopticDatasetMapper,
)
from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
MaskFormerSemanticDatasetMapper,
)
# models
from .maskformer_model import MaskFormer
from .test_time_augmentation import SemanticSegmentorWithTTA
# evaluation
from .evaluation.instance_evaluation import InstanceSegEvaluator
================================================
FILE: mask2former/config.py
================================================
# -*- coding: utf-8 -*-
from detectron2.config import CfgNode as CN
def add_maskformer2_config(cfg):
    """
    Add config for MASK_FORMER.

    Mutates ``cfg`` in place: extends the existing INPUT, SOLVER and
    MODEL.SEM_SEG_HEAD nodes and creates the MODEL.MASK_FORMER,
    MODEL.MASK_FORMER.TEST and MODEL.SWIN nodes with their default values.
    """
    # NOTE: configs from original maskformer
    # ---- data config ----
    input_cfg = cfg.INPUT
    # select the dataset mapper
    input_cfg.DATASET_MAPPER_NAME = "mask_former_semantic"
    # Color augmentation
    input_cfg.COLOR_AUG_SSD = False
    # We retry random cropping until no single category in semantic segmentation GT occupies more
    # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
    input_cfg.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
    # Pad image and segmentation GT in dataset mapper.
    input_cfg.SIZE_DIVISIBILITY = -1

    # ---- solver config ----
    solver_cfg = cfg.SOLVER
    # weight decay on embedding
    solver_cfg.WEIGHT_DECAY_EMBED = 0.0
    # optimizer
    solver_cfg.OPTIMIZER = "ADAMW"
    solver_cfg.BACKBONE_MULTIPLIER = 0.1

    # ---- mask_former model config ----
    model_cfg = cfg.MODEL
    model_cfg.MASK_FORMER = CN()
    mask_former = model_cfg.MASK_FORMER

    # loss
    mask_former.DEEP_SUPERVISION = True
    mask_former.NO_OBJECT_WEIGHT = 0.1
    mask_former.CLASS_WEIGHT = 1.0
    mask_former.DICE_WEIGHT = 1.0
    mask_former.MASK_WEIGHT = 20.0

    # transformer config
    mask_former.NHEADS = 8
    mask_former.DROPOUT = 0.1
    mask_former.DIM_FEEDFORWARD = 2048
    mask_former.ENC_LAYERS = 0
    mask_former.DEC_LAYERS = 6
    mask_former.PRE_NORM = False

    mask_former.HIDDEN_DIM = 256
    mask_former.NUM_OBJECT_QUERIES = 100

    mask_former.TRANSFORMER_IN_FEATURE = "res5"
    mask_former.ENFORCE_INPUT_PROJ = False

    # mask_former inference config
    mask_former.TEST = CN()
    test_cfg = mask_former.TEST
    test_cfg.SEMANTIC_ON = True
    test_cfg.INSTANCE_ON = False
    test_cfg.PANOPTIC_ON = False
    test_cfg.OBJECT_MASK_THRESHOLD = 0.0
    test_cfg.OVERLAP_THRESHOLD = 0.0
    test_cfg.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False

    # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
    # you can use this config to override
    mask_former.SIZE_DIVISIBILITY = 32

    # ---- pixel decoder config ----
    sem_seg_head = model_cfg.SEM_SEG_HEAD
    sem_seg_head.MASK_DIM = 256
    # adding transformer in pixel decoder
    sem_seg_head.TRANSFORMER_ENC_LAYERS = 0
    # pixel decoder
    sem_seg_head.PIXEL_DECODER_NAME = "BasePixelDecoder"

    # ---- swin transformer backbone ----
    model_cfg.SWIN = CN()
    swin = model_cfg.SWIN
    swin.PRETRAIN_IMG_SIZE = 224
    swin.PATCH_SIZE = 4
    swin.EMBED_DIM = 96
    swin.DEPTHS = [2, 2, 6, 2]
    swin.NUM_HEADS = [3, 6, 12, 24]
    swin.WINDOW_SIZE = 7
    swin.MLP_RATIO = 4.0
    swin.QKV_BIAS = True
    swin.QK_SCALE = None
    swin.DROP_RATE = 0.0
    swin.ATTN_DROP_RATE = 0.0
    swin.DROP_PATH_RATE = 0.3
    swin.APE = False
    swin.PATCH_NORM = True
    swin.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
    swin.USE_CHECKPOINT = False

    # NOTE: maskformer2 extra configs
    # transformer module
    mask_former.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder"

    # LSJ aug
    input_cfg.IMAGE_SIZE = 1024
    input_cfg.MIN_SCALE = 0.1
    input_cfg.MAX_SCALE = 2.0

    # MSDeformAttn encoder configs
    sem_seg_head.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
    sem_seg_head.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
    sem_seg_head.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8

    # point loss configs
    # Number of points sampled during training for a mask point head.
    mask_former.TRAIN_NUM_POINTS = 112 * 112
    # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
    # original paper.
    mask_former.OVERSAMPLE_RATIO = 3.0
    # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in
    # the original paper.
    mask_former.IMPORTANCE_SAMPLE_RATIO = 0.75
================================================
FILE: mask2former/data/__init__.py
================================================
from . import datasets
================================================
FILE: mask2former/data/dataset_mappers/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mask2former/data/dataset_mappers/__init__.py.new
================================================
================================================
FILE: mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging
import numpy as np
import torch
from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Instances
from pycocotools import mask as coco_mask
__all__ = ["COCOInstanceNewBaselineDatasetMapper"]
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
    """
    Replace every non-empty mask by the filled, axis-aligned bounding box around it.

    Despite the name, no box coordinates are returned: the result is a stack of
    box-shaped masks with the same shape as the input.

    Args:
        masks (Tensor[N, H, W]): masks to transform where N is the number of masks
            and (H, W) are the spatial dimensions. Non-zero entries are foreground.

    Returns:
        Tensor[N, H, W]: the input tensor itself, modified in place, where for each
        non-empty mask all pixels inside its tight bounding box (both extreme rows and
        columns included) are set to 1. Empty masks and empty inputs are left untouched.
    """
    if masks.numel() == 0:
        return masks

    for index, mask in enumerate(masks):
        y, x = torch.where(mask != 0)
        if y.numel() == 0:
            continue
        top, bottom = int(y.min()), int(y.max())
        left, right = int(x.min()), int(x.max())
        # Slice ends are exclusive, so add 1 to keep the last foreground row/column
        # inside the filled box (the former `min:max` slice dropped them).
        masks[index, top : bottom + 1, left : right + 1] = 1

    return masks
def convert_coco_poly_to_mask(segmentations, height, width):
    """
    Rasterize per-instance COCO polygons and pass the stacked masks through ``masks_to_boxes``.

    Args:
        segmentations: iterable with one entry per instance, each entry being the
            polygons of that instance as accepted by ``pycocotools.mask.frPyObjects``.
        height (int): output mask height.
        width (int): output mask width.

    Returns:
        Tensor: the result of ``masks_to_boxes`` on the (N, height, width) stack of
        instance masks, or an empty ``uint8`` tensor of shape (0, height, width) when
        there are no instances.
    """
    per_instance = []
    for polygons in segmentations:
        decoded = coco_mask.decode(coco_mask.frPyObjects(polygons, height, width))
        if decoded.ndim < 3:
            decoded = decoded[..., None]
        # Merge the per-polygon channels into a single mask for the instance.
        merged = torch.as_tensor(decoded, dtype=torch.uint8).any(dim=2)
        per_instance.append(merged)

    if not per_instance:
        return torch.zeros((0, height, width), dtype=torch.uint8)
    return masks_to_boxes(torch.stack(per_instance, dim=0))
def build_transform_gen(cfg, is_train):
    """
    Build the training augmentation list from ``cfg.INPUT``.

    The list holds an optional random flip (skipped when ``RANDOM_FLIP`` is "none"),
    followed by a random ``ResizeScale`` and a ``FixedSizeCrop`` to a square of
    ``IMAGE_SIZE``.

    Returns:
        list[Augmentation]
    """
    assert is_train, "Only support training augmentation"
    side = cfg.INPUT.IMAGE_SIZE
    scale_lo = cfg.INPUT.MIN_SCALE
    scale_hi = cfg.INPUT.MAX_SCALE

    pipeline = []
    flip_mode = cfg.INPUT.RANDOM_FLIP
    if flip_mode != "none":
        flip = T.RandomFlip(
            horizontal=flip_mode == "horizontal",
            vertical=flip_mode == "vertical",
        )
        pipeline.append(flip)

    pipeline.append(
        T.ResizeScale(
            min_scale=scale_lo, max_scale=scale_hi, target_height=side, target_width=side
        )
    )
    pipeline.append(T.FixedSizeCrop(crop_size=(side, side)))
    return pipeline
# This is specifically designed for the COCO dataset.
class COCOInstanceNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer.

    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors

    Instance masks are finally replaced by box-shaped masks via
    ``convert_coco_poly_to_mask``.
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
        )

        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        """Translate ``cfg`` into the keyword arguments of ``__init__``."""
        # Build augmentation
        tfm_gens = build_transform_gen(cfg, is_train)

        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg.INPUT.FORMAT,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept. Contains "image",
            "padding_mask" and, in training when annotations are present, "instances".
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        # TODO: get padding mask
        # by feeding a "segmentation mask" to the same transforms
        padding_mask = np.ones(image.shape[:2])

        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        # the crop transformation has default padding value 0 for segmentation
        padding_mask = transforms.apply_segmentation(padding_mask)
        # After inversion, True marks padded pixels.
        padding_mask = ~padding_mask.astype(bool)

        image_shape = image.shape[:2]  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))

        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            dataset_dict.pop("annotations", None)
            return dataset_dict

        if "annotations" in dataset_dict:
            # USER: Modify this if you want to keep them for some reason.
            for anno in dataset_dict["annotations"]:
                # Let's always keep mask
                anno.pop("keypoints", None)

            # USER: Implement additional transformations if you have other types of data
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in dataset_dict.pop("annotations")
                if obj.get("iscrowd", 0) == 0
            ]
            # NOTE: does not support BitMask due to augmentation
            # Current BitMask cannot handle empty objects
            instances = utils.annotations_to_instances(annos, image_shape)

            # `gt_masks` is absent when no annotation with a segmentation survives the
            # crowd filter above; guard every access so such images do not raise
            # AttributeError here.
            if instances.has("gt_masks"):
                # After transforms such as cropping are applied, the bounding box may no longer
                # tightly bound the object. As an example, imagine a triangle object
                # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
                # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
                # the intersection of original bounding box and the cropping box.
                instances.gt_boxes = instances.gt_masks.get_bounding_boxes()

            # Need to filter empty instances first (due to augmentation)
            instances = utils.filter_empty_instances(instances)

            # Generate (box-shaped) masks from polygon
            h, w = instances.image_size
            if instances.has("gt_masks"):
                instances.gt_masks = convert_coco_poly_to_mask(instances.gt_masks.polygons, h, w)
            dataset_dict["instances"] = instances

        return dataset_dict
================================================
FILE: mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging
import numpy as np
import torch
from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Boxes, Instances
__all__ = ["COCOPanopticNewBaselineDatasetMapper"]
def build_transform_gen(cfg, is_train):
    """
    Assemble the list of training augmentations described by ``cfg.INPUT``.

    An optional random flip comes first (omitted when ``RANDOM_FLIP`` is "none"),
    then a random ``ResizeScale`` and a ``FixedSizeCrop`` to an
    ``IMAGE_SIZE`` x ``IMAGE_SIZE`` square.

    Returns:
        list[Augmentation]
    """
    assert is_train, "Only support training augmentation"
    target = cfg.INPUT.IMAGE_SIZE
    lowest_scale = cfg.INPUT.MIN_SCALE
    highest_scale = cfg.INPUT.MAX_SCALE

    steps = []
    flip_setting = cfg.INPUT.RANDOM_FLIP
    if flip_setting != "none":
        steps.append(
            T.RandomFlip(
                horizontal=flip_setting == "horizontal",
                vertical=flip_setting == "vertical",
            )
        )

    resize = T.ResizeScale(
        min_scale=lowest_scale,
        max_scale=highest_scale,
        target_height=target,
        target_width=target,
    )
    crop = T.FixedSizeCrop(crop_size=(target, target))
    steps.extend([resize, crop])
    return steps
# This is specifically designed for the COCO dataset.
class COCOPanopticNewBaselineDatasetMapper:
    """
    Callable that converts one Detectron2-format dataset dict into the input
    format used by MaskFormer for COCO panoptic segmentation.

    Steps performed per call:

    1. Read the image from "file_name"
    2. Apply the configured transforms to the image
    3. Apply the same transforms to the panoptic ground truth (training only)
    4. Convert image and per-segment masks to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logger = logging.getLogger(__name__)
        logger.info(
            "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
                str(self.tfm_gens)
            )
        )

        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        """Translate ``cfg`` into the keyword arguments of ``__init__``."""
        return {
            "is_train": is_train,
            "tfm_gens": build_transform_gen(cfg, is_train),
            "image_format": cfg.INPUT.FORMAT,
        }

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        record = copy.deepcopy(dataset_dict)  # the copy is modified below, the input is not
        img = utils.read_image(record["file_name"], format=self.img_format)
        utils.check_image_size(record, img)

        img, tfms = T.apply_transform_gens(self.tfm_gens, img)
        out_shape = img.shape[:2]  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        chw = np.ascontiguousarray(img.transpose(2, 0, 1))
        record["image"] = torch.as_tensor(chw)

        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            record.pop("annotations", None)
            return record

        if "pan_seg_file_name" not in record:
            return record

        pan_seg = utils.read_image(record.pop("pan_seg_file_name"), "RGB")
        segments = record["segments_info"]

        # apply the same transformation to panoptic segmentation
        pan_seg = tfms.apply_segmentation(pan_seg)

        from panopticapi.utils import rgb2id

        pan_seg = rgb2id(pan_seg)

        target = Instances(out_shape)
        kept_classes = []
        kept_masks = []
        for segment in segments:
            category = segment["category_id"]
            if segment["iscrowd"]:
                continue
            kept_classes.append(category)
            kept_masks.append(pan_seg == segment["id"])

        target.gt_classes = torch.tensor(np.array(kept_classes), dtype=torch.int64)
        if kept_masks:
            bitmasks = BitMasks(
                torch.stack(
                    [torch.from_numpy(np.ascontiguousarray(m.copy())) for m in kept_masks]
                )
            )
            target.gt_masks = bitmasks.tensor
            target.gt_boxes = bitmasks.get_bounding_boxes()
        else:
            # Some image does not have annotation (all ignored)
            target.gt_masks = torch.zeros((0, pan_seg.shape[-2], pan_seg.shape[-1]))
            target.gt_boxes = Boxes(torch.zeros((0, 4)))

        record["instances"] = target
        return record
================================================
FILE: mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py
================================================
import copy
import logging
import numpy as np
import pycocotools.mask as mask_util
import torch
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.projects.point_rend import ColorAugSSDTransform
from detectron2.structures import BitMasks, Instances, polygons_to_bitmask
__all__ = ["MaskFormerInstanceDatasetMapper"]
class MaskFormerInstanceDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer for instance segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Apply the configured augmentations to the image and its annotations
    3. Rasterize every non-crowd instance segmentation into a binary mask
    4. Convert image and masks to Tensors (padding them if requested)
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            size_divisibility: when > 0, the image and masks are padded on the
                right/bottom by ``size_divisibility - width`` and
                ``size_divisibility - height`` pixels respectively.
        """
        self.is_train = is_train
        self.tfm_gens = augmentations
        self.img_format = image_format
        self.size_divisibility = size_divisibility

        logger = logging.getLogger(__name__)
        mode = "training" if is_train else "inference"
        logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")

    @classmethod
    def from_config(cls, cfg, is_train=True):
        """
        Translate a config node into the keyword arguments of ``__init__``.

        Args:
            cfg: config object exposing the ``INPUT.*`` options read below.
            is_train: forwarded unchanged into the returned dict.

        Returns:
            dict: keyword arguments for the constructor.
        """
        # Build augmentation: resize first, then the optional crop / color
        # jitter, and always finish with a random flip.
        augs = [
            T.ResizeShortestEdge(
                cfg.INPUT.MIN_SIZE_TRAIN,
                cfg.INPUT.MAX_SIZE_TRAIN,
                cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
            )
        ]
        if cfg.INPUT.CROP.ENABLED:
            augs.append(
                T.RandomCrop(
                    cfg.INPUT.CROP.TYPE,
                    cfg.INPUT.CROP.SIZE,
                )
            )
        if cfg.INPUT.COLOR_AUG_SSD:
            augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
        augs.append(T.RandomFlip())

        ret = {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
                Must contain "file_name" and "annotations".

        Returns:
            dict: a deep copy of the input with "annotations" removed and two
            keys added: "image" (the augmented image as a tensor, axes moved
            from (H, W, C) order to (C, H, W) order) and "instances" (an
            ``Instances`` holding ``gt_classes`` and ``gt_masks``).

        Raises:
            AssertionError: if the mapper was built with ``is_train=False`` or
                the input has no "annotations" key.
            ValueError: if a segmentation is not a polygon list, an RLE dict,
                or a 2D numpy array.
        """
        # The message previously named the panoptic mapper (copy/paste error).
        assert self.is_train, "MaskFormerInstanceDatasetMapper should only be used for training!"

        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        aug_input = T.AugInput(image)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image

        # transform instance masks
        assert "annotations" in dataset_dict
        for anno in dataset_dict["annotations"]:
            # Keypoints are not used by this mapper; drop them before transforming.
            anno.pop("keypoints", None)

        # Crowd annotations are skipped entirely.
        annos = [
            utils.transform_instance_annotations(obj, transforms, image.shape[:2])
            for obj in dataset_dict.pop("annotations")
            if obj.get("iscrowd", 0) == 0
        ]

        if len(annos):
            assert "segmentation" in annos[0]
        segms = [obj["segmentation"] for obj in annos]
        masks = []
        for segm in segms:
            if isinstance(segm, list):
                # polygon
                masks.append(polygons_to_bitmask(segm, *image.shape[:2]))
            elif isinstance(segm, dict):
                # COCO RLE
                masks.append(mask_util.decode(segm))
            elif isinstance(segm, np.ndarray):
                assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                    segm.ndim
                )
                # mask array
                masks.append(segm)
            else:
                raise ValueError(
                    "Cannot convert segmentation of type '{}' to BitMasks!"
                    "Supported types are: polygons as list[list[float] or ndarray],"
                    " COCO-style RLE as a dict, or a binary segmentation mask "
                    " in a 2D numpy array of shape HxW.".format(type(segm))
                )

        # Pad image and segmentation label here!
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks]

        classes = [int(obj["category_id"]) for obj in annos]
        classes = torch.tensor(classes, dtype=torch.int64)

        if self.size_divisibility > 0:
            image_size = (image.shape[-2], image.shape[-1])
            # F.pad order for the last two dims is [left, right, top, bottom]:
            # only the right and bottom edges are padded.
            # NOTE(review): despite the name, this pads up to exactly
            # size_divisibility per side rather than to the next multiple; it
            # presumably relies on the crop producing images no larger than
            # that value -- confirm against the configs.
            padding_size = [
                0,
                self.size_divisibility - image_size[1],
                0,
                self.size_divisibility - image_size[0],
            ]
            # pad image
            image = F.pad(image, padding_size, value=128).contiguous()
            # pad mask
            masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks]

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = image

        # Prepare per-instance binary masks
        instances = Instances(image_shape)
        instances.gt_classes = classes
        if len(masks) == 0:
            # Some image does not have annotation (all ignored)
            instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1]))
        else:
            masks = BitMasks(torch.stack(masks))
            instances.gt_masks = masks.tensor

        dataset_dict["instances"] = instances

        return dataset_dict
================================================
FILE: mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py
================================================
import copy
import logging
import numpy as np
import torch
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.structures import BitMasks, Instances
from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
__all__ = ["MaskFormerPanopticDatasetMapper"]
class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer for panoptic segmentation.

    The callable currently does the following:

    1. Read the image from "file_name" and the panoptic map from "pan_seg_file_name"
       (plus the semantic map from "sem_seg_file_name" when present)
    2. Apply the configured augmentations to the image and the label maps
    3. Convert everything to Tensors (padding them if requested)
    4. Build one binary mask per non-crowd segment listed in "segments_info"
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        ignore_label,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            ignore_label: the label that is ignored to evaluation
            size_divisibility: when > 0, the image and label maps are padded on
                the right/bottom by ``size_divisibility - width`` and
                ``size_divisibility - height`` pixels respectively.
        """
        super().__init__(
            is_train,
            augmentations=augmentations,
            image_format=image_format,
            ignore_label=ignore_label,
            size_divisibility=size_divisibility,
        )

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
                Must contain "file_name", "pan_seg_file_name" and "segments_info";
                must not contain "annotations".

        Returns:
            dict: a deep copy of the input with the label file-name keys removed
            and these keys added: "image" (augmented image tensor), "sem_seg"
            (only when a semantic map was provided) and "instances" (an
            ``Instances`` holding ``gt_classes`` and ``gt_masks``).

        Raises:
            AssertionError: if the mapper was built with ``is_train=False``.
            ValueError: if "pan_seg_file_name" is missing or "annotations" is present.
        """
        assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"

        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        # semantic segmentation (optional for this mapper)
        if "sem_seg_file_name" in dataset_dict:
            # PyTorch transformation not implemented for uint16, so converting it to double first
            sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
        else:
            sem_seg_gt = None

        # panoptic segmentation (required)
        if "pan_seg_file_name" in dataset_dict:
            pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
            segments_info = dataset_dict["segments_info"]
        else:
            pan_seg_gt = None
            segments_info = None

        if pan_seg_gt is None:
            raise ValueError(
                "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format(
                    dataset_dict["file_name"]
                )
            )

        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image
        if sem_seg_gt is not None:
            sem_seg_gt = aug_input.sem_seg

        # apply the same transformation to panoptic segmentation
        pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)

        # Imported here rather than at module level, so panopticapi is only
        # needed once this mapper is actually called.
        from panopticapi.utils import rgb2id

        # Collapse the RGB-encoded panoptic map into one segment id per pixel.
        pan_seg_gt = rgb2id(pan_seg_gt)

        # Pad image and segmentation label here!
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        if sem_seg_gt is not None:
            sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
        pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long"))

        if self.size_divisibility > 0:
            image_size = (image.shape[-2], image.shape[-1])
            # F.pad order for the last two dims is [left, right, top, bottom]:
            # only the right and bottom edges are padded.
            padding_size = [
                0,
                self.size_divisibility - image_size[1],
                0,
                self.size_divisibility - image_size[0],
            ]
            image = F.pad(image, padding_size, value=128).contiguous()
            if sem_seg_gt is not None:
                sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
            pan_seg_gt = F.pad(
                pan_seg_gt, padding_size, value=0
            ).contiguous()  # 0 is the VOID panoptic label

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = image
        if sem_seg_gt is not None:
            dataset_dict["sem_seg"] = sem_seg_gt.long()

        if "annotations" in dataset_dict:
            # Message previously read "Pemantic" (typo).
            raise ValueError("Panoptic segmentation dataset should not have 'annotations'.")

        # Prepare per-segment binary masks
        pan_seg_gt = pan_seg_gt.numpy()
        instances = Instances(image_shape)
        classes = []
        masks = []
        for segment_info in segments_info:
            class_id = segment_info["category_id"]
            # Crowd segments contribute neither a class nor a mask.
            if not segment_info["iscrowd"]:
                classes.append(class_id)
                masks.append(pan_seg_gt == segment_info["id"])

        classes = np.array(classes)
        instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
        if len(masks) == 0:
            # Some image does not have annotation (all ignored)
            instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
        else:
            masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
            )
            instances.gt_masks = masks.tensor

        dataset_dict["instances"] = instances

        return dataset_dict
================================================
FILE: mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py
================================================
import copy
import logging
import numpy as np
import torch
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.projects.point_rend import ColorAugSSDTransform
from detectron2.structures import BitMasks, Instances
__all__ = ["MaskFormerSemanticDatasetMapper"]
class MaskFormerSemanticDatasetMapper:
    """
    Callable that converts a dataset dict in Detectron2 Dataset format into
    the training input used by MaskFormer for semantic segmentation.

    Steps performed on each call:

    1. Load the image ("file_name") and the label map ("sem_seg_file_name")
    2. Run the configured augmentations on both
    3. Convert both to Tensors, padding them when size_divisibility > 0
    4. Split the label map into one binary mask per class present in it
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        ignore_label,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            ignore_label: label value excluded from the per-class masks and
                used to fill the padded region of the label map
            size_divisibility: when > 0, image and label map are padded on the
                right/bottom by ``size_divisibility - width`` and
                ``size_divisibility - height`` pixels respectively.
        """
        self.is_train = is_train
        self.tfm_gens = augmentations
        self.img_format = image_format
        self.ignore_label = ignore_label
        self.size_divisibility = size_divisibility

        mode = "training" if is_train else "inference"
        logging.getLogger(__name__).info(
            f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}"
        )

    @classmethod
    def from_config(cls, cfg, is_train=True):
        """
        Build the constructor keyword arguments from a config node.

        The ignore label is read from the metadata of the first entry of
        ``cfg.DATASETS.TRAIN``.
        """
        # Augmentation pipeline: resize, optional crop, optional color
        # jitter, then a random flip.
        pipeline = []
        pipeline.append(
            T.ResizeShortestEdge(
                cfg.INPUT.MIN_SIZE_TRAIN,
                cfg.INPUT.MAX_SIZE_TRAIN,
                cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
            )
        )
        if cfg.INPUT.CROP.ENABLED:
            crop = T.RandomCrop_CategoryAreaConstraint(
                cfg.INPUT.CROP.TYPE,
                cfg.INPUT.CROP.SIZE,
                cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
                cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
            )
            pipeline.append(crop)
        if cfg.INPUT.COLOR_AUG_SSD:
            pipeline.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
        pipeline.append(T.RandomFlip())

        # Assume always applies to the training set.
        train_names = cfg.DATASETS.TRAIN
        ignore_label = MetadataCatalog.get(train_names[0]).ignore_label

        return {
            "is_train": is_train,
            "augmentations": pipeline,
            "image_format": cfg.INPUT.FORMAT,
            "ignore_label": ignore_label,
            "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
        }

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!"

        # Work on a private copy; the code below mutates it.
        record = copy.deepcopy(dataset_dict)
        image = utils.read_image(record["file_name"], format=self.img_format)
        utils.check_image_size(record, image)

        # A semantic label map is mandatory for this mapper.
        if "sem_seg_file_name" not in record:
            raise ValueError(
                "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
                    record["file_name"]
                )
            )
        # PyTorch transformation not implemented for uint16, so converting it to double first
        sem_seg_gt = utils.read_image(record.pop("sem_seg_file_name")).astype("double")

        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
        aug_input, _ = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image
        sem_seg_gt = aug_input.sem_seg

        # Convert to tensors before padding.
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        if sem_seg_gt is not None:
            sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))

        if self.size_divisibility > 0:
            height, width = image.shape[-2], image.shape[-1]
            # [left, right, top, bottom]: pad the right and bottom edges only.
            pad = [0, self.size_divisibility - width, 0, self.size_divisibility - height]
            image = F.pad(image, pad, value=128).contiguous()
            if sem_seg_gt is not None:
                sem_seg_gt = F.pad(sem_seg_gt, pad, value=self.ignore_label).contiguous()

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Tensors travel through the dataloader via shared memory, whereas
        # generic Python structures go through pickle & mp.Queue, so the
        # outputs are stored as torch.Tensor.
        record["image"] = image

        if sem_seg_gt is not None:
            record["sem_seg"] = sem_seg_gt.long()

        if "annotations" in record:
            raise ValueError("Semantic segmentation dataset should not have 'annotations'.")

        # One binary mask per class present in the label map.
        if sem_seg_gt is not None:
            sem_seg_gt = sem_seg_gt.numpy()
            instances = Instances(image_shape)

            present = np.unique(sem_seg_gt)
            # remove ignored region
            present = present[present != self.ignore_label]
            instances.gt_classes = torch.tensor(present, dtype=torch.int64)

            per_class = [sem_seg_gt == class_id for class_id in present]

            if per_class:
                stacked = torch.stack(
                    [torch.from_numpy(np.ascontiguousarray(m.copy())) for m in per_class]
                )
                instances.gt_masks = BitMasks(stacked).tensor
            else:
                # Some image does not have annotation (all ignored)
                instances.gt_masks = torch.zeros(
                    (0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])
                )

            record["instances"] = instances

        return record
================================================
FILE: mask2former/data/datasets/__init__.py
================================================
from . import (
register_ade20k_full,
register_ade20k_panoptic,
register_coco_stuff_10k,
register_mapillary_vistas,
register_coco_panoptic_annos_semseg,
register_ade20k_instance,
register_mapillary_vistas_panoptic,
)
================================================
FILE: mask2former/data/datasets/register_ade20k_full.py
================================================
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import load_sem_seg
ADE20K_SEM_SEG_FULL_CATEGORIES = [
{"name": "wall", "id": 2978, "trainId": 0},
{"name": "building, edifice", "id": 312, "trainId": 1},
{"name": "sky", "id": 2420, "trainId": 2},
{"name": "tree", "id": 2855, "trainId": 3},
{"name": "road, route", "id": 2131, "trainId": 4},
{"name": "floor, flooring", "id": 976, "trainId": 5},
{"name": "ceiling", "id": 447, "trainId": 6},
{"name": "bed", "id": 165, "trainId": 7},
{"name": "sidewalk, pavement", "id": 2377, "trainId": 8},
{"name": "earth, ground", "id": 838, "trainId": 9},
{"name": "cabinet", "id": 350, "trainId": 10},
{"name": "person, individual, someone, somebody, mortal, soul", "id": 1831, "trainId": 11},
{"name": "grass", "id": 1125, "trainId": 12},
{"name": "windowpane, window", "id": 3055, "trainId": 13},
{"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14},
{"name": "mountain, mount", "id": 1610, "trainId": 15},
{"name": "plant, flora, plant life", "id": 1910, "trainId": 16},
{"name": "table", "id": 2684, "trainId": 17},
{"name": "chair", "id": 471, "trainId": 18},
{"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19},
{"name": "door", "id": 774, "trainId": 20},
{"name": "sofa, couch, lounge", "id": 2473, "trainId": 21},
{"name": "sea", "id": 2264, "trainId": 22},
{"name": "painting, picture", "id": 1735, "trainId": 23},
{"name": "water", "id": 2994, "trainId": 24},
{"name": "mirror", "id": 1564, "trainId": 25},
{"name": "house", "id": 1276, "trainId": 26},
{"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27},
{"name": "shelf", "id": 2329, "trainId": 28},
{"name": "armchair", "id": 57, "trainId": 29},
{"name": "fence, fencing", "id": 907, "trainId": 30},
{"name": "field", "id": 913, "trainId": 31},
{"name": "lamp", "id": 1395, "trainId": 32},
{"name": "rock, stone", "id": 2138, "trainId": 33},
{"name": "seat", "id": 2272, "trainId": 34},
{"name": "river", "id": 2128, "trainId": 35},
{"name": "desk", "id": 724, "trainId": 36},
{"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37},
{"name": "railing, rail", "id": 2053, "trainId": 38},
{"name": "signboard, sign", "id": 2380, "trainId": 39},
{"name": "cushion", "id": 689, "trainId": 40},
{"name": "path", "id": 1788, "trainId": 41},
{"name": "work surface", "id": 3087, "trainId": 42},
{"name": "stairs, steps", "id": 2530, "trainId": 43},
{"name": "column, pillar", "id": 581, "trainId": 44},
{"name": "sink", "id": 2388, "trainId": 45},
{"name": "wardrobe, closet, press", "id": 2985, "trainId": 46},
{"name": "snow", "id": 2454, "trainId": 47},
{"name": "refrigerator, icebox", "id": 2096, "trainId": 48},
{"name": "base, pedestal, stand", "id": 137, "trainId": 49},
{"name": "bridge, span", "id": 294, "trainId": 50},
{"name": "blind, screen", "id": 212, "trainId": 51},
{"name": "runway", "id": 2185, "trainId": 52},
{"name": "cliff, drop, drop-off", "id": 524, "trainId": 53},
{"name": "sand", "id": 2212, "trainId": 54},
{"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55},
{"name": "pillow", "id": 1869, "trainId": 56},
{"name": "screen door, screen", "id": 2251, "trainId": 57},
{"name": "toilet, can, commode, crapper, pot, potty, stool, throne", "id": 2793, "trainId": 58},
{"name": "skyscraper", "id": 2423, "trainId": 59},
{"name": "grandstand, covered stand", "id": 1121, "trainId": 60},
{"name": "box", "id": 266, "trainId": 61},
{"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62},
{"name": "palm, palm tree", "id": 1744, "trainId": 63},
{"name": "double door", "id": 783, "trainId": 64},
{"name": "coffee table, cocktail table", "id": 571, "trainId": 65},
{"name": "counter", "id": 627, "trainId": 66},
{"name": "countertop", "id": 629, "trainId": 67},
{"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68},
{"name": "kitchen island", "id": 1374, "trainId": 69},
{"name": "boat", "id": 223, "trainId": 70},
{"name": "waterfall, falls", "id": 3016, "trainId": 71},
{
"name": "stove, kitchen stove, range, kitchen range, cooking stove",
"id": 2598,
"trainId": 72,
},
{"name": "flower", "id": 978, "trainId": 73},
{"name": "bookcase", "id": 239, "trainId": 74},
{"name": "controls", "id": 608, "trainId": 75},
{"name": "book", "id": 236, "trainId": 76},
{"name": "stairway, staircase", "id": 2531, "trainId": 77},
{"name": "streetlight, street lamp", "id": 2616, "trainId": 78},
{
"name": "computer, computing machine, computing device, data processor, electronic computer, information processing system",
"id": 591,
"trainId": 79,
},
{
"name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle",
"id": 327,
"trainId": 80,
},
{"name": "swivel chair", "id": 2679, "trainId": 81},
{"name": "light, light source", "id": 1451, "trainId": 82},
{"name": "bench", "id": 181, "trainId": 83},
{"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84},
{"name": "towel", "id": 2821, "trainId": 85},
{"name": "fountain", "id": 1023, "trainId": 86},
{"name": "embankment", "id": 855, "trainId": 87},
{
"name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box",
"id": 2733,
"trainId": 88,
},
{"name": "van", "id": 2928, "trainId": 89},
{"name": "hill", "id": 1240, "trainId": 90},
{"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91},
{"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92},
{"name": "truck, motortruck", "id": 2880, "trainId": 93},
{"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94},
{"name": "pole", "id": 1936, "trainId": 95},
{"name": "tower", "id": 2828, "trainId": 96},
{"name": "court", "id": 631, "trainId": 97},
{"name": "ball", "id": 103, "trainId": 98},
{
"name": "aircraft carrier, carrier, flattop, attack aircraft carrier",
"id": 3144,
"trainId": 99,
},
{"name": "buffet, counter, sideboard", "id": 308, "trainId": 100},
{"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101},
{"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102},
{"name": "minibike, motorbike", "id": 1563, "trainId": 103},
{"name": "animal, animate being, beast, brute, creature, fauna", "id": 29, "trainId": 104},
{"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105},
{"name": "step, stair", "id": 2569, "trainId": 106},
{"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107},
{"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108},
{"name": "doorframe, doorcase", "id": 778, "trainId": 109},
{"name": "sconce", "id": 2243, "trainId": 110},
{"name": "pond", "id": 1941, "trainId": 111},
{"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112},
{"name": "bannister, banister, balustrade, balusters, handrail", "id": 120, "trainId": 113},
{"name": "bag", "id": 95, "trainId": 114},
{"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115},
{"name": "gazebo", "id": 1087, "trainId": 116},
{"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117},
{"name": "land, ground, soil", "id": 1401, "trainId": 118},
{"name": "board, plank", "id": 220, "trainId": 119},
{"name": "arcade machine", "id": 47, "trainId": 120},
{"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121},
{"name": "bar", "id": 123, "trainId": 122},
{"name": "stall, stand, sales booth", "id": 2537, "trainId": 123},
{"name": "playground", "id": 1927, "trainId": 124},
{"name": "ship", "id": 2337, "trainId": 125},
{"name": "ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126},
{
"name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
"id": 64,
"trainId": 127,
},
{"name": "bottle", "id": 249, "trainId": 128},
{"name": "cradle", "id": 642, "trainId": 129},
{"name": "pot, flowerpot", "id": 1981, "trainId": 130},
{
"name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
"id": 609,
"trainId": 131,
},
{"name": "train, railroad train", "id": 2840, "trainId": 132},
{"name": "stool", "id": 2586, "trainId": 133},
{"name": "lake", "id": 1393, "trainId": 134},
{"name": "tank, storage tank", "id": 2704, "trainId": 135},
{"name": "ice, water ice", "id": 1304, "trainId": 136},
{"name": "basket, handbasket", "id": 146, "trainId": 137},
{"name": "manhole", "id": 1494, "trainId": 138},
{"name": "tent, collapsible shelter", "id": 2739, "trainId": 139},
{"name": "canopy", "id": 389, "trainId": 140},
{"name": "microwave, microwave oven", "id": 1551, "trainId": 141},
{"name": "barrel, cask", "id": 131, "trainId": 142},
{"name": "dirt track", "id": 738, "trainId": 143},
{"name": "beam", "id": 161, "trainId": 144},
{"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145},
{"name": "plate", "id": 1919, "trainId": 146},
{"name": "screen, crt screen", "id": 3109, "trainId": 147},
{"name": "ruins", "id": 2179, "trainId": 148},
{"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149},
{"name": "blanket, cover", "id": 206, "trainId": 150},
{"name": "plaything, toy", "id": 1930, "trainId": 151},
{"name": "food, solid food", "id": 1002, "trainId": 152},
{"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153},
{"name": "oven", "id": 1708, "trainId": 154},
{"name": "stage", "id": 2526, "trainId": 155},
{"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156},
{"name": "umbrella", "id": 2901, "trainId": 157},
{"name": "sculpture", "id": 2262, "trainId": 158},
{"name": "aqueduct", "id": 44, "trainId": 159},
{"name": "container", "id": 597, "trainId": 160},
{"name": "scaffolding, staging", "id": 2235, "trainId": 161},
{"name": "hood, exhaust hood", "id": 1260, "trainId": 162},
{"name": "curb, curbing, kerb", "id": 682, "trainId": 163},
{"name": "roller coaster", "id": 2151, "trainId": 164},
{"name": "horse, equus caballus", "id": 3107, "trainId": 165},
{"name": "catwalk", "id": 432, "trainId": 166},
{"name": "glass, drinking glass", "id": 1098, "trainId": 167},
{"name": "vase", "id": 2932, "trainId": 168},
{"name": "central reservation", "id": 461, "trainId": 169},
{"name": "carousel", "id": 410, "trainId": 170},
{"name": "radiator", "id": 2046, "trainId": 171},
{"name": "closet", "id": 533, "trainId": 172},
{"name": "machine", "id": 1481, "trainId": 173},
{"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174},
{"name": "fan", "id": 894, "trainId": 175},
{"name": "inflatable bounce game", "id": 1322, "trainId": 176},
{"name": "pitch", "id": 1891, "trainId": 177},
{"name": "paper", "id": 1756, "trainId": 178},
{"name": "arcade, colonnade", "id": 49, "trainId": 179},
{"name": "hot tub", "id": 1272, "trainId": 180},
{"name": "helicopter", "id": 1229, "trainId": 181},
{"name": "tray", "id": 2850, "trainId": 182},
{"name": "partition, divider", "id": 1784, "trainId": 183},
{"name": "vineyard", "id": 2962, "trainId": 184},
{"name": "bowl", "id": 259, "trainId": 185},
{"name": "bullring", "id": 319, "trainId": 186},
{"name": "flag", "id": 954, "trainId": 187},
{"name": "pot", "id": 1974, "trainId": 188},
{"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189},
{"name": "shower", "id": 2356, "trainId": 190},
{"name": "bag, traveling bag, travelling bag, grip, suitcase", "id": 97, "trainId": 191},
{"name": "bulletin board, notice board", "id": 318, "trainId": 192},
{"name": "confessional booth", "id": 592, "trainId": 193},
{"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194},
{"name": "forest", "id": 1017, "trainId": 195},
{"name": "elevator door", "id": 851, "trainId": 196},
{"name": "laptop, laptop computer", "id": 1407, "trainId": 197},
{"name": "instrument panel", "id": 1332, "trainId": 198},
{"name": "bucket, pail", "id": 303, "trainId": 199},
{"name": "tapestry, tapis", "id": 2714, "trainId": 200},
{"name": "platform", "id": 1924, "trainId": 201},
{"name": "jacket", "id": 1346, "trainId": 202},
{"name": "gate", "id": 1081, "trainId": 203},
{"name": "monitor, monitoring device", "id": 1583, "trainId": 204},
{
"name": "telephone booth, phone booth, call box, telephone box, telephone kiosk",
"id": 2727,
"trainId": 205,
},
{"name": "spotlight, spot", "id": 2509, "trainId": 206},
{"name": "ring", "id": 2123, "trainId": 207},
{"name": "control panel", "id": 602, "trainId": 208},
{"name": "blackboard, chalkboard", "id": 202, "trainId": 209},
{"name": "air conditioner, air conditioning", "id": 10, "trainId": 210},
{"name": "chest", "id": 490, "trainId": 211},
{"name": "clock", "id": 530, "trainId": 212},
{"name": "sand dune", "id": 2213, "trainId": 213},
{"name": "pipe, pipage, piping", "id": 1884, "trainId": 214},
{"name": "vault", "id": 2934, "trainId": 215},
{"name": "table football", "id": 2687, "trainId": 216},
{"name": "cannon", "id": 387, "trainId": 217},
{"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218},
{"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219},
{"name": "statue", "id": 2547, "trainId": 220},
{
"name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
"id": 1474,
"trainId": 221,
},
{"name": "exhibitor", "id": 877, "trainId": 222},
{"name": "ladder", "id": 1391, "trainId": 223},
{"name": "carport", "id": 414, "trainId": 224},
{"name": "dam", "id": 698, "trainId": 225},
{"name": "pulpit", "id": 2019, "trainId": 226},
{"name": "skylight, fanlight", "id": 2422, "trainId": 227},
{"name": "water tower", "id": 3010, "trainId": 228},
{"name": "grill, grille, grillwork", "id": 1139, "trainId": 229},
{"name": "display board", "id": 753, "trainId": 230},
{"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231},
{"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232},
{"name": "ice rink", "id": 1301, "trainId": 233},
{"name": "fruit", "id": 1033, "trainId": 234},
{"name": "patio", "id": 1789, "trainId": 235},
{"name": "vending machine", "id": 2939, "trainId": 236},
{"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237},
{"name": "net", "id": 1652, "trainId": 238},
{
"name": "backpack, back pack, knapsack, packsack, rucksack, haversack",
"id": 90,
"trainId": 239,
},
{"name": "jar", "id": 1349, "trainId": 240},
{"name": "track", "id": 2830, "trainId": 241},
{"name": "magazine", "id": 1485, "trainId": 242},
{"name": "shutter", "id": 2370, "trainId": 243},
{"name": "roof", "id": 2155, "trainId": 244},
{"name": "banner, streamer", "id": 118, "trainId": 245},
{"name": "landfill", "id": 1402, "trainId": 246},
{"name": "post", "id": 1957, "trainId": 247},
{"name": "altarpiece, reredos", "id": 3130, "trainId": 248},
{"name": "hat, chapeau, lid", "id": 1197, "trainId": 249},
{"name": "arch, archway", "id": 52, "trainId": 250},
{"name": "table game", "id": 2688, "trainId": 251},
{"name": "bag, handbag, pocketbook, purse", "id": 96, "trainId": 252},
{"name": "document, written document, papers", "id": 762, "trainId": 253},
{"name": "dome", "id": 772, "trainId": 254},
{"name": "pier", "id": 1857, "trainId": 255},
{"name": "shanties", "id": 2315, "trainId": 256},
{"name": "forecourt", "id": 1016, "trainId": 257},
{"name": "crane", "id": 643, "trainId": 258},
{"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259},
{"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260},
{"name": "drawing", "id": 791, "trainId": 261},
{"name": "cabin", "id": 349, "trainId": 262},
{
"name": "ad, advertisement, advertizement, advertising, advertizing, advert",
"id": 6,
"trainId": 263,
},
{"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264},
{"name": "monument", "id": 1587, "trainId": 265},
{"name": "henhouse", "id": 1233, "trainId": 266},
{"name": "cockpit", "id": 559, "trainId": 267},
{"name": "heater, warmer", "id": 1223, "trainId": 268},
{"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269},
{"name": "pool", "id": 1943, "trainId": 270},
{"name": "elevator, lift", "id": 853, "trainId": 271},
{"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272},
{"name": "labyrinth", "id": 1390, "trainId": 273},
{"name": "text, textual matter", "id": 2748, "trainId": 274},
{"name": "printer", "id": 2007, "trainId": 275},
{"name": "mezzanine, first balcony", "id": 1546, "trainId": 276},
{"name": "mattress", "id": 1513, "trainId": 277},
{"name": "straw", "id": 2600, "trainId": 278},
{"name": "stalls", "id": 2538, "trainId": 279},
{"name": "patio, terrace", "id": 1790, "trainId": 280},
{"name": "billboard, hoarding", "id": 194, "trainId": 281},
{"name": "bus stop", "id": 326, "trainId": 282},
{"name": "trouser, pant", "id": 2877, "trainId": 283},
{"name": "console table, console", "id": 594, "trainId": 284},
{"name": "rack", "id": 2036, "trainId": 285},
{"name": "notebook", "id": 1662, "trainId": 286},
{"name": "shrine", "id": 2366, "trainId": 287},
{"name": "pantry", "id": 1754, "trainId": 288},
{"name": "cart", "id": 418, "trainId": 289},
{"name": "steam shovel", "id": 2553, "trainId": 290},
{"name": "porch", "id": 1951, "trainId": 291},
{"name": "postbox, mailbox, letter box", "id": 1963, "trainId": 292},
{"name": "figurine, statuette", "id": 918, "trainId": 293},
{"name": "recycling bin", "id": 2086, "trainId": 294},
{"name": "folding screen", "id": 997, "trainId": 295},
{"name": "telescope", "id": 2731, "trainId": 296},
{"name": "deck chair, beach chair", "id": 704, "trainId": 297},
{"name": "kennel", "id": 1365, "trainId": 298},
{"name": "coffee maker", "id": 569, "trainId": 299},
{"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300},
{"name": "fish", "id": 948, "trainId": 301},
{"name": "easel", "id": 839, "trainId": 302},
{"name": "artificial golf green", "id": 63, "trainId": 303},
{"name": "iceberg", "id": 1305, "trainId": 304},
{"name": "candlestick, candle holder", "id": 378, "trainId": 305},
{"name": "shower stall, shower bath", "id": 2362, "trainId": 306},
{"name": "television stand", "id": 2734, "trainId": 307},
{
"name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle",
"id": 2982,
"trainId": 308,
},
{"name": "skeleton", "id": 2398, "trainId": 309},
{"name": "grand piano, grand", "id": 1119, "trainId": 310},
{"name": "candy, confect", "id": 382, "trainId": 311},
{"name": "grille door", "id": 1141, "trainId": 312},
{"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313},
{"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314},
{"name": "shoe", "id": 2341, "trainId": 315},
{"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316},
{"name": "shanty", "id": 2316, "trainId": 317},
{"name": "structure", "id": 2626, "trainId": 318},
{"name": "rocking chair, rocker", "id": 3104, "trainId": 319},
{"name": "bird", "id": 198, "trainId": 320},
{"name": "place mat", "id": 1896, "trainId": 321},
{"name": "tomb", "id": 2800, "trainId": 322},
{"name": "big top", "id": 190, "trainId": 323},
{"name": "gas pump, gasoline pump, petrol pump, island dispenser", "id": 3131, "trainId": 324},
{"name": "lockers", "id": 1463, "trainId": 325},
{"name": "cage", "id": 357, "trainId": 326},
{"name": "finger", "id": 929, "trainId": 327},
{"name": "bleachers", "id": 209, "trainId": 328},
{"name": "ferris wheel", "id": 912, "trainId": 329},
{"name": "hairdresser chair", "id": 1164, "trainId": 330},
{"name": "mat", "id": 1509, "trainId": 331},
{"name": "stands", "id": 2539, "trainId": 332},
{"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333},
{"name": "streetcar, tram, tramcar, trolley, trolley car", "id": 2615, "trainId": 334},
{"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335},
{"name": "dummy", "id": 818, "trainId": 336},
{"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337},
{"name": "sand trap", "id": 2217, "trainId": 338},
{"name": "shop, store", "id": 2347, "trainId": 339},
{"name": "table cloth", "id": 2686, "trainId": 340},
{"name": "service station", "id": 2300, "trainId": 341},
{"name": "coffin", "id": 572, "trainId": 342},
{"name": "drawer", "id": 789, "trainId": 343},
{"name": "cages", "id": 358, "trainId": 344},
{"name": "slot machine, coin machine", "id": 2443, "trainId": 345},
{"name": "balcony", "id": 101, "trainId": 346},
{"name": "volleyball court", "id": 2969, "trainId": 347},
{"name": "table tennis", "id": 2692, "trainId": 348},
{"name": "control table", "id": 606, "trainId": 349},
{"name": "shirt", "id": 2339, "trainId": 350},
{"name": "merchandise, ware, product", "id": 1533, "trainId": 351},
{"name": "railway", "id": 2060, "trainId": 352},
{"name": "parterre", "id": 1782, "trainId": 353},
{"name": "chimney", "id": 495, "trainId": 354},
{"name": "can, tin, tin can", "id": 371, "trainId": 355},
{"name": "tanks", "id": 2707, "trainId": 356},
{"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357},
{"name": "alga, algae", "id": 3156, "trainId": 358},
{"name": "system", "id": 2683, "trainId": 359},
{"name": "map", "id": 1499, "trainId": 360},
{"name": "greenhouse", "id": 1135, "trainId": 361},
{"name": "mug", "id": 1619, "trainId": 362},
{"name": "barbecue", "id": 125, "trainId": 363},
{"name": "trailer", "id": 2838, "trainId": 364},
{"name": "toilet tissue, toilet paper, bathroom tissue", "id": 2792, "trainId": 365},
{"name": "organ", "id": 1695, "trainId": 366},
{"name": "dishrag, dishcloth", "id": 746, "trainId": 367},
{"name": "island", "id": 1343, "trainId": 368},
{"name": "keyboard", "id": 1370, "trainId": 369},
{"name": "trench", "id": 2858, "trainId": 370},
{"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371},
{"name": "steering wheel, wheel", "id": 2565, "trainId": 372},
{"name": "pitcher, ewer", "id": 1892, "trainId": 373},
{"name": "goal", "id": 1103, "trainId": 374},
{"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375},
{"name": "beds", "id": 170, "trainId": 376},
{"name": "wood", "id": 3073, "trainId": 377},
{"name": "file cabinet", "id": 922, "trainId": 378},
{"name": "newspaper, paper", "id": 1655, "trainId": 379},
{"name": "motorboat", "id": 1602, "trainId": 380},
{"name": "rope", "id": 2160, "trainId": 381},
{"name": "guitar", "id": 1151, "trainId": 382},
{"name": "rubble", "id": 2176, "trainId": 383},
{"name": "scarf", "id": 2239, "trainId": 384},
{"name": "barrels", "id": 132, "trainId": 385},
{"name": "cap", "id": 394, "trainId": 386},
{"name": "leaves", "id": 1424, "trainId": 387},
{"name": "control tower", "id": 607, "trainId": 388},
{"name": "dashboard", "id": 700, "trainId": 389},
{"name": "bandstand", "id": 116, "trainId": 390},
{"name": "lectern", "id": 1425, "trainId": 391},
{"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392},
{"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393},
{"name": "shower room", "id": 2360, "trainId": 394},
{"name": "smoke", "id": 2449, "trainId": 395},
{"name": "faucet, spigot", "id": 897, "trainId": 396},
{"name": "bulldozer", "id": 317, "trainId": 397},
{"name": "saucepan", "id": 2228, "trainId": 398},
{"name": "shops", "id": 2351, "trainId": 399},
{"name": "meter", "id": 1543, "trainId": 400},
{"name": "crevasse", "id": 656, "trainId": 401},
{"name": "gear", "id": 1088, "trainId": 402},
{"name": "candelabrum, candelabra", "id": 373, "trainId": 403},
{"name": "sofa bed", "id": 2472, "trainId": 404},
{"name": "tunnel", "id": 2892, "trainId": 405},
{"name": "pallet", "id": 1740, "trainId": 406},
{"name": "wire, conducting wire", "id": 3067, "trainId": 407},
{"name": "kettle, boiler", "id": 1367, "trainId": 408},
{"name": "bidet", "id": 188, "trainId": 409},
{
"name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher",
"id": 79,
"trainId": 410,
},
{"name": "music stand", "id": 1633, "trainId": 411},
{"name": "pipe, tube", "id": 1885, "trainId": 412},
{"name": "cup", "id": 677, "trainId": 413},
{"name": "parking meter", "id": 1779, "trainId": 414},
{"name": "ice hockey rink", "id": 1297, "trainId": 415},
{"name": "shelter", "id": 2334, "trainId": 416},
{"name": "weeds", "id": 3027, "trainId": 417},
{"name": "temple", "id": 2735, "trainId": 418},
{"name": "patty, cake", "id": 1791, "trainId": 419},
{"name": "ski slope", "id": 2405, "trainId": 420},
{"name": "panel", "id": 1748, "trainId": 421},
{"name": "wallet", "id": 2983, "trainId": 422},
{"name": "wheel", "id": 3035, "trainId": 423},
{"name": "towel rack, towel horse", "id": 2824, "trainId": 424},
{"name": "roundabout", "id": 2168, "trainId": 425},
{"name": "canister, cannister, tin", "id": 385, "trainId": 426},
{"name": "rod", "id": 2148, "trainId": 427},
{"name": "soap dispenser", "id": 2465, "trainId": 428},
{"name": "bell", "id": 175, "trainId": 429},
{"name": "canvas", "id": 390, "trainId": 430},
{"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431},
{"name": "teacup", "id": 2722, "trainId": 432},
{"name": "trellis", "id": 2857, "trainId": 433},
{"name": "workbench", "id": 3088, "trainId": 434},
{"name": "valley, vale", "id": 2926, "trainId": 435},
{"name": "toaster", "id": 2782, "trainId": 436},
{"name": "knife", "id": 1378, "trainId": 437},
{"name": "podium", "id": 1934, "trainId": 438},
{"name": "ramp", "id": 2072, "trainId": 439},
{"name": "tumble dryer", "id": 2889, "trainId": 440},
{"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441},
{"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442},
{"name": "lab bench", "id": 1383, "trainId": 443},
{"name": "equipment", "id": 867, "trainId": 444},
{"name": "rocky formation", "id": 2145, "trainId": 445},
{"name": "plastic", "id": 1915, "trainId": 446},
{"name": "calendar", "id": 361, "trainId": 447},
{"name": "caravan", "id": 402, "trainId": 448},
{"name": "check-in-desk", "id": 482, "trainId": 449},
{"name": "ticket counter", "id": 2761, "trainId": 450},
{"name": "brush", "id": 300, "trainId": 451},
{"name": "mill", "id": 1554, "trainId": 452},
{"name": "covered bridge", "id": 636, "trainId": 453},
{"name": "bowling alley", "id": 260, "trainId": 454},
{"name": "hanger", "id": 1186, "trainId": 455},
{"name": "excavator", "id": 871, "trainId": 456},
{"name": "trestle", "id": 2859, "trainId": 457},
{"name": "revolving door", "id": 2103, "trainId": 458},
{"name": "blast furnace", "id": 208, "trainId": 459},
{"name": "scale, weighing machine", "id": 2236, "trainId": 460},
{"name": "projector", "id": 2012, "trainId": 461},
{"name": "soap", "id": 2462, "trainId": 462},
{"name": "locker", "id": 1462, "trainId": 463},
{"name": "tractor", "id": 2832, "trainId": 464},
{"name": "stretcher", "id": 2617, "trainId": 465},
{"name": "frame", "id": 1024, "trainId": 466},
{"name": "grating", "id": 1129, "trainId": 467},
{"name": "alembic", "id": 18, "trainId": 468},
{"name": "candle, taper, wax light", "id": 376, "trainId": 469},
{"name": "barrier", "id": 134, "trainId": 470},
{"name": "cardboard", "id": 407, "trainId": 471},
{"name": "cave", "id": 434, "trainId": 472},
{"name": "puddle", "id": 2017, "trainId": 473},
{"name": "tarp", "id": 2717, "trainId": 474},
{"name": "price tag", "id": 2005, "trainId": 475},
{"name": "watchtower", "id": 2993, "trainId": 476},
{"name": "meters", "id": 1545, "trainId": 477},
{
"name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb",
"id": 1445,
"trainId": 478,
},
{"name": "tracks", "id": 2831, "trainId": 479},
{"name": "hair dryer", "id": 1161, "trainId": 480},
{"name": "skirt", "id": 2411, "trainId": 481},
{"name": "viaduct", "id": 2949, "trainId": 482},
{"name": "paper towel", "id": 1769, "trainId": 483},
{"name": "coat", "id": 552, "trainId": 484},
{"name": "sheet", "id": 2327, "trainId": 485},
{"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486},
{"name": "water wheel", "id": 3013, "trainId": 487},
{"name": "pottery, clayware", "id": 1986, "trainId": 488},
{"name": "magazine rack", "id": 1486, "trainId": 489},
{"name": "teapot", "id": 2723, "trainId": 490},
{"name": "microphone, mike", "id": 1549, "trainId": 491},
{"name": "support", "id": 2649, "trainId": 492},
{"name": "forklift", "id": 1020, "trainId": 493},
{"name": "canyon", "id": 392, "trainId": 494},
{"name": "cash register, register", "id": 422, "trainId": 495},
{"name": "leaf, leafage, foliage", "id": 1419, "trainId": 496},
{"name": "remote control, remote", "id": 2099, "trainId": 497},
{"name": "soap dish", "id": 2464, "trainId": 498},
{"name": "windshield, windscreen", "id": 3058, "trainId": 499},
{"name": "cat", "id": 430, "trainId": 500},
{"name": "cue, cue stick, pool cue, pool stick", "id": 675, "trainId": 501},
{"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502},
{"name": "videos", "id": 2955, "trainId": 503},
{"name": "shovel", "id": 2355, "trainId": 504},
{"name": "eaves", "id": 840, "trainId": 505},
{"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506},
{"name": "shipyard", "id": 2338, "trainId": 507},
{"name": "hen, biddy", "id": 1232, "trainId": 508},
{"name": "traffic cone", "id": 2834, "trainId": 509},
{"name": "washing machines", "id": 2991, "trainId": 510},
{"name": "truck crane", "id": 2879, "trainId": 511},
{"name": "cds", "id": 444, "trainId": 512},
{"name": "niche", "id": 1657, "trainId": 513},
{"name": "scoreboard", "id": 2246, "trainId": 514},
{"name": "briefcase", "id": 296, "trainId": 515},
{"name": "boot", "id": 245, "trainId": 516},
{"name": "sweater, jumper", "id": 2661, "trainId": 517},
{"name": "hay", "id": 1202, "trainId": 518},
{"name": "pack", "id": 1714, "trainId": 519},
{"name": "bottle rack", "id": 251, "trainId": 520},
{"name": "glacier", "id": 1095, "trainId": 521},
{"name": "pergola", "id": 1828, "trainId": 522},
{"name": "building materials", "id": 311, "trainId": 523},
{"name": "television camera", "id": 2732, "trainId": 524},
{"name": "first floor", "id": 947, "trainId": 525},
{"name": "rifle", "id": 2115, "trainId": 526},
{"name": "tennis table", "id": 2738, "trainId": 527},
{"name": "stadium", "id": 2525, "trainId": 528},
{"name": "safety belt", "id": 2194, "trainId": 529},
{"name": "cover", "id": 634, "trainId": 530},
{"name": "dish rack", "id": 740, "trainId": 531},
{"name": "synthesizer", "id": 2682, "trainId": 532},
{"name": "pumpkin", "id": 2020, "trainId": 533},
{"name": "gutter", "id": 1156, "trainId": 534},
{"name": "fruit stand", "id": 1036, "trainId": 535},
{"name": "ice floe, floe", "id": 1295, "trainId": 536},
{"name": "handle, grip, handgrip, hold", "id": 1181, "trainId": 537},
{"name": "wheelchair", "id": 3037, "trainId": 538},
{"name": "mousepad, mouse mat", "id": 1614, "trainId": 539},
{"name": "diploma", "id": 736, "trainId": 540},
{"name": "fairground ride", "id": 893, "trainId": 541},
{"name": "radio", "id": 2047, "trainId": 542},
{"name": "hotplate", "id": 1274, "trainId": 543},
{"name": "junk", "id": 1361, "trainId": 544},
{"name": "wheelbarrow", "id": 3036, "trainId": 545},
{"name": "stream", "id": 2606, "trainId": 546},
{"name": "toll plaza", "id": 2797, "trainId": 547},
{"name": "punching bag", "id": 2022, "trainId": 548},
{"name": "trough", "id": 2876, "trainId": 549},
{"name": "throne", "id": 2758, "trainId": 550},
{"name": "chair desk", "id": 472, "trainId": 551},
{"name": "weighbridge", "id": 3028, "trainId": 552},
{"name": "extractor fan", "id": 882, "trainId": 553},
{"name": "hanging clothes", "id": 1189, "trainId": 554},
{"name": "dish, dish aerial, dish antenna, saucer", "id": 743, "trainId": 555},
{"name": "alarm clock, alarm", "id": 3122, "trainId": 556},
{"name": "ski lift", "id": 2401, "trainId": 557},
{"name": "chain", "id": 468, "trainId": 558},
{"name": "garage", "id": 1061, "trainId": 559},
{"name": "mechanical shovel", "id": 1523, "trainId": 560},
{"name": "wine rack", "id": 3059, "trainId": 561},
{"name": "tramway", "id": 2843, "trainId": 562},
{"name": "treadmill", "id": 2853, "trainId": 563},
{"name": "menu", "id": 1529, "trainId": 564},
{"name": "block", "id": 214, "trainId": 565},
{"name": "well", "id": 3032, "trainId": 566},
{"name": "witness stand", "id": 3071, "trainId": 567},
{"name": "branch", "id": 277, "trainId": 568},
{"name": "duck", "id": 813, "trainId": 569},
{"name": "casserole", "id": 426, "trainId": 570},
{"name": "frying pan", "id": 1039, "trainId": 571},
{"name": "desk organizer", "id": 727, "trainId": 572},
{"name": "mast", "id": 1508, "trainId": 573},
{"name": "spectacles, specs, eyeglasses, glasses", "id": 2490, "trainId": 574},
{"name": "service elevator", "id": 2299, "trainId": 575},
{"name": "dollhouse", "id": 768, "trainId": 576},
{"name": "hammock", "id": 1172, "trainId": 577},
{"name": "clothes hanging", "id": 537, "trainId": 578},
{"name": "photocopier", "id": 1847, "trainId": 579},
{"name": "notepad", "id": 1664, "trainId": 580},
{"name": "golf cart", "id": 1110, "trainId": 581},
{"name": "footpath", "id": 1014, "trainId": 582},
{"name": "cross", "id": 662, "trainId": 583},
{"name": "baptismal font", "id": 121, "trainId": 584},
{"name": "boiler", "id": 227, "trainId": 585},
{"name": "skip", "id": 2410, "trainId": 586},
{"name": "rotisserie", "id": 2165, "trainId": 587},
{"name": "tables", "id": 2696, "trainId": 588},
{"name": "water mill", "id": 3005, "trainId": 589},
{"name": "helmet", "id": 1231, "trainId": 590},
{"name": "cover curtain", "id": 635, "trainId": 591},
{"name": "brick", "id": 292, "trainId": 592},
{"name": "table runner", "id": 2690, "trainId": 593},
{"name": "ashtray", "id": 65, "trainId": 594},
{"name": "street box", "id": 2607, "trainId": 595},
{"name": "stick", "id": 2574, "trainId": 596},
{"name": "hangers", "id": 1188, "trainId": 597},
{"name": "cells", "id": 456, "trainId": 598},
{"name": "urinal", "id": 2913, "trainId": 599},
{"name": "centerpiece", "id": 459, "trainId": 600},
{"name": "portable fridge", "id": 1955, "trainId": 601},
{"name": "dvds", "id": 827, "trainId": 602},
{"name": "golf club", "id": 1111, "trainId": 603},
{"name": "skirting board", "id": 2412, "trainId": 604},
{"name": "water cooler", "id": 2997, "trainId": 605},
{"name": "clipboard", "id": 528, "trainId": 606},
{"name": "camera, photographic camera", "id": 366, "trainId": 607},
{"name": "pigeonhole", "id": 1863, "trainId": 608},
{"name": "chips", "id": 500, "trainId": 609},
{"name": "food processor", "id": 1001, "trainId": 610},
{"name": "post box", "id": 1958, "trainId": 611},
{"name": "lid", "id": 1441, "trainId": 612},
{"name": "drum", "id": 809, "trainId": 613},
{"name": "blender", "id": 210, "trainId": 614},
{"name": "cave entrance", "id": 435, "trainId": 615},
{"name": "dental chair", "id": 718, "trainId": 616},
{"name": "obelisk", "id": 1674, "trainId": 617},
{"name": "canoe", "id": 388, "trainId": 618},
{"name": "mobile", "id": 1572, "trainId": 619},
{"name": "monitors", "id": 1584, "trainId": 620},
{"name": "pool ball", "id": 1944, "trainId": 621},
{"name": "cue rack", "id": 674, "trainId": 622},
{"name": "baggage carts", "id": 99, "trainId": 623},
{"name": "shore", "id": 2352, "trainId": 624},
{"name": "fork", "id": 1019, "trainId": 625},
{"name": "paper filer", "id": 1763, "trainId": 626},
{"name": "bicycle rack", "id": 185, "trainId": 627},
{"name": "coat rack", "id": 554, "trainId": 628},
{"name": "garland", "id": 1066, "trainId": 629},
{"name": "sports bag", "id": 2508, "trainId": 630},
{"name": "fish tank", "id": 951, "trainId": 631},
{"name": "towel dispenser", "id": 2822, "trainId": 632},
{"name": "carriage", "id": 415, "trainId": 633},
{"name": "brochure", "id": 297, "trainId": 634},
{"name": "plaque", "id": 1914, "trainId": 635},
{"name": "stringer", "id": 2619, "trainId": 636},
{"name": "iron", "id": 1338, "trainId": 637},
{"name": "spoon", "id": 2505, "trainId": 638},
{"name": "flag pole", "id": 955, "trainId": 639},
{"name": "toilet brush", "id": 2786, "trainId": 640},
{"name": "book stand", "id": 238, "trainId": 641},
{"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642},
{"name": "ticket office", "id": 2763, "trainId": 643},
{"name": "broom", "id": 299, "trainId": 644},
{"name": "dvd", "id": 822, "trainId": 645},
{"name": "ice bucket", "id": 1288, "trainId": 646},
{"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647},
{"name": "tureen", "id": 2894, "trainId": 648},
{"name": "folders", "id": 992, "trainId": 649},
{"name": "chess", "id": 489, "trainId": 650},
{"name": "root", "id": 2157, "trainId": 651},
{"name": "sewing machine", "id": 2309, "trainId": 652},
{"name": "model", "id": 1576, "trainId": 653},
{"name": "pen", "id": 1810, "trainId": 654},
{"name": "violin", "id": 2964, "trainId": 655},
{"name": "sweatshirt", "id": 2662, "trainId": 656},
{"name": "recycling materials", "id": 2087, "trainId": 657},
{"name": "mitten", "id": 1569, "trainId": 658},
{"name": "chopping board, cutting board", "id": 503, "trainId": 659},
{"name": "mask", "id": 1505, "trainId": 660},
{"name": "log", "id": 1468, "trainId": 661},
{"name": "mouse, computer mouse", "id": 1613, "trainId": 662},
{"name": "grill", "id": 1138, "trainId": 663},
{"name": "hole", "id": 1256, "trainId": 664},
{"name": "target", "id": 2715, "trainId": 665},
{"name": "trash bag", "id": 2846, "trainId": 666},
{"name": "chalk", "id": 477, "trainId": 667},
{"name": "sticks", "id": 2576, "trainId": 668},
{"name": "balloon", "id": 108, "trainId": 669},
{"name": "score", "id": 2245, "trainId": 670},
{"name": "hair spray", "id": 1162, "trainId": 671},
{"name": "roll", "id": 2149, "trainId": 672},
{"name": "runner", "id": 2183, "trainId": 673},
{"name": "engine", "id": 858, "trainId": 674},
{"name": "inflatable glove", "id": 1324, "trainId": 675},
{"name": "games", "id": 1055, "trainId": 676},
{"name": "pallets", "id": 1741, "trainId": 677},
{"name": "baskets", "id": 149, "trainId": 678},
{"name": "coop", "id": 615, "trainId": 679},
{"name": "dvd player", "id": 825, "trainId": 680},
{"name": "rocking horse", "id": 2143, "trainId": 681},
{"name": "buckets", "id": 304, "trainId": 682},
{"name": "bread rolls", "id": 283, "trainId": 683},
{"name": "shawl", "id": 2322, "trainId": 684},
{"name": "watering can", "id": 3017, "trainId": 685},
{"name": "spotlights", "id": 2510, "trainId": 686},
{"name": "post-it", "id": 1960, "trainId": 687},
{"name": "bowls", "id": 265, "trainId": 688},
{"name": "security camera", "id": 2282, "trainId": 689},
{"name": "runner cloth", "id": 2184, "trainId": 690},
{"name": "lock", "id": 1461, "trainId": 691},
{"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692},
{"name": "side", "id": 2372, "trainId": 693},
{"name": "roulette", "id": 2166, "trainId": 694},
{"name": "bone", "id": 232, "trainId": 695},
{"name": "cutlery", "id": 693, "trainId": 696},
{"name": "pool balls", "id": 1945, "trainId": 697},
{"name": "wheels", "id": 3039, "trainId": 698},
{"name": "spice rack", "id": 2494, "trainId": 699},
{"name": "plant pots", "id": 1908, "trainId": 700},
{"name": "towel ring", "id": 2827, "trainId": 701},
{"name": "bread box", "id": 280, "trainId": 702},
{"name": "video", "id": 2950, "trainId": 703},
{"name": "funfair", "id": 1044, "trainId": 704},
{"name": "breads", "id": 288, "trainId": 705},
{"name": "tripod", "id": 2863, "trainId": 706},
{"name": "ironing board", "id": 1342, "trainId": 707},
{"name": "skimmer", "id": 2409, "trainId": 708},
{"name": "hollow", "id": 1258, "trainId": 709},
{"name": "scratching post", "id": 2249, "trainId": 710},
{"name": "tricycle", "id": 2862, "trainId": 711},
{"name": "file box", "id": 920, "trainId": 712},
{"name": "mountain pass", "id": 1607, "trainId": 713},
{"name": "tombstones", "id": 2802, "trainId": 714},
{"name": "cooker", "id": 610, "trainId": 715},
{"name": "card game, cards", "id": 3129, "trainId": 716},
{"name": "golf bag", "id": 1108, "trainId": 717},
{"name": "towel paper", "id": 2823, "trainId": 718},
{"name": "chaise lounge", "id": 476, "trainId": 719},
{"name": "sun", "id": 2641, "trainId": 720},
{"name": "toilet paper holder", "id": 2788, "trainId": 721},
{"name": "rake", "id": 2070, "trainId": 722},
{"name": "key", "id": 1368, "trainId": 723},
{"name": "umbrella stand", "id": 2903, "trainId": 724},
{"name": "dartboard", "id": 699, "trainId": 725},
{"name": "transformer", "id": 2844, "trainId": 726},
{"name": "fireplace utensils", "id": 942, "trainId": 727},
{"name": "sweatshirts", "id": 2663, "trainId": 728},
{
"name": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
"id": 457,
"trainId": 729,
},
{"name": "tallboy", "id": 2701, "trainId": 730},
{"name": "stapler", "id": 2540, "trainId": 731},
{"name": "sauna", "id": 2231, "trainId": 732},
{"name": "test tube", "id": 2746, "trainId": 733},
{"name": "palette", "id": 1738, "trainId": 734},
{"name": "shopping carts", "id": 2350, "trainId": 735},
{"name": "tools", "id": 2808, "trainId": 736},
{"name": "push button, push, button", "id": 2025, "trainId": 737},
{"name": "star", "id": 2541, "trainId": 738},
{"name": "roof rack", "id": 2156, "trainId": 739},
{"name": "barbed wire", "id": 126, "trainId": 740},
{"name": "spray", "id": 2512, "trainId": 741},
{"name": "ear", "id": 831, "trainId": 742},
{"name": "sponge", "id": 2503, "trainId": 743},
{"name": "racket", "id": 2039, "trainId": 744},
{"name": "tins", "id": 2774, "trainId": 745},
{"name": "eyeglasses", "id": 886, "trainId": 746},
{"name": "file", "id": 919, "trainId": 747},
{"name": "scarfs", "id": 2240, "trainId": 748},
{"name": "sugar bowl", "id": 2636, "trainId": 749},
{"name": "flip flop", "id": 963, "trainId": 750},
{"name": "headstones", "id": 1218, "trainId": 751},
{"name": "laptop bag", "id": 1406, "trainId": 752},
{"name": "leash", "id": 1420, "trainId": 753},
{"name": "climbing frame", "id": 526, "trainId": 754},
{"name": "suit hanger", "id": 2639, "trainId": 755},
{"name": "floor spotlight", "id": 975, "trainId": 756},
{"name": "plate rack", "id": 1921, "trainId": 757},
{"name": "sewer", "id": 2305, "trainId": 758},
{"name": "hard drive", "id": 1193, "trainId": 759},
{"name": "sprinkler", "id": 2517, "trainId": 760},
{"name": "tools box", "id": 2809, "trainId": 761},
{"name": "necklace", "id": 1647, "trainId": 762},
{"name": "bulbs", "id": 314, "trainId": 763},
{"name": "steel industry", "id": 2560, "trainId": 764},
{"name": "club", "id": 545, "trainId": 765},
{"name": "jack", "id": 1345, "trainId": 766},
{"name": "door bars", "id": 775, "trainId": 767},
{
"name": "control panel, instrument panel, control board, board, panel",
"id": 603,
"trainId": 768,
},
{"name": "hairbrush", "id": 1163, "trainId": 769},
{"name": "napkin holder", "id": 1641, "trainId": 770},
{"name": "office", "id": 1678, "trainId": 771},
{"name": "smoke detector", "id": 2450, "trainId": 772},
{"name": "utensils", "id": 2915, "trainId": 773},
{"name": "apron", "id": 42, "trainId": 774},
{"name": "scissors", "id": 2242, "trainId": 775},
{"name": "terminal", "id": 2741, "trainId": 776},
{"name": "grinder", "id": 1143, "trainId": 777},
{"name": "entry phone", "id": 862, "trainId": 778},
{"name": "newspaper stand", "id": 1654, "trainId": 779},
{"name": "pepper shaker", "id": 1826, "trainId": 780},
{"name": "onions", "id": 1689, "trainId": 781},
{
"name": "central processing unit, cpu, c p u , central processor, processor, mainframe",
"id": 3124,
"trainId": 782,
},
{"name": "tape", "id": 2710, "trainId": 783},
{"name": "bat", "id": 152, "trainId": 784},
{"name": "coaster", "id": 549, "trainId": 785},
{"name": "calculator", "id": 360, "trainId": 786},
{"name": "potatoes", "id": 1982, "trainId": 787},
{"name": "luggage rack", "id": 1478, "trainId": 788},
{"name": "salt", "id": 2203, "trainId": 789},
{"name": "street number", "id": 2612, "trainId": 790},
{"name": "viewpoint", "id": 2956, "trainId": 791},
{"name": "sword", "id": 2681, "trainId": 792},
{"name": "cd", "id": 437, "trainId": 793},
{"name": "rowing machine", "id": 2171, "trainId": 794},
{"name": "plug", "id": 1933, "trainId": 795},
{"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796},
{"name": "pepper", "id": 1824, "trainId": 797},
{"name": "tongs", "id": 2803, "trainId": 798},
{"name": "bonfire", "id": 234, "trainId": 799},
{"name": "dog dish", "id": 764, "trainId": 800},
{"name": "belt", "id": 177, "trainId": 801},
{"name": "dumbbells", "id": 817, "trainId": 802},
{"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803},
{"name": "hook", "id": 1262, "trainId": 804},
{"name": "envelopes", "id": 864, "trainId": 805},
{"name": "shower faucet", "id": 2359, "trainId": 806},
{"name": "watch", "id": 2992, "trainId": 807},
{"name": "padlock", "id": 1725, "trainId": 808},
{"name": "swimming pool ladder", "id": 2667, "trainId": 809},
{"name": "spanners", "id": 2484, "trainId": 810},
{"name": "gravy boat", "id": 1133, "trainId": 811},
{"name": "notice board", "id": 1667, "trainId": 812},
{"name": "trash bags", "id": 2847, "trainId": 813},
{"name": "fire alarm", "id": 932, "trainId": 814},
{"name": "ladle", "id": 1392, "trainId": 815},
{"name": "stethoscope", "id": 2573, "trainId": 816},
{"name": "rocket", "id": 2140, "trainId": 817},
{"name": "funnel", "id": 1046, "trainId": 818},
{"name": "bowling pins", "id": 264, "trainId": 819},
{"name": "valve", "id": 2927, "trainId": 820},
{"name": "thermometer", "id": 2752, "trainId": 821},
{"name": "cups", "id": 679, "trainId": 822},
{"name": "spice jar", "id": 2493, "trainId": 823},
{"name": "night light", "id": 1658, "trainId": 824},
{"name": "soaps", "id": 2466, "trainId": 825},
{"name": "games table", "id": 1057, "trainId": 826},
{"name": "slotted spoon", "id": 2444, "trainId": 827},
{"name": "reel", "id": 2093, "trainId": 828},
{"name": "scourer", "id": 2248, "trainId": 829},
{"name": "sleeping robe", "id": 2432, "trainId": 830},
{"name": "desk mat", "id": 726, "trainId": 831},
{"name": "dumbbell", "id": 816, "trainId": 832},
{"name": "hammer", "id": 1171, "trainId": 833},
{"name": "tie", "id": 2766, "trainId": 834},
{"name": "typewriter", "id": 2900, "trainId": 835},
{"name": "shaker", "id": 2313, "trainId": 836},
{"name": "cheese dish", "id": 488, "trainId": 837},
{"name": "sea star", "id": 2265, "trainId": 838},
{"name": "racquet", "id": 2043, "trainId": 839},
{"name": "butane gas cylinder", "id": 332, "trainId": 840},
{"name": "paper weight", "id": 1771, "trainId": 841},
{"name": "shaving brush", "id": 2320, "trainId": 842},
{"name": "sunglasses", "id": 2646, "trainId": 843},
{"name": "gear shift", "id": 1089, "trainId": 844},
{"name": "towel rail", "id": 2826, "trainId": 845},
{"name": "adding machine, totalizer, totaliser", "id": 3148, "trainId": 846},
]
def _get_ade20k_full_meta():
    """Assemble the semantic-segmentation metadata for ADE20K-Full.

    Returns:
        dict: two entries derived from ``ADE20K_SEM_SEG_FULL_CATEGORIES``:

        * ``"stuff_dataset_id_to_contiguous_id"`` -- maps every category's
          ``"id"`` to its position in the category list.
        * ``"stuff_classes"`` -- the category ``"name"`` strings, in list order.
    """
    # NOTE(review): an earlier comment here stated that id 0 is reserved for the
    # ignore label (remapped to 255 during pre-processing) and that all ids are
    # therefore shifted by 1. Nothing in this function performs such a shift --
    # confirm against the dataset preparation script.
    dataset_ids = [category["id"] for category in ADE20K_SEM_SEG_FULL_CATEGORIES]

    # Sanity check: the category table is expected to hold exactly 847 entries.
    num_categories = len(dataset_ids)
    assert num_categories == 847, num_categories

    # Dataset ids are sparse; contiguous ids are simply the list positions
    # (0 .. 846) and are what gets paired with each dataset id here.
    dataset_id_to_contiguous = dict(zip(dataset_ids, range(num_categories)))

    class_names = [category["name"] for category in ADE20K_SEM_SEG_FULL_CATEGORIES]

    return {
        "stuff_dataset_id_to_contiguous_id": dataset_id_to_contiguous,
        "stuff_classes": class_names,
    }
def register_all_ade20k_full(root):
    """Register the ADE20K-Full semantic segmentation train and val splits.

    For each split a loader is added to ``DatasetCatalog`` under the name
    ``ade20k_full_sem_seg_<split>`` and the matching metadata is attached via
    ``MetadataCatalog``.

    Args:
        root: directory that contains the ``ADE20K_2021_17_01`` folder.
    """
    dataset_root = os.path.join(root, "ADE20K_2021_17_01")
    meta = _get_ade20k_full_meta()

    # (split suffix used in the registered name, sub-directory on disk)
    splits = (("train", "training"), ("val", "validation"))
    for split, dirname in splits:
        image_dir = os.path.join(dataset_root, "images_detectron2", dirname)
        gt_dir = os.path.join(dataset_root, "annotations_detectron2", dirname)
        dataset_name = f"ade20k_full_sem_seg_{split}"

        # Default arguments freeze this iteration's directories, so each
        # registered loader keeps pointing at its own split.
        def _load_split(img_root=image_dir, ann_root=gt_dir):
            return load_sem_seg(ann_root, img_root, gt_ext="tif", image_ext="jpg")

        DatasetCatalog.register(dataset_name, _load_split)

        split_metadata = MetadataCatalog.get(dataset_name)
        split_metadata.set(
            # Copy the list so every split owns an independent class list.
            stuff_classes=list(meta["stuff_classes"]),
            image_root=image_dir,
            sem_seg_root=gt_dir,
            evaluator_type="sem_seg",
            ignore_label=65535,  # NOTE: gt is saved in 16-bit TIFF images
        )
# Register the ADE20K-Full splits when this code runs. The datasets directory
# comes from the DETECTRON2_DATASETS environment variable and falls back to
# the relative path "datasets" when the variable is not set.
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_ade20k_full(_root)
================================================
FILE: mask2former/data/datasets/register_ade20k_instance.py
================================================
import json
import logging
import numpy as np
import os
from PIL import Image
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets.coco import load_coco_json, register_coco_instances
from detectron2.utils.file_io import PathManager
# ADE20K categories used for instance segmentation: dataset category id and name.
ADE_CATEGORIES = [
    {"id": 7, "name": "bed"},
    {"id": 8, "name": "windowpane"},
    {"id": 10, "name": "cabinet"},
    {"id": 12, "name": "person"},
    {"id": 14, "name": "door"},
    {"id": 15, "name": "table"},
    {"id": 18, "name": "curtain"},
    {"id": 19, "name": "chair"},
    {"id": 20, "name": "car"},
    {"id": 22, "name": "painting"},
    {"id": 23, "name": "sofa"},
    {"id": 24, "name": "shelf"},
    {"id": 27, "name": "mirror"},
    {"id": 30, "name": "armchair"},
    {"id": 31, "name": "seat"},
    {"id": 32, "name": "fence"},
    {"id": 33, "name": "desk"},
    {"id": 35, "name": "wardrobe"},
    {"id": 36, "name": "lamp"},
    {"id": 37, "name": "bathtub"},
    {"id": 38, "name": "railing"},
    {"id": 39, "name": "cushion"},
    {"id": 41, "name": "box"},
    {"id": 42, "name": "column"},
    {"id": 43, "name": "signboard"},
    {"id": 44, "name": "chest of drawers"},
    {"id": 45, "name": "counter"},
    {"id": 47, "name": "sink"},
    {"id": 49, "name": "fireplace"},
    {"id": 50, "name": "refrigerator"},
    {"id": 53, "name": "stairs"},
    {"id": 55, "name": "case"},
    {"id": 56, "name": "pool table"},
    {"id": 57, "name": "pillow"},
    {"id": 58, "name": "screen door"},
    {"id": 62, "name": "bookcase"},
    {"id": 64, "name": "coffee table"},
    {"id": 65, "name": "toilet"},
    {"id": 66, "name": "flower"},
    {"id": 67, "name": "book"},
    {"id": 69, "name": "bench"},
    {"id": 70, "name": "countertop"},
    {"id": 71, "name": "stove"},
    {"id": 72, "name": "palm"},
    {"id": 73, "name": "kitchen island"},
    {"id": 74, "name": "computer"},
    {"id": 75, "name": "swivel chair"},
    {"id": 76, "name": "boat"},
    {"id": 78, "name": "arcade machine"},
    {"id": 80, "name": "bus"},
    {"id": 81, "name": "towel"},
    {"id": 82, "name": "light"},
    {"id": 83, "name": "truck"},
    {"id": 85, "name": "chandelier"},
    {"id": 86, "name": "awning"},
    {"id": 87, "name": "streetlight"},
    {"id": 88, "name": "booth"},
    {"id": 89, "name": "television receiver"},
    {"id": 90, "name": "airplane"},
    {"id": 92, "name": "apparel"},
    {"id": 93, "name": "pole"},
    {"id": 95, "name": "bannister"},
    {"id": 97, "name": "ottoman"},
    {"id": 98, "name": "bottle"},
    {"id": 102, "name": "van"},
    {"id": 103, "name": "ship"},
    {"id": 104, "name": "fountain"},
    {"id": 107, "name": "washer"},
    {"id": 108, "name": "plaything"},
    {"id": 110, "name": "stool"},
    {"id": 111, "name": "barrel"},
    {"id": 112, "name": "basket"},
    {"id": 115, "name": "bag"},
    {"id": 116, "name": "minibike"},
    {"id": 118, "name": "oven"},
    {"id": 119, "name": "ball"},
    {"id": 120, "name": "food"},
    {"id": 121, "name": "step"},
    {"id": 123, "name": "trade name"},
    {"id": 124, "name": "microwave"},
    {"id": 125, "name": "pot"},
    {"id": 126, "name": "animal"},
    {"id": 127, "name": "bicycle"},
    {"id": 129, "name": "dishwasher"},
    {"id": 130, "name": "screen"},
    {"id": 132, "name": "sculpture"},
    {"id": 133, "name": "hood"},
    {"id": 134, "name": "sconce"},
    {"id": 135, "name": "vase"},
    {"id": 136, "name": "traffic light"},
    {"id": 137, "name": "tray"},
    {"id": 138, "name": "ashcan"},
    {"id": 139, "name": "fan"},
    {"id": 142, "name": "plate"},
    {"id": 143, "name": "monitor"},
    {"id": 144, "name": "bulletin board"},
    {"id": 146, "name": "radiator"},
    {"id": 147, "name": "glass"},
    {"id": 148, "name": "clock"},
    {"id": 149, "name": "flag"},
]
# Dataset name -> (image directory, instance annotation json). Both paths are
# relative to the dataset root and are handed to register_coco_instances by
# register_all_ade20k_instance.
_PREDEFINED_SPLITS = {
# NOTE(review): the comment below looks copied from another registry; confirm
# that it actually describes these annotation files.
# point annotations without masks
"ade20k_instance_train": (
"ADEChallengeData2016/images/training",
"ADEChallengeData2016/ade20k_instance_train.json",
),
"ade20k_instance_val": (
"ADEChallengeData2016/images/validation",
"ADEChallengeData2016/ade20k_instance_val.json",
),
}
def _get_ade_instances_meta():
    """Build the metadata dict for the ADE20K instance segmentation classes.

    Returns:
        dict: two entries.
            "thing_dataset_id_to_contiguous_id" maps every category "id" in
            ADE_CATEGORIES to its position in that list.
            "thing_classes" holds the category names in list order.
    """
    categories = ADE_CATEGORIES
    dataset_ids = [category["id"] for category in categories]
    num_things = len(dataset_ids)
    assert num_things == 100, num_things

    # The ADE category ids have gaps; map each one to an index in [0, 99].
    id_to_index = dict(zip(dataset_ids, range(num_things)))

    return {
        "thing_dataset_id_to_contiguous_id": id_to_index,
        "thing_classes": [category["name"] for category in categories],
    }
def register_all_ade20k_instance(root):
    """Register every split listed in _PREDEFINED_SPLITS as a COCO-style instance dataset.

    Args:
        root (str): directory the relative image and json paths are joined onto.
            A json entry that contains "://" is used as-is.
    """
    for dataset_name, split_paths in _PREDEFINED_SPLITS.items():
        image_root, json_file = split_paths
        # A fresh metadata dict is built for every split.
        metadata = _get_ade_instances_meta()
        if "://" in json_file:
            annotation_path = json_file
        else:
            annotation_path = os.path.join(root, json_file)
        image_dir = os.path.join(root, image_root)
        register_coco_instances(dataset_name, metadata, annotation_path, image_dir)
# Registration runs at import time. The dataset root comes from the
# DETECTRON2_DATASETS environment variable, defaulting to "datasets".
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_ade20k_instance(_root)
================================================
FILE: mask2former/data/datasets/register_ade20k_panoptic.py
================================================
import json
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.file_io import PathManager
# The 150 ADE20K categories. Each entry carries a color, the dataset category
# id, an "isthing" flag (1 = thing, 0 = stuff) and a name. get_metadata() in
# this module derives the class names, colors and id mappings from this table.
ADE20K_150_CATEGORIES = [
{"color": [120, 120, 120], "id": 0, "isthing": 0, "name": "wall"},
{"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"},
{"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"},
{"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"},
{"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"},
{"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"},
{"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"},
{"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"},
# NOTE(review): the name on the next line ends with a trailing space
# ("window ") -- confirm whether that is intentional.
{"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "},
{"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"},
{"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"},
{"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"},
{"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"},
{"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"},
{"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"},
{"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"},
{"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"},
{"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"},
{"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"},
{"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"},
{"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"},
{"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"},
{"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"},
{"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"},
{"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"},
{"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"},
{"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"},
{"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"},
{"color": [255, 9, 92], "id": 28, "isthing": 0, "name": "rug"},
{"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"},
{"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"},
{"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"},
{"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"},
{"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"},
{"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"},
{"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"},
{"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"},
{"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"},
{"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"},
{"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"},
{"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"},
{"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"},
{"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"},
{"color": [255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"},
{
"color": [6, 51, 255],
"id": 44,
"isthing": 1,
"name": "chest of drawers, chest, bureau, dresser",
},
{"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"},
{"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"},
{"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"},
{"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"},
{"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"},
{"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"},
{"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"},
{"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"},
{"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"},
{"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"},
{"color": [0, 0, 255], "id": 55, "isthing": 1, "name": "case, display case, showcase, vitrine"},
{
"color": [255, 71, 0],
"id": 56,
"isthing": 1,
"name": "pool table, billiard table, snooker table",
},
{"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"},
{"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"},
{"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"},
{"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"},
{"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"},
{"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"},
{"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"},
{"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"},
{
"color": [0, 255, 133],
"id": 65,
"isthing": 1,
"name": "toilet, can, commode, crapper, pot, potty, stool, throne",
},
{"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"},
{"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"},
{"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"},
{"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"},
{"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"},
{"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"},
{"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"},
{"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"},
{"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"},
{"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"},
{"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"},
{"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"},
{"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"},
{"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"},
{"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"},
{"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"},
{"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"},
{"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"},
{"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"},
{"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"},
{"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"},
{"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"},
{"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"},
{"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"},
{"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"},
{"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"},
{"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"},
{"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"},
{"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"},
{
"color": [0, 122, 255],
"id": 95,
"isthing": 1,
"name": "bannister, banister, balustrade, balusters, handrail",
},
{
"color": [0, 255, 163],
"id": 96,
"isthing": 0,
"name": "escalator, moving staircase, moving stairway",
},
{
"color": [255, 153, 0],
"id": 97,
"isthing": 1,
"name": "ottoman, pouf, pouffe, puff, hassock",
},
{"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"},
{"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"},
{
"color": [143, 255, 0],
"id": 100,
"isthing": 0,
"name": "poster, posting, placard, notice, bill, card",
},
{"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"},
{"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"},
{"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"},
{"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"},
{
"color": [133, 0, 255],
"id": 105,
"isthing": 0,
"name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
},
{"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"},
{
"color": [184, 0, 255],
"id": 107,
"isthing": 1,
"name": "washer, automatic washer, washing machine",
},
{"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"},
{"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"},
{"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"},
{"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"},
{"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"},
{"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"},
{"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"},
{"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"},
{"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"},
{"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"},
{"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"},
{"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"},
{"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"},
{"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"},
{"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"},
{"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"},
{"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"},
{"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"},
{"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"},
{"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"},
{"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"},
{"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"},
{"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"},
{"color": [20, 0, 255], "id": 131, "isthing": 0, "name": "blanket, cover"},
{"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"},
{"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"},
{"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"},
{"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"},
{"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"},
{"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"},
{"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"},
{"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"},
{"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"},
{"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"},
{"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"},
{"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"},
{"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"},
{"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"},
{"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"},
{"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"},
{"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"},
{"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"},
]
# Per-category colors in table order, attached to the two ade20k_sem_seg
# metadata entries. Each entry receives its own shallow copy of the list.
ADE20k_COLORS = [category["color"] for category in ADE20K_150_CATEGORIES]

MetadataCatalog.get("ade20k_sem_seg_train").set(stuff_colors=list(ADE20k_COLORS))
MetadataCatalog.get("ade20k_sem_seg_val").set(stuff_colors=list(ADE20k_COLORS))
def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
    """
    Load ADE20K panoptic annotations as a list of dataset dicts.

    Args:
        json_file (str): path to the panoptic annotation json file.
        image_dir (str): directory containing the images.
        gt_dir (str): directory containing the panoptic annotation files.
        semseg_dir (str): directory containing the semantic annotation files.
        meta (dict): must provide "thing_dataset_id_to_contiguous_id" and
            "stuff_dataset_id_to_contiguous_id".

    Returns:
        list[dict]: one dict per annotation with the keys "file_name",
            "image_id", "pan_seg_file_name", "sem_seg_file_name" and
            "segments_info" (see detectron2's "Use Custom Datasets" tutorial
            for the standard dataset dict format).
    """

    def _remap_segment(segment):
        # Rewrites the segment in place: the dataset category id becomes the
        # contiguous id and an "isthing" flag is attached.
        dataset_id = segment["category_id"]
        is_thing = dataset_id in meta["thing_dataset_id_to_contiguous_id"]
        mapping_key = (
            "thing_dataset_id_to_contiguous_id"
            if is_thing
            else "stuff_dataset_id_to_contiguous_id"
        )
        segment["category_id"] = meta[mapping_key][dataset_id]
        segment["isthing"] = is_thing
        return segment

    with PathManager.open(json_file) as f:
        json_info = json.load(f)

    records = []
    for ann in json_info["annotations"]:
        image_id = ann["image_id"]
        file_name = ann["file_name"]
        # TODO: the image is assumed to share the label's filename with a
        # ".jpg" extension. The image extension should become a user-provided
        # argument if this loader is extended to other datasets.
        stem = os.path.splitext(file_name)[0]
        record = {
            "file_name": os.path.join(image_dir, stem + ".jpg"),
            "image_id": image_id,
            "pan_seg_file_name": os.path.join(gt_dir, file_name),
            "sem_seg_file_name": os.path.join(semseg_dir, file_name),
            "segments_info": [_remap_segment(seg) for seg in ann["segments_info"]],
        }
        records.append(record)

    assert len(records), f"No images found in {image_dir}!"
    # Only the first record is checked on disk, as a cheap sanity check.
    first = records[0]
    for path_key in ("file_name", "pan_seg_file_name", "sem_seg_file_name"):
        assert PathManager.isfile(first[path_key]), first[path_key]
    return records
def register_ade20k_panoptic(
    name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None
):
    """
    Register an ADE20k panoptic segmentation dataset under `name`.

    The dataset is loaded lazily through `load_ade20k_panoptic_json`, which
    produces dicts in detectron2's standard format.

    Args:
        name (str): the name that identifies a dataset,
            e.g. "ade20k_panoptic_train"
        metadata (dict): extra metadata associated with this dataset. It is
            passed to the loader and also stored on the metadata entry.
        image_root (str): directory which contains all the images
        panoptic_root (str): directory which contains the panoptic annotation images
        semantic_root (str): directory which contains the semantic annotation images
        panoptic_json (str): path to the json panoptic annotation file
        instances_json (str): path to the json instance annotation file
    """

    def _load_dataset():
        return load_ade20k_panoptic_json(
            panoptic_json, image_root, panoptic_root, semantic_root, metadata
        )

    DatasetCatalog.register(name, _load_dataset)

    catalog_entry = MetadataCatalog.get(name)
    catalog_entry.set(
        panoptic_root=panoptic_root,
        image_root=image_root,
        panoptic_json=panoptic_json,
        json_file=instances_json,
        evaluator_type="ade20k_panoptic_seg",
        ignore_label=255,
        label_divisor=1000,
        **metadata,
    )
# Dataset name -> (image dir, panoptic annotation dir, panoptic json,
# semantic annotation dir, instance json). All paths are relative to the
# dataset root; register_all_ade20k_panoptic unpacks them in this order.
_PREDEFINED_SPLITS_ADE20K_PANOPTIC = {
"ade20k_panoptic_train": (
"ADEChallengeData2016/images/training",
"ADEChallengeData2016/ade20k_panoptic_train",
"ADEChallengeData2016/ade20k_panoptic_train.json",
"ADEChallengeData2016/annotations_detectron2/training",
"ADEChallengeData2016/ade20k_instance_train.json",
),
"ade20k_panoptic_val": (
"ADEChallengeData2016/images/validation",
"ADEChallengeData2016/ade20k_panoptic_val",
"ADEChallengeData2016/ade20k_panoptic_val.json",
"ADEChallengeData2016/annotations_detectron2/validation",
"ADEChallengeData2016/ade20k_instance_val.json",
),
}
def get_metadata():
    """Assemble class names, colors and id mappings from ADE20K_150_CATEGORIES.

    Returns:
        dict: with the keys "thing_classes", "thing_colors", "stuff_classes",
            "stuff_colors", "thing_dataset_id_to_contiguous_id" and
            "stuff_dataset_id_to_contiguous_id".
    """
    categories = ADE20K_150_CATEGORIES
    things = [cat for cat in categories if cat["isthing"] == 1]

    # Names and colors are kept under both "thing_*" and "stuff_*". The
    # existing comments explain that D2's visualization code treats the two
    # groups differently, and sharing the naming lets it be reused as-is.
    # The "stuff_*" lists cover every category, things included.
    meta = {
        "thing_classes": [cat["name"] for cat in things],
        "thing_colors": [cat["color"] for cat in things],
        "stuff_classes": [cat["name"] for cat in categories],
        "stuff_colors": [cat["color"] for cat in categories],
    }

    # Two kinds of category id exist:
    #   - the original dataset id, mainly used for evaluation;
    #   - the contiguous id in [0, #classes), used to train the classifier.
    # The contiguous id is the category's position in the full table, for
    # things as well as stuff.
    meta["thing_dataset_id_to_contiguous_id"] = {
        cat["id"]: index for index, cat in enumerate(categories) if cat["isthing"]
    }
    # Every category gets a stuff id so the sem_seg evaluator can be used.
    meta["stuff_dataset_id_to_contiguous_id"] = {
        cat["id"]: index for index, cat in enumerate(categories)
    }
    return meta
def register_all_ade20k_panoptic(root):
    """Register every split in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.

    Args:
        root (str): directory that all relative split paths are joined onto.
    """
    metadata = get_metadata()
    for dataset_name, split_paths in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items():
        image_root, panoptic_root, panoptic_json, semantic_root, instance_json = split_paths
        # The same metadata dict is shared by all ADE20K panoptic splits.
        register_ade20k_panoptic(
            name=dataset_name,
            metadata=metadata,
            image_root=os.path.join(root, image_root),
            panoptic_root=os.path.join(root, panoptic_root),
            semantic_root=os.path.join(root, semantic_root),
            panoptic_json=os.path.join(root, panoptic_json),
            instances_json=os.path.join(root, instance_json),
        )
# Registration runs at import time. The dataset root comes from the
# DETECTRON2_DATASETS environment variable, defaulting to "datasets".
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_ade20k_panoptic(_root)
================================================
FILE: mask2former/data/datasets/register_coco_panoptic_annos_semseg.py
================================================
import json
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import load_sem_seg
from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
from detectron2.utils.file_io import PathManager
# Dataset name -> (panoptic annotation dir, panoptic json, semantic annotation
# dir). All paths are relative to the dataset root;
# register_all_coco_panoptic_annos_sem_seg unpacks them in this order.
_PREDEFINED_SPLITS_COCO_PANOPTIC = {
"coco_2017_train_panoptic": (
# This is the original panoptic annotation directory
"coco/panoptic_train2017",
"coco/annotations/panoptic_train2017.json",
# This directory contains semantic annotations that are
# converted from panoptic annotations.
# It is used by PanopticFPN.
# You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
# to create these directories.
"coco/panoptic_semseg_train2017",
),
"coco_2017_val_panoptic": (
"coco/panoptic_val2017",
"coco/annotations/panoptic_val2017.json",
"coco/panoptic_semseg_val2017",
),
}
def get_metadata():
    """Assemble class names, colors and id mappings from COCO_CATEGORIES.

    Returns:
        dict: with the keys "thing_classes", "thing_colors", "stuff_classes",
            "stuff_colors", "thing_dataset_id_to_contiguous_id" and
            "stuff_dataset_id_to_contiguous_id".
    """
    categories = COCO_CATEGORIES
    things = [cat for cat in categories if cat["isthing"] == 1]

    # Names and colors are kept under both "thing_*" and "stuff_*". The
    # existing comments explain that D2's visualization code treats the two
    # groups differently, and sharing the naming lets it be reused as-is.
    # The "stuff_*" lists cover every category, things included.
    meta = {
        "thing_classes": [cat["name"] for cat in things],
        "thing_colors": [cat["color"] for cat in things],
        "stuff_classes": [cat["name"] for cat in categories],
        "stuff_colors": [cat["color"] for cat in categories],
    }

    # Two kinds of category id exist:
    #   - the original dataset id, mainly used for evaluation;
    #   - the contiguous id in [0, #classes), used to train the classifier.
    # The contiguous id is the category's position in the full table, for
    # things as well as stuff.
    meta["thing_dataset_id_to_contiguous_id"] = {
        cat["id"]: index for index, cat in enumerate(categories) if cat["isthing"]
    }
    # Every category gets a stuff id so the sem_seg evaluator can be used.
    meta["stuff_dataset_id_to_contiguous_id"] = {
        cat["id"]: index for index, cat in enumerate(categories)
    }
    return meta
def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
    """
    Load COCO panoptic annotations as a list of dataset dicts.

    Args:
        json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
        image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
        gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
        semseg_dir (str): directory containing the semantic annotation files.
        meta (dict): must provide "thing_dataset_id_to_contiguous_id" and
            "stuff_dataset_id_to_contiguous_id".

    Returns:
        list[dict]: one dict per annotation with the keys "file_name",
            "image_id", "pan_seg_file_name", "sem_seg_file_name" and
            "segments_info" (see detectron2's "Use Custom Datasets" tutorial
            for the standard dataset dict format).
    """

    def _remap_segment(segment):
        # Rewrites the segment in place: the dataset category id becomes the
        # contiguous id and an "isthing" flag is attached.
        dataset_id = segment["category_id"]
        is_thing = dataset_id in meta["thing_dataset_id_to_contiguous_id"]
        mapping_key = (
            "thing_dataset_id_to_contiguous_id"
            if is_thing
            else "stuff_dataset_id_to_contiguous_id"
        )
        segment["category_id"] = meta[mapping_key][dataset_id]
        segment["isthing"] = is_thing
        return segment

    with PathManager.open(json_file) as f:
        json_info = json.load(f)

    records = []
    for ann in json_info["annotations"]:
        image_id = int(ann["image_id"])
        file_name = ann["file_name"]
        # TODO: the image is assumed to share the label's filename with a
        # ".jpg" extension, as COCO images do. The image extension should
        # become a user-provided argument if this loader is extended to
        # other COCO-like datasets.
        stem = os.path.splitext(file_name)[0]
        record = {
            "file_name": os.path.join(image_dir, stem + ".jpg"),
            "image_id": image_id,
            "pan_seg_file_name": os.path.join(gt_dir, file_name),
            "sem_seg_file_name": os.path.join(semseg_dir, file_name),
            "segments_info": [_remap_segment(seg) for seg in ann["segments_info"]],
        }
        records.append(record)

    assert len(records), f"No images found in {image_dir}!"
    # Only the first record is checked on disk, as a cheap sanity check.
    first = records[0]
    for path_key in ("file_name", "pan_seg_file_name", "sem_seg_file_name"):
        assert PathManager.isfile(first[path_key]), first[path_key]
    return records
def register_coco_panoptic_annos_sem_seg(
    name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
):
    """
    Register `name + "_with_sem_seg"` and refresh the thing metadata of `name`.

    Args:
        name (str): name of an existing panoptic dataset, e.g. "coco_2017_train_panoptic".
        metadata (dict): metadata for the new dataset. Its "thing_classes" and
            "thing_colors" also replace those stored under `name`.
        image_root (str): directory which contains all the images
        panoptic_root (str): directory which contains the panoptic annotation images
        panoptic_json (str): path to the json panoptic annotation file
        sem_seg_root (str): directory which contains the semantic annotation images
        instances_json (str): path to the json instance annotation file
    """
    # The thing attributes on the existing entry are removed before being set
    # again (presumably because the catalog refuses to overwrite a stored
    # value with a different one -- confirm against detectron2's Metadata).
    for attribute in ("thing_classes", "thing_colors"):
        delattr(MetadataCatalog.get(name), attribute)
    MetadataCatalog.get(name).set(
        thing_classes=metadata["thing_classes"],
        thing_colors=metadata["thing_colors"],
    )

    # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg"
    semantic_name = name + "_with_sem_seg"

    def _load_dataset():
        return load_coco_panoptic_json(
            panoptic_json, image_root, panoptic_root, sem_seg_root, metadata
        )

    DatasetCatalog.register(semantic_name, _load_dataset)

    semantic_entry = MetadataCatalog.get(semantic_name)
    semantic_entry.set(
        sem_seg_root=sem_seg_root,
        panoptic_root=panoptic_root,
        image_root=image_root,
        panoptic_json=panoptic_json,
        json_file=instances_json,
        evaluator_type="coco_panoptic_seg",
        ignore_label=255,
        label_divisor=1000,
        **metadata,
    )
def register_all_coco_panoptic_annos_sem_seg(root):
    """Register every split in _PREDEFINED_SPLITS_COCO_PANOPTIC.

    Args:
        root (str): directory that the relative panoptic and semantic paths
            are joined onto.
    """
    suffix_length = len("_panoptic")
    for prefix, split_paths in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
        panoptic_root, panoptic_json, semantic_root = split_paths

        # Image directory and instance json are read from the metadata entry
        # whose name is the panoptic name without its "_panoptic" suffix.
        instances_name = prefix[:-suffix_length]
        instances_meta = MetadataCatalog.get(instances_name)
        image_root = instances_meta.image_root
        instances_json = instances_meta.json_file

        register_coco_panoptic_annos_sem_seg(
            name=prefix,
            metadata=get_metadata(),
            image_root=image_root,
            panoptic_root=os.path.join(root, panoptic_root),
            panoptic_json=os.path.join(root, panoptic_json),
            sem_seg_root=os.path.join(root, semantic_root),
            instances_json=instances_json,
        )
# Registration runs at import time. The dataset root comes from the
# DETECTRON2_DATASETS environment variable, defaulting to "datasets".
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_coco_panoptic_annos_sem_seg(_root)
================================================
FILE: mask2former/data/datasets/register_coco_stuff_10k.py
================================================
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import load_sem_seg
COCO_CATEGORIES = [
{"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
{"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
{"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
{"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
{"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
{"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
{"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
{"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
{"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
{"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
{"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
{"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
{"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
{"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
{"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
{"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
{"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
{"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
{"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
{"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
{"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
{"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
{"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
{"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
{"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
{"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
{"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
{"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
{"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
{"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
{"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
{"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
{"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
{"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
{"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
{"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
{"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
{"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
{"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
{"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
{"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
{"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
{"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
{"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
{"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
{"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
{"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
{"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
{"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
{"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
{"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
{"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
{"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
{"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
{"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
{"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
{"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
{"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
{"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
{"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
{"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
{"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
{"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
{"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
{"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
{"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
{"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
{"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
{"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
{"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
{"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
{"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
{"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
{"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
{"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
{"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
{"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
{"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
{"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
{"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
{"id": 92, "name": "banner", "supercategory": "textile"},
{"id": 93, "name": "blanket", "supercategory": "textile"},
{"id": 94, "name": "branch", "supercategory": "plant"},
{"id": 95, "name": "bridge", "supercategory": "building"},
{"id": 96, "name": "building-other", "supercategory": "building"},
{"id": 97, "name": "bush", "supercategory": "plant"},
{"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"},
{"id": 99, "name": "cage", "supercategory": "structural"},
{"id": 100, "name": "cardboard", "supercategory": "raw-material"},
{"id": 101, "name": "carpet", "supercategory": "floor"},
{"id": 102, "name": "ceiling-other", "supercategory": "ceiling"},
{"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"},
{"id": 104, "name": "cloth", "supercategory": "textile"},
{"id": 105, "name": "clothes", "supercategory": "textile"},
{"id": 106, "name": "clouds", "supercategory": "sky"},
{"id": 107, "name": "counter", "supercategory": "furniture-stuff"},
{"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"},
{"id": 109, "name": "curtain", "supercategory": "textile"},
{"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"},
{"id": 111, "name": "dirt", "supercategory": "ground"},
{"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"},
{"id": 113, "name": "fence", "supercategory": "structural"},
{"id": 114, "name": "floor-marble", "supercategory": "floor"},
{"id": 115, "name": "floor-other", "supercategory": "floor"},
{"id": 116, "name": "floor-stone", "supercategory": "floor"},
{"id": 117, "name": "floor-tile", "supercategory": "floor"},
{"id": 118, "name": "floor-wood", "supercategory": "floor"},
{"id": 119, "name": "flower", "supercategory": "plant"},
{"id": 120, "name": "fog", "supercategory": "water"},
{"id": 121, "name": "food-other", "supercategory": "food-stuff"},
{"id": 122, "name": "fruit", "supercategory": "food-stuff"},
{"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"},
{"id": 124, "name": "grass", "supercategory": "plant"},
{"id": 125, "name": "gravel", "supercategory": "ground"},
{"id": 126, "name": "ground-other", "supercategory": "ground"},
{"id": 127, "name": "hill", "supercategory": "solid"},
{"id": 128, "name": "house", "supercategory": "building"},
{"id": 129, "name": "leaves", "supercategory": "plant"},
{"id": 130, "name": "light", "supercategory": "furniture-stuff"},
{"id": 131, "name": "mat", "supercategory": "textile"},
{"id": 132, "name": "metal", "supercategory": "raw-material"},
{"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"},
{"id": 134, "name": "moss", "supercategory": "plant"},
{"id": 135, "name": "mountain", "supercategory": "solid"},
{"id": 136, "name": "mud", "supercategory": "ground"},
{"id": 137, "name": "napkin", "supercategory": "textile"},
{"id": 138, "name": "net", "supercategory": "structural"},
{"id": 139, "name": "paper", "supercategory": "raw-material"},
{"id": 140, "name": "pavement", "supercategory": "ground"},
{"id": 141, "name": "pillow", "supercategory": "textile"},
{"id": 142, "name": "plant-other", "supercategory": "plant"},
{"id": 143, "name": "plastic", "supercategory": "raw-material"},
{"id": 144, "name": "platform", "supercategory": "ground"},
{"id": 145, "name": "playingfield", "supercategory": "ground"},
{"id": 146, "name": "railing", "supercategory": "structural"},
{"id": 147, "name": "railroad", "supercategory": "ground"},
{"id": 148, "name": "river", "supercategory": "water"},
{"id": 149, "name": "road", "supercategory": "ground"},
{"id": 150, "name": "rock", "supercategory": "solid"},
{"id": 151, "name": "roof", "supercategory": "building"},
{"id": 152, "name": "rug", "supercategory": "textile"},
{"id": 153, "name": "salad", "supercategory": "food-stuff"},
{"id": 154, "name": "sand", "supercategory": "ground"},
{"id": 155, "name": "sea", "supercategory": "water"},
{"id": 156, "name": "shelf", "supercategory": "furniture-stuff"},
{"id": 157, "name": "sky-other", "supercategory": "sky"},
{"id": 158, "name": "skyscraper", "supercategory": "building"},
{"id": 159, "name": "snow", "supercategory": "ground"},
{"id": 160, "name": "solid-other", "supercategory": "solid"},
{"id": 161, "name": "stairs", "supercategory": "furniture-stuff"},
{"id": 162, "name": "stone", "supercategory": "solid"},
{"id": 163, "name": "straw", "supercategory": "plant"},
{"id": 164, "name": "structural-other", "supercategory": "structural"},
{"id": 165, "name": "table", "supercategory": "furniture-stuff"},
{"id": 166, "name": "tent", "supercategory": "building"},
{"id": 167, "name": "textile-other", "supercategory": "textile"},
{"id": 168, "name": "towel", "supercategory": "textile"},
{"id": 169, "name": "tree", "supercategory": "plant"},
{"id": 170, "name": "vegetable", "supercategory": "food-stuff"},
{"id": 171, "name": "wall-brick", "supercategory": "wall"},
{"id": 172, "name": "wall-concrete", "supercategory": "wall"},
{"id": 173, "name": "wall-other", "supercategory": "wall"},
{"id": 174, "name": "wall-panel", "supercategory": "wall"},
{"id": 175, "name": "wall-stone", "supercategory": "wall"},
{"id": 176, "name": "wall-tile", "supercategory": "wall"},
{"id": 177, "name": "wall-wood", "supercategory": "wall"},
{"id": 178, "name": "water-other", "supercategory": "water"},
{"id": 179, "name": "waterdrops", "supercategory": "water"},
{"id": 180, "name": "window-blind", "supercategory": "window"},
{"id": 181, "name": "window-other", "supercategory": "window"},
{"id": 182, "name": "wood", "supercategory": "solid"},
]
def _get_coco_stuff_meta():
    """Build the metadata shared by the COCO-Stuff-10k splits.

    Returns:
        dict: with two entries, both following the order of
        ``COCO_CATEGORIES``:

        * "stuff_dataset_id_to_contiguous_id": maps each category "id" to its
          position in ``COCO_CATEGORIES``;
        * "stuff_classes": the category names.
    """
    # Id 0 is reserved for ignore_label, we change ignore_label for 0
    # to 255 in our pre-processing.
    dataset_ids = [category["id"] for category in COCO_CATEGORIES]
    num_categories = len(dataset_ids)
    assert num_categories == 171, num_categories

    # For semantic segmentation, this maps ids in the dataset (used for
    # processing results) to contiguous ids in [0, 170] (used in models).
    dataset_id_to_contiguous_id = dict(zip(dataset_ids, range(num_categories)))
    class_names = [category["name"] for category in COCO_CATEGORIES]

    return {
        "stuff_dataset_id_to_contiguous_id": dataset_id_to_contiguous_id,
        "stuff_classes": class_names,
    }
def register_all_coco_stuff_10k(root):
    """Register the "train" and "test" splits of COCO-Stuff-10k.

    Each split becomes a dataset named ``coco_2017_<split>_stuff_10k_sem_seg``
    whose records are loaded lazily with ``load_sem_seg`` (png labels, jpg
    images). Both splits share the metadata from ``_get_coco_stuff_meta``.

    Args:
        root (str): datasets directory; the data is looked up under
            ``<root>/coco/coco_stuff_10k``.
    """
    dataset_root = os.path.join(root, "coco", "coco_stuff_10k")
    meta = _get_coco_stuff_meta()

    splits = (
        ("train", "images_detectron2/train", "annotations_detectron2/train"),
        ("test", "images_detectron2/test", "annotations_detectron2/test"),
    )
    for split, image_dirname, sem_seg_dirname in splits:
        image_dir = os.path.join(dataset_root, image_dirname)
        gt_dir = os.path.join(dataset_root, sem_seg_dirname)
        dataset_name = f"coco_2017_{split}_stuff_10k_sem_seg"

        # Default arguments freeze this iteration's directories; a plain
        # closure would see the values of the last iteration instead.
        def _load_split(image_root=image_dir, gt_root=gt_dir):
            return load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg")

        DatasetCatalog.register(dataset_name, _load_split)
        MetadataCatalog.get(dataset_name).set(
            image_root=image_dir,
            sem_seg_root=gt_dir,
            evaluator_type="sem_seg",
            ignore_label=255,
            **meta,
        )
# Import-time side effect: register the COCO-Stuff-10k datasets.
# The dataset directory comes from DETECTRON2_DATASETS, falling back to
# "datasets" when the variable is not set.
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_coco_stuff_10k(_root)
================================================
FILE: mask2former/data/datasets/register_mapillary_vistas.py
================================================
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import load_sem_seg
MAPILLARY_VISTAS_SEM_SEG_CATEGORIES = [
{
"color": [165, 42, 42],
"instances": True,
"readable": "Bird",
"name": "animal--bird",
"evaluate": True,
},
{
"color": [0, 192, 0],
"instances": True,
"readable": "Ground Animal",
"name": "animal--ground-animal",
"evaluate": True,
},
{
"color": [196, 196, 196],
"instances": False,
"readable": "Curb",
"name": "construction--barrier--curb",
"evaluate": True,
},
{
"color": [190, 153, 153],
"instances": False,
"readable": "Fence",
"name": "construction--barrier--fence",
"evaluate": True,
},
{
"color": [180, 165, 180],
"instances": False,
"readable": "Guard Rail",
"name": "construction--barrier--guard-rail",
"evaluate": True,
},
{
"color": [90, 120, 150],
"instances": False,
"readable": "Barrier",
"name": "construction--barrier--other-barrier",
"evaluate": True,
},
{
"color": [102, 102, 156],
"instances": False,
"readable": "Wall",
"name": "construction--barrier--wall",
"evaluate": True,
},
{
"color": [128, 64, 255],
"instances": False,
"readable": "Bike Lane",
"name": "construction--flat--bike-lane",
"evaluate": True,
},
{
"color": [140, 140, 200],
"instances": True,
"readable": "Crosswalk - Plain",
"name": "construction--flat--crosswalk-plain",
"evaluate": True,
},
{
"color": [170, 170, 170],
"instances": False,
"readable": "Curb Cut",
"name": "construction--flat--curb-cut",
"evaluate": True,
},
{
"color": [250, 170, 160],
"instances": False,
"readable": "Parking",
"name": "construction--flat--parking",
"evaluate": True,
},
{
"color": [96, 96, 96],
"instances": False,
"readable": "Pedestrian Area",
"name": "construction--flat--pedestrian-area",
"evaluate": True,
},
{
"color": [230, 150, 140],
"instances": False,
"readable": "Rail Track",
"name": "construction--flat--rail-track",
"evaluate": True,
},
{
"color": [128, 64, 128],
"instances": False,
"readable": "Road",
"name": "construction--flat--road",
"evaluate": True,
},
{
"color": [110, 110, 110],
"instances": False,
"readable": "Service Lane",
"name": "construction--flat--service-lane",
"evaluate": True,
},
{
"color": [244, 35, 232],
"instances": False,
"readable": "Sidewalk",
"name": "construction--flat--sidewalk",
"evaluate": True,
},
{
"color": [150, 100, 100],
"instances": False,
"readable": "Bridge",
"name": "construction--structure--bridge",
"evaluate": True,
},
{
"color": [70, 70, 70],
"instances": False,
"readable": "Building",
"name": "construction--structure--building",
"evaluate": True,
},
{
"color": [150, 120, 90],
"instances": False,
"readable": "Tunnel",
"name": "construction--structure--tunnel",
"evaluate": True,
},
{
"color": [220, 20, 60],
"instances": True,
"readable": "Person",
"name": "human--person",
"evaluate": True,
},
{
"color": [255, 0, 0],
"instances": True,
"readable": "Bicyclist",
"name": "human--rider--bicyclist",
"evaluate": True,
},
{
"color": [255, 0, 100],
"instances": True,
"readable": "Motorcyclist",
"name": "human--rider--motorcyclist",
"evaluate": True,
},
{
"color": [255, 0, 200],
"instances": True,
"readable": "Other Rider",
"name": "human--rider--other-rider",
"evaluate": True,
},
{
"color": [200, 128, 128],
"instances": True,
"readable": "Lane Marking - Crosswalk",
"name": "marking--crosswalk-zebra",
"evaluate": True,
},
{
"color": [255, 255, 255],
"instances": False,
"readable": "Lane Marking - General",
"name": "marking--general",
"evaluate": True,
},
{
"color": [64, 170, 64],
"instances": False,
"readable": "Mountain",
"name": "nature--mountain",
"evaluate": True,
},
{
"color": [230, 160, 50],
"instances": False,
"readable": "Sand",
"name": "nature--sand",
"evaluate": True,
},
{
"color": [70, 130, 180],
"instances": False,
"readable": "Sky",
"name": "nature--sky",
"evaluate": True,
},
{
"color": [190, 255, 255],
"instances": False,
"readable": "Snow",
"name": "nature--snow",
"evaluate": True,
},
{
"color": [152, 251, 152],
"instances": False,
"readable": "Terrain",
"name": "nature--terrain",
"evaluate": True,
},
{
"color": [107, 142, 35],
"instances": False,
"readable": "Vegetation",
"name": "nature--vegetation",
"evaluate": True,
},
{
"color": [0, 170, 30],
"instances": False,
"readable": "Water",
"name": "nature--water",
"evaluate": True,
},
{
"color": [255, 255, 128],
"instances": True,
"readable": "Banner",
"name": "object--banner",
"evaluate": True,
},
{
"color": [250, 0, 30],
"instances": True,
"readable": "Bench",
"name": "object--bench",
"evaluate": True,
},
{
"color": [100, 140, 180],
"instances": True,
"readable": "Bike Rack",
"name": "object--bike-rack",
"evaluate": True,
},
{
"color": [220, 220, 220],
"instances": True,
"readable": "Billboard",
"name": "object--billboard",
"evaluate": True,
},
{
"color": [220, 128, 128],
"instances": True,
"readable": "Catch Basin",
"name": "object--catch-basin",
"evaluate": True,
},
{
"color": [222, 40, 40],
"instances": True,
"readable": "CCTV Camera",
"name": "object--cctv-camera",
"evaluate": True,
},
{
"color": [100, 170, 30],
"instances": True,
"readable": "Fire Hydrant",
"name": "object--fire-hydrant",
"evaluate": True,
},
{
"color": [40, 40, 40],
"instances": True,
"readable": "Junction Box",
"name": "object--junction-box",
"evaluate": True,
},
{
"color": [33, 33, 33],
"instances": True,
"readable": "Mailbox",
"name": "object--mailbox",
"evaluate": True,
},
{
"color": [100, 128, 160],
"instances": True,
"readable": "Manhole",
"name": "object--manhole",
"evaluate": True,
},
{
"color": [142, 0, 0],
"instances": True,
"readable": "Phone Booth",
"name": "object--phone-booth",
"evaluate": True,
},
{
"color": [70, 100, 150],
"instances": False,
"readable": "Pothole",
"name": "object--pothole",
"evaluate": True,
},
{
"color": [210, 170, 100],
"instances": True,
"readable": "Street Light",
"name": "object--street-light",
"evaluate": True,
},
{
"color": [153, 153, 153],
"instances": True,
"readable": "Pole",
"name": "object--support--pole",
"evaluate": True,
},
{
"color": [128, 128, 128],
"instances": True,
"readable": "Traffic Sign Frame",
"name": "object--support--traffic-sign-frame",
"evaluate": True,
},
{
"color": [0, 0, 80],
"instances": True,
"readable": "Utility Pole",
"name": "object--support--utility-pole",
"evaluate": True,
},
{
"color": [250, 170, 30],
"instances": True,
"readable": "Traffic Light",
"name": "object--traffic-light",
"evaluate": True,
},
{
"color": [192, 192, 192],
"instances": True,
"readable": "Traffic Sign (Back)",
"name": "object--traffic-sign--back",
"evaluate": True,
},
{
"color": [220, 220, 0],
"instances": True,
"readable": "Traffic Sign (Front)",
"name": "object--traffic-sign--front",
"evaluate": True,
},
{
"color": [140, 140, 20],
"instances": True,
"readable": "Trash Can",
"name": "object--trash-can",
"evaluate": True,
},
{
"color": [119, 11, 32],
"instances": True,
"readable": "Bicycle",
"name": "object--vehicle--bicycle",
"evaluate": True,
},
{
"color": [150, 0, 255],
"instances": True,
"readable": "Boat",
"name": "object--vehicle--boat",
"evaluate": True,
},
{
"color": [0, 60, 100],
"instances": True,
"readable": "Bus",
"name": "object--vehicle--bus",
"evaluate": True,
},
{
"color": [0, 0, 142],
"instances": True,
"readable": "Car",
"name": "object--vehicle--car",
"evaluate": True,
},
{
"color": [0, 0, 90],
"instances": True,
"readable": "Caravan",
"name": "object--vehicle--caravan",
"evaluate": True,
},
{
"color": [0, 0, 230],
"instances": True,
"readable": "Motorcycle",
"name": "object--vehicle--motorcycle",
"evaluate": True,
},
{
"color": [0, 80, 100],
"instances": False,
"readable": "On Rails",
"name": "object--vehicle--on-rails",
"evaluate": True,
},
{
"color": [128, 64, 64],
"instances": True,
"readable": "Other Vehicle",
"name": "object--vehicle--other-vehicle",
"evaluate": True,
},
{
"color": [0, 0, 110],
"instances": True,
"readable": "Trailer",
"name": "object--vehicle--trailer",
"evaluate": True,
},
{
"color": [0, 0, 70],
"instances": True,
"readable": "Truck",
"name": "object--vehicle--truck",
"evaluate": True,
},
{
"color": [0, 0, 192],
"instances": True,
"readable": "Wheeled Slow",
"name": "object--vehicle--wheeled-slow",
"evaluate": True,
},
{
"color": [32, 32, 32],
"instances": False,
"readable": "Car Mount",
"name": "void--car-mount",
"evaluate": True,
},
{
"color": [120, 10, 10],
"instances": False,
"readable": "Ego Vehicle",
"name": "void--ego-vehicle",
"evaluate": True,
},
{
"color": [0, 0, 0],
"instances": False,
"readable": "Unlabeled",
"name": "void--unlabeled",
"evaluate": False,
},
]
def _get_mapillary_vistas_meta():
    """Collect names and colors of the evaluated Mapillary Vistas categories.

    Only entries of ``MAPILLARY_VISTAS_SEM_SEG_CATEGORIES`` whose "evaluate"
    flag is truthy are kept; exactly 65 of them are expected.

    Returns:
        dict: "stuff_classes" (the "readable" names) and "stuff_colors"
        (the "color" values), both in table order.
    """
    evaluated = [
        category for category in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES if category["evaluate"]
    ]

    class_names = [category["readable"] for category in evaluated]
    assert len(class_names) == 65

    class_colors = [category["color"] for category in evaluated]
    assert len(class_colors) == 65

    return {
        "stuff_classes": class_names,
        "stuff_colors": class_colors,
    }
def register_all_mapillary_vistas(root):
    """Register the Mapillary Vistas semantic segmentation splits.

    The "train" split reads from ``<root>/mapillary_vistas/training`` and the
    "val" split from ``<root>/mapillary_vistas/validation``; each expects an
    ``images`` and a ``labels`` sub-directory. Datasets are named
    ``mapillary_vistas_sem_seg_<split>`` and loaded lazily with
    ``load_sem_seg`` (png labels, jpg images).

    Args:
        root (str): datasets directory containing ``mapillary_vistas``.
    """
    dataset_root = os.path.join(root, "mapillary_vistas")
    meta = _get_mapillary_vistas_meta()

    splits = (("train", "training"), ("val", "validation"))
    for split, dirname in splits:
        image_dir = os.path.join(dataset_root, dirname, "images")
        gt_dir = os.path.join(dataset_root, dirname, "labels")
        dataset_name = f"mapillary_vistas_sem_seg_{split}"

        # Default arguments freeze this iteration's directories; a plain
        # closure would see the values of the last iteration instead.
        def _load_split(image_root=image_dir, gt_root=gt_dir):
            return load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg")

        DatasetCatalog.register(dataset_name, _load_split)
        MetadataCatalog.get(dataset_name).set(
            image_root=image_dir,
            sem_seg_root=gt_dir,
            evaluator_type="sem_seg",
            ignore_label=65,  # different from other datasets, Mapillary Vistas sets ignore_label to 65
            **meta,
        )
# Import-time side effect: register the Mapillary Vistas semantic datasets.
# The dataset directory comes from DETECTRON2_DATASETS, falling back to
# "datasets" when the variable is not set.
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_mapillary_vistas(_root)
================================================
FILE: mask2former/data/datasets/register_mapillary_vistas_panoptic.py
================================================
import json
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.file_io import PathManager
MAPILLARY_VISTAS_SEM_SEG_CATEGORIES = [
{'color': [165, 42, 42],
'id': 1,
'isthing': 1,
'name': 'Bird',
'supercategory': 'animal--bird'},
{'color': [0, 192, 0],
'id': 2,
'isthing': 1,
'name': 'Ground Animal',
'supercategory': 'animal--ground-animal'},
{'color': [196, 196, 196],
'id': 3,
'isthing': 0,
'name': 'Curb',
'supercategory': 'construction--barrier--curb'},
{'color': [190, 153, 153],
'id': 4,
'isthing': 0,
'name': 'Fence',
'supercategory': 'construction--barrier--fence'},
{'color': [180, 165, 180],
'id': 5,
'isthing': 0,
'name': 'Guard Rail',
'supercategory': 'construction--barrier--guard-rail'},
{'color': [90, 120, 150],
'id': 6,
'isthing': 0,
'name': 'Barrier',
'supercategory': 'construction--barrier--other-barrier'},
{'color': [102, 102, 156],
'id': 7,
'isthing': 0,
'name': 'Wall',
'supercategory': 'construction--barrier--wall'},
{'color': [128, 64, 255],
'id': 8,
'isthing': 0,
'name': 'Bike Lane',
'supercategory': 'construction--flat--bike-lane'},
{'color': [140, 140, 200],
'id': 9,
'isthing': 1,
'name': 'Crosswalk - Plain',
'supercategory': 'construction--flat--crosswalk-plain'},
{'color': [170, 170, 170],
'id': 10,
'isthing': 0,
'name': 'Curb Cut',
'supercategory': 'construction--flat--curb-cut'},
{'color': [250, 170, 160],
'id': 11,
'isthing': 0,
'name': 'Parking',
'supercategory': 'construction--flat--parking'},
{'color': [96, 96, 96],
'id': 12,
'isthing': 0,
'name': 'Pedestrian Area',
'supercategory': 'construction--flat--pedestrian-area'},
{'color': [230, 150, 140],
'id': 13,
'isthing': 0,
'name': 'Rail Track',
'supercategory': 'construction--flat--rail-track'},
{'color': [128, 64, 128],
'id': 14,
'isthing': 0,
'name': 'Road',
'supercategory': 'construction--flat--road'},
{'color': [110, 110, 110],
'id': 15,
'isthing': 0,
'name': 'Service Lane',
'supercategory': 'construction--flat--service-lane'},
{'color': [244, 35, 232],
'id': 16,
'isthing': 0,
'name': 'Sidewalk',
'supercategory': 'construction--flat--sidewalk'},
{'color': [150, 100, 100],
'id': 17,
'isthing': 0,
'name': 'Bridge',
'supercategory': 'construction--structure--bridge'},
{'color': [70, 70, 70],
'id': 18,
'isthing': 0,
'name': 'Building',
'supercategory': 'construction--structure--building'},
{'color': [150, 120, 90],
'id': 19,
'isthing': 0,
'name': 'Tunnel',
'supercategory': 'construction--structure--tunnel'},
{'color': [220, 20, 60],
'id': 20,
'isthing': 1,
'name': 'Person',
'supercategory': 'human--person'},
{'color': [255, 0, 0],
'id': 21,
'isthing': 1,
'name': 'Bicyclist',
'supercategory': 'human--rider--bicyclist'},
{'color': [255, 0, 100],
'id': 22,
'isthing': 1,
'name': 'Motorcyclist',
'supercategory': 'human--rider--motorcyclist'},
{'color': [255, 0, 200],
'id': 23,
'isthing': 1,
'name': 'Other Rider',
'supercategory': 'human--rider--other-rider'},
{'color': [200, 128, 128],
'id': 24,
'isthing': 1,
'name': 'Lane Marking - Crosswalk',
'supercategory': 'marking--crosswalk-zebra'},
{'color': [255, 255, 255],
'id': 25,
'isthing': 0,
'name': 'Lane Marking - General',
'supercategory': 'marking--general'},
{'color': [64, 170, 64],
'id': 26,
'isthing': 0,
'name': 'Mountain',
'supercategory': 'nature--mountain'},
{'color': [230, 160, 50],
'id': 27,
'isthing': 0,
'name': 'Sand',
'supercategory': 'nature--sand'},
{'color': [70, 130, 180],
'id': 28,
'isthing': 0,
'name': 'Sky',
'supercategory': 'nature--sky'},
{'color': [190, 255, 255],
'id': 29,
'isthing': 0,
'name': 'Snow',
'supercategory': 'nature--snow'},
{'color': [152, 251, 152],
'id': 30,
'isthing': 0,
'name': 'Terrain',
'supercategory': 'nature--terrain'},
{'color': [107, 142, 35],
'id': 31,
'isthing': 0,
'name': 'Vegetation',
'supercategory': 'nature--vegetation'},
{'color': [0, 170, 30],
'id': 32,
'isthing': 0,
'name': 'Water',
'supercategory': 'nature--water'},
{'color': [255, 255, 128],
'id': 33,
'isthing': 1,
'name': 'Banner',
'supercategory': 'object--banner'},
{'color': [250, 0, 30],
'id': 34,
'isthing': 1,
'name': 'Bench',
'supercategory': 'object--bench'},
{'color': [100, 140, 180],
'id': 35,
'isthing': 1,
'name': 'Bike Rack',
'supercategory': 'object--bike-rack'},
{'color': [220, 220, 220],
'id': 36,
'isthing': 1,
'name': 'Billboard',
'supercategory': 'object--billboard'},
{'color': [220, 128, 128],
'id': 37,
'isthing': 1,
'name': 'Catch Basin',
'supercategory': 'object--catch-basin'},
{'color': [222, 40, 40],
'id': 38,
'isthing': 1,
'name': 'CCTV Camera',
'supercategory': 'object--cctv-camera'},
{'color': [100, 170, 30],
'id': 39,
'isthing': 1,
'name': 'Fire Hydrant',
'supercategory': 'object--fire-hydrant'},
{'color': [40, 40, 40],
'id': 40,
'isthing': 1,
'name': 'Junction Box',
'supercategory': 'object--junction-box'},
{'color': [33, 33, 33],
'id': 41,
'isthing': 1,
'name': 'Mailbox',
'supercategory': 'object--mailbox'},
{'color': [100, 128, 160],
'id': 42,
'isthing': 1,
'name': 'Manhole',
'supercategory': 'object--manhole'},
{'color': [142, 0, 0],
'id': 43,
'isthing': 1,
'name': 'Phone Booth',
'supercategory': 'object--phone-booth'},
{'color': [70, 100, 150],
'id': 44,
'isthing': 0,
'name': 'Pothole',
'supercategory': 'object--pothole'},
{'color': [210, 170, 100],
'id': 45,
'isthing': 1,
'name': 'Street Light',
'supercategory': 'object--street-light'},
{'color': [153, 153, 153],
'id': 46,
'isthing': 1,
'name': 'Pole',
'supercategory': 'object--support--pole'},
{'color': [128, 128, 128],
'id': 47,
'isthing': 1,
'name': 'Traffic Sign Frame',
'supercategory': 'object--support--traffic-sign-frame'},
{'color': [0, 0, 80],
'id': 48,
'isthing': 1,
'name': 'Utility Pole',
'supercategory': 'object--support--utility-pole'},
{'color': [250, 170, 30],
'id': 49,
'isthing': 1,
'name': 'Traffic Light',
'supercategory': 'object--traffic-light'},
{'color': [192, 192, 192],
'id': 50,
'isthing': 1,
'name': 'Traffic Sign (Back)',
'supercategory': 'object--traffic-sign--back'},
{'color': [220, 220, 0],
'id': 51,
'isthing': 1,
'name': 'Traffic Sign (Front)',
'supercategory': 'object--traffic-sign--front'},
{'color': [140, 140, 20],
'id': 52,
'isthing': 1,
'name': 'Trash Can',
'supercategory': 'object--trash-can'},
{'color': [119, 11, 32],
'id': 53,
'isthing': 1,
'name': 'Bicycle',
'supercategory': 'object--vehicle--bicycle'},
{'color': [150, 0, 255],
'id': 54,
'isthing': 1,
'name': 'Boat',
'supercategory': 'object--vehicle--boat'},
{'color': [0, 60, 100],
'id': 55,
'isthing': 1,
'name': 'Bus',
'supercategory': 'object--vehicle--bus'},
{'color': [0, 0, 142],
'id': 56,
'isthing': 1,
'name': 'Car',
'supercategory': 'object--vehicle--car'},
{'color': [0, 0, 90],
'id': 57,
'isthing': 1,
'name': 'Caravan',
'supercategory': 'object--vehicle--caravan'},
{'color': [0, 0, 230],
'id': 58,
'isthing': 1,
'name': 'Motorcycle',
'supercategory': 'object--vehicle--motorcycle'},
{'color': [0, 80, 100],
'id': 59,
'isthing': 0,
'name': 'On Rails',
'supercategory': 'object--vehicle--on-rails'},
{'color': [128, 64, 64],
'id': 60,
'isthing': 1,
'name': 'Other Vehicle',
'supercategory': 'object--vehicle--other-vehicle'},
{'color': [0, 0, 110],
'id': 61,
'isthing': 1,
'name': 'Trailer',
'supercategory': 'object--vehicle--trailer'},
{'color': [0, 0, 70],
'id': 62,
'isthing': 1,
'name': 'Truck',
'supercategory': 'object--vehicle--truck'},
{'color': [0, 0, 192],
'id': 63,
'isthing': 1,
'name': 'Wheeled Slow',
'supercategory': 'object--vehicle--wheeled-slow'},
{'color': [32, 32, 32],
'id': 64,
'isthing': 0,
'name': 'Car Mount',
'supercategory': 'void--car-mount'},
{'color': [120, 10, 10],
'id': 65,
'isthing': 0,
'name': 'Ego Vehicle',
'supercategory': 'void--ego-vehicle'}
]
def load_mapillary_vistas_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
    """Load panoptic annotations into a list of per-image records.

    Args:
        json_file (str): path to the panoptic json annotation file.
        image_dir (str): directory containing the images.
        gt_dir (str): directory containing the panoptic annotation files.
        semseg_dir (str): directory containing the semantic annotation files.
        meta (dict): must provide "thing_dataset_id_to_contiguous_id" and
            "stuff_dataset_id_to_contiguous_id", used to remap the
            "category_id" of every segment.

    Returns:
        list[dict]: one dict per annotation with the keys "file_name",
        "image_id", "pan_seg_file_name", "sem_seg_file_name" and
        "segments_info". Each segment dict is updated in place with its
        contiguous "category_id" and an "isthing" flag.

    Raises:
        AssertionError: if the json has no annotations, or if any of the
            three files of the first record does not exist.
    """

    def _to_contiguous(segment):
        # A category is a "thing" when its dataset id is in the thing mapping;
        # every other id must be present in the stuff mapping.
        dataset_id = segment["category_id"]
        thing_ids = meta["thing_dataset_id_to_contiguous_id"]
        is_thing = dataset_id in thing_ids
        if is_thing:
            segment["category_id"] = thing_ids[dataset_id]
        else:
            segment["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][dataset_id]
        segment["isthing"] = is_thing
        return segment

    with PathManager.open(json_file) as f:
        json_info = json.load(f)

    records = []
    for ann in json_info["annotations"]:
        image_id = ann["image_id"]
        annotation_name = ann["file_name"]
        # TODO: currently we assume image and label has the same filename but
        # different extension, and images have extension ".jpg" for COCO. Need
        # to make image extension a user-provided argument if we extend this
        # function to support other COCO-like datasets.
        stem = os.path.splitext(annotation_name)[0]
        record = {
            "file_name": os.path.join(image_dir, stem + ".jpg"),
            "image_id": image_id,
            "pan_seg_file_name": os.path.join(gt_dir, annotation_name),
            "sem_seg_file_name": os.path.join(semseg_dir, annotation_name),
        }
        record["segments_info"] = [_to_contiguous(seg) for seg in ann["segments_info"]]
        records.append(record)

    assert len(records), f"No images found in {image_dir}!"
    # Sanity-check the paths of the first record only.
    first = records[0]
    for key in ("file_name", "pan_seg_file_name", "sem_seg_file_name"):
        assert PathManager.isfile(first[key]), first[key]
    return records
def register_mapillary_vistas_panoptic(
    name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None
):
    """
    Register a Mapillary Vistas panoptic segmentation dataset named `name`.

    The dataset dicts are produced lazily by
    `load_mapillary_vistas_panoptic_json` when the catalog entry is queried.

    Args:
        name (str): the name that identifies the dataset,
            e.g. "mapillary_vistas_panoptic_train".
        metadata (dict): extra metadata associated with this dataset. It is
            forwarded to the loader and also stored in the metadata catalog.
        image_root (str): directory which contains all the images.
        panoptic_root (str): directory which contains the panoptic annotation images.
        semantic_root (str): directory which contains the semantic label images.
        panoptic_json (str): path to the json panoptic annotation file.
        instances_json (str): path to the json instance annotation file; stored
            in the metadata catalog as `json_file`.
    """

    def _load_dataset_dicts():
        return load_mapillary_vistas_panoptic_json(
            panoptic_json, image_root, panoptic_root, semantic_root, metadata
        )

    DatasetCatalog.register(name, _load_dataset_dicts)

    fixed_fields = dict(
        panoptic_root=panoptic_root,
        image_root=image_root,
        panoptic_json=panoptic_json,
        json_file=instances_json,
        evaluator_type="mapillary_vistas_panoptic_seg",
        # Unlike other datasets, Mapillary Vistas uses 65 as its ignore label.
        ignore_label=65,
        label_divisor=1000,
    )
    MetadataCatalog.get(name).set(**fixed_fields, **metadata)
# Maps each registered dataset name to a tuple of paths relative to the dataset
# root: (image dir, panoptic dir, panoptic json, semantic label dir).
# NOTE: despite the "ADE20K" in the constant's name, the entries describe the
# Mapillary Vistas splits.
_PREDEFINED_SPLITS_ADE20K_PANOPTIC = {
    f"mapillary_vistas_panoptic_{short_name}": (
        f"mapillary_vistas/{split_dir}/images",
        f"mapillary_vistas/{split_dir}/panoptic",
        f"mapillary_vistas/{split_dir}/panoptic/panoptic_2018.json",
        f"mapillary_vistas/{split_dir}/labels",
    )
    for short_name, split_dir in (("train", "training"), ("val", "validation"))
}
def get_metadata():
    """
    Build the metadata dict shared by all Mapillary Vistas panoptic splits.

    Returns:
        dict: with the keys "thing_classes", "thing_colors", "stuff_classes",
        "stuff_colors", "thing_dataset_id_to_contiguous_id" and
        "stuff_dataset_id_to_contiguous_id".
    """
    categories = MAPILLARY_VISTAS_SEM_SEG_CATEGORIES

    # Names and colors are indexed by contiguous id in
    # [0, #thing categories + #stuff categories). The same lists are replicated
    # under "thing_*" and "stuff_*" because the D2 visualization code treats
    # thing and stuff classes differently; keeping both lets the existing
    # visualization functions be reused.
    meta = {
        "thing_classes": [cat["name"] for cat in categories],
        "thing_colors": [cat["color"] for cat in categories],
        "stuff_classes": [cat["name"] for cat in categories],
        "stuff_colors": [cat["color"] for cat in categories],
    }

    # Dataset category ids are not necessarily contiguous, so two id spaces exist:
    #   - original category id: the id used in the dataset, mainly for evaluation.
    #   - contiguous category id: the position in the category list, in
    #     [0, #classes), used to train the classifier.
    # Only "isthing" categories enter the thing mapping, while EVERY category
    # enters the stuff mapping so that the sem_seg evaluator can be used.
    meta["thing_dataset_id_to_contiguous_id"] = {
        cat["id"]: contiguous_id
        for contiguous_id, cat in enumerate(categories)
        if cat["isthing"]
    }
    meta["stuff_dataset_id_to_contiguous_id"] = {
        cat["id"]: contiguous_id for contiguous_id, cat in enumerate(categories)
    }
    return meta
def register_all_mapillary_vistas_panoptic(root):
    """
    Register every predefined Mapillary Vistas panoptic split.

    Args:
        root (str): dataset root directory; the relative paths of each
            predefined split are joined onto it.
    """
    metadata = get_metadata()
    for split_name, split_paths in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items():
        # The tuple order differs from the parameter order of the registration
        # function, so the paths are passed by keyword.
        image_root, panoptic_root, panoptic_json, semantic_root = split_paths
        register_mapillary_vistas_panoptic(
            split_name,
            metadata,
            image_root=os.path.join(root, image_root),
            panoptic_root=os.path.join(root, panoptic_root),
            semantic_root=os.path.join(root, semantic_root),
            panoptic_json=os.path.join(root, panoptic_json),
        )
# Register all predefined splits when this module is imported. The dataset root
# is read from the DETECTRON2_DATASETS environment variable and falls back to
# the relative directory "datasets".
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_mapillary_vistas_panoptic(_root)
================================================
FILE: mask2former/evaluation/__init__.py
================================================
================================================
FILE: mask2former/evaluation/__init__.py.new
================================================
================================================
FILE: mask2former/evaluation/instance_evaluation.py
================================================
import contextlib
import copy
import io
import itertools
import json
import logging
import numpy as np
import os
import pickle
from collections import OrderedDict
import pycocotools.mask as mask_util
import torch
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from tabulate import tabulate
import detectron2.utils.comm as comm
from detectron2.config import CfgNode
from detectron2.data import MetadataCatalog
from detectron2.data.datasets.coco import convert_to_coco_json
from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco
from detectron2.evaluation.fast_eval_api import COCOeval_opt
from detectron2.structures import Boxes, BoxMode, pairwise_iou
from detectron2.utils.file_io import PathManager
from detectron2.utils.logger import create_small_table
# Modified from COCOEvaluator for instance segmentation.
class InstanceSegEvaluator(COCOEvaluator):
    """
    Evaluate AR for object proposals, AP for instance detection/segmentation, AP
    for keypoint detection outputs using COCO's metrics.
    See http://cocodataset.org/#detection-eval and
    http://cocodataset.org/#keypoints-eval to understand its metrics.
    The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
    the metric cannot be computed (e.g. due to no predictions made).

    In addition to COCO, this evaluator is able to support any bounding box detection,
    instance segmentation, or keypoint detection dataset.

    When mapping predicted category ids back to dataset ids, this class only
    requires each predicted id to be present in the metadata mapping; it does
    not assert that the contiguous ids form the range [0, #classes).
    """

    def _eval_predictions(self, predictions, img_ids=None):
        """
        Evaluate predictions. Fill self._results with the metrics of the tasks.

        Args:
            predictions (list[dict]): each dict provides an "instances" list of
                COCO-style result dicts.
            img_ids: forwarded unchanged to `_evaluate_predictions_on_coco`.
        """
        self._logger.info("Preparing results for COCO format ...")
        per_image_results = [x["instances"] for x in predictions]
        coco_results = list(itertools.chain(*per_image_results))
        tasks = self._tasks or self._tasks_from_predictions(coco_results)

        # Map the contiguous training ids back to the dataset category ids.
        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
            dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
            reverse_id_mapping = {
                contiguous_id: dataset_id
                for dataset_id, contiguous_id in dataset_id_to_contiguous_id.items()
            }
            for result in coco_results:
                category_id = result["category_id"]
                assert category_id in reverse_id_mapping, (
                    f"A prediction has class={category_id}, "
                    f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
                )
                result["category_id"] = reverse_id_mapping[category_id]

        if self._output_dir:
            file_path = os.path.join(self._output_dir, "coco_instances_results.json")
            self._logger.info("Saving results to {}".format(file_path))
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(coco_results))
                f.flush()

        if not self._do_evaluation:
            self._logger.info("Annotations are not available for evaluation.")
            return

        self._logger.info(
            "Evaluating predictions with {} COCO API...".format(
                "unofficial" if self._use_fast_impl else "official"
            )
        )
        for task in sorted(tasks):
            assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
            if len(coco_results) > 0:
                coco_eval = _evaluate_predictions_on_coco(
                    self._coco_api,
                    coco_results,
                    task,
                    kpt_oks_sigmas=self._kpt_oks_sigmas,
                    use_fast_impl=self._use_fast_impl,
                    img_ids=img_ids,
                    max_dets_per_image=self._max_dets_per_image,
                )
            else:
                # cocoapi does not handle empty results very well
                coco_eval = None

            self._results[task] = self._derive_coco_results(
                coco_eval, task, class_names=self._metadata.get("thing_classes")
            )
================================================
FILE: mask2former/maskformer_model.py
================================================
from typing import Tuple
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
from detectron2.modeling.backbone import Backbone
from detectron2.modeling.postprocessing import sem_seg_postprocess
from detectron2.structures import Boxes, ImageList, Instances, BitMasks
from detectron2.utils.memory import retry_if_cuda_oom
from .modeling.criterion import SetCriterion
from .modeling.matcher import HungarianMatcher
from skimage import color
import cv2
import numpy as np
def unfold_wo_center(x, kernel_size, dilation):
    """
    Gather, for every pixel, the values of its dilated neighbourhood without
    the pixel itself.

    Args:
        x (Tensor): 4-dimensional input of shape (N, C, H, W).
        kernel_size (int): odd side length of the neighbourhood window.
        dilation (int): spacing between the sampled neighbours.

    Returns:
        Tensor: shape (N, C, kernel_size ** 2 - 1, H, W). Neighbours falling
        outside the input are zero (padding added by the unfold).
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    batch, channels, height, width = x.size(0), x.size(1), x.size(2), x.size(3)

    # SAME padding: half of the dilated window extent.
    same_padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    patches = F.unfold(x, kernel_size=kernel_size, padding=same_padding, dilation=dilation)
    patches = patches.reshape(batch, channels, -1, height, width)

    # The window centre sits in the middle of the flattened window; drop it.
    center = (kernel_size ** 2) // 2
    before_center = patches[:, :, :center]
    after_center = patches[:, :, center + 1:]
    return torch.cat((before_center, after_center), dim=2)
def get_images_color_similarity(images, kernel_size, dilation):
    """
    Compute the similarity between every pixel and each of its neighbours.

    The similarity is exp(-0.5 * d), where d is the norm over the channel
    dimension of the difference between a pixel and a neighbour.

    Args:
        images (Tensor): 4-dimensional input with a batch size of exactly 1.
        kernel_size (int): neighbourhood window size, forwarded to
            `unfold_wo_center`.
        dilation (int): neighbourhood dilation, forwarded to `unfold_wo_center`.

    Returns:
        Tensor: the channel dimension is reduced away, leaving one similarity
        value per (neighbour, pixel) pair.
    """
    assert images.dim() == 4
    assert images.size(0) == 1

    neighbours = unfold_wo_center(images, kernel_size=kernel_size, dilation=dilation)
    # Insert a neighbour axis so each pixel broadcasts against all its neighbours.
    color_distance = torch.norm(images.unsqueeze(2) - neighbours, dim=1)
    return torch.exp(color_distance * -0.5)
@META_ARCH_REGISTRY.register()
class MaskFormer(nn.Module):
    """
    Main class for mask classification semantic segmentation architectures.

    In training mode, `forward` additionally computes per-image LAB color
    similarity maps from the raw input images and hands them to the criterion
    as a third argument.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        sem_seg_head: nn.Module,
        criterion: nn.Module,
        num_queries: int,
        object_mask_threshold: float,
        overlap_threshold: float,
        metadata,
        size_divisibility: int,
        sem_seg_postprocess_before_inference: bool,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
        # inference
        semantic_on: bool,
        panoptic_on: bool,
        instance_on: bool,
        test_topk_per_image: int,
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            sem_seg_head: a module that predicts semantic segmentation from backbone features
            criterion: a module that defines the loss
            num_queries: int, number of queries
            object_mask_threshold: float, threshold to filter query based on classification score
                for panoptic segmentation inference
            overlap_threshold: overlap threshold used in general inference for panoptic segmentation
            metadata: dataset meta, get `thing` and `stuff` category names for panoptic
                segmentation inference
            size_divisibility: Some backbones require the input height and width to be divisible by a
                specific integer. We can use this to override such requirement.
                A negative value selects the backbone's own size_divisibility.
            sem_seg_postprocess_before_inference: whether to resize the prediction back
                to original input size before semantic segmentation inference or after.
                For high-resolution dataset like Mapillary, resizing predictions before
                inference will cause OOM error.
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
            semantic_on: bool, whether to output semantic segmentation prediction
            instance_on: bool, whether to output instance segmentation prediction
            panoptic_on: bool, whether to output panoptic segmentation prediction
            test_topk_per_image: int, instance segmentation parameter, keep topk instances per image
        """
        super().__init__()
        self.backbone = backbone
        self.sem_seg_head = sem_seg_head
        self.criterion = criterion

        self.num_queries = num_queries
        self.overlap_threshold = overlap_threshold
        self.object_mask_threshold = object_mask_threshold
        self.metadata = metadata

        # A negative value means "use the backbone's size_divisibility".
        self.size_divisibility = (
            self.backbone.size_divisibility if size_divisibility < 0 else size_divisibility
        )
        self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference

        # Normalization constants, registered as non-persistent buffers shaped
        # (C, 1, 1) so that they broadcast over (C, H, W) images.
        for buffer_name, values in (("pixel_mean", pixel_mean), ("pixel_std", pixel_std)):
            self.register_buffer(buffer_name, torch.Tensor(values).view(-1, 1, 1), False)

        # additional args
        self.semantic_on = semantic_on
        self.instance_on = instance_on
        self.panoptic_on = panoptic_on
        self.test_topk_per_image = test_topk_per_image

        # Without semantic inference, predictions must be resized beforehand.
        assert self.semantic_on or self.sem_seg_postprocess_before_inference

    @classmethod
    def from_config(cls, cfg):
        """Translate a config node into the keyword arguments of `__init__`."""
        backbone = build_backbone(cfg)
        sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())

        mask_former_cfg = cfg.MODEL.MASK_FORMER
        test_cfg = mask_former_cfg.TEST

        # loss weights
        class_weight = mask_former_cfg.CLASS_WEIGHT
        dice_weight = mask_former_cfg.DICE_WEIGHT
        mask_weight = mask_former_cfg.MASK_WEIGHT

        # building criterion
        matcher = HungarianMatcher(
            cost_class=class_weight,
            cost_mask=mask_weight,
            cost_dice=dice_weight,
            num_points=mask_former_cfg.TRAIN_NUM_POINTS,
        )

        weight_dict = {
            "loss_ce": class_weight,
            "loss_mask": mask_weight,
            "loss_dice": dice_weight,
            "loss_bound": mask_weight,
        }
        if mask_former_cfg.DEEP_SUPERVISION:
            # Replicate the base weights for every auxiliary decoder layer,
            # suffixing each key with the layer index.
            base_items = list(weight_dict.items())
            for i in range(mask_former_cfg.DEC_LAYERS - 1):
                weight_dict.update({k + f"_{i}": v for k, v in base_items})

        criterion = SetCriterion(
            sem_seg_head.num_classes,
            matcher=matcher,
            weight_dict=weight_dict,
            eos_coef=mask_former_cfg.NO_OBJECT_WEIGHT,
            losses=["labels", "masks"],
            num_points=mask_former_cfg.TRAIN_NUM_POINTS,
            oversample_ratio=mask_former_cfg.OVERSAMPLE_RATIO,
            importance_sample_ratio=mask_former_cfg.IMPORTANCE_SAMPLE_RATIO,
        )

        return {
            "backbone": backbone,
            "sem_seg_head": sem_seg_head,
            "criterion": criterion,
            "num_queries": mask_former_cfg.NUM_OBJECT_QUERIES,
            "object_mask_threshold": test_cfg.OBJECT_MASK_THRESHOLD,
            "overlap_threshold": test_cfg.OVERLAP_THRESHOLD,
            "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
            "size_divisibility": mask_former_cfg.SIZE_DIVISIBILITY,
            "sem_seg_postprocess_before_inference": (
                test_cfg.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE
                or test_cfg.PANOPTIC_ON
                or test_cfg.INSTANCE_ON
            ),
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
            # inference
            "semantic_on": test_cfg.SEMANTIC_ON,
            "instance_on": test_cfg.INSTANCE_ON,
            "panoptic_on": test_cfg.PANOPTIC_ON,
            "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
        }

    @property
    def device(self):
        """Device of the model, taken from the `pixel_mean` buffer."""
        return self.pixel_mean.device

    def _lab_color_similarity(self, images, batched_inputs):
        """
        Compute one LAB color-similarity map per input image (training only).

        Args:
            images (list[Tensor]): raw (un-normalized) images already on the
                model device.
            batched_inputs (list[dict]): the inputs of `forward`; every item
                must provide a boolean "padding_mask".

        Returns:
            list[Tensor]: for each image, the output of
            `get_images_color_similarity` (kernel size 3, dilation 2) on the
            4x average-pooled image converted to LAB, multiplied by a 0/1 gate.
        """
        padded_images = ImageList.from_tensors(images, self.size_divisibility)

        # Per-image 0/1 gate: 1 only when both the padding mask and its
        # complement cover more than 25% of the mask area.
        # NOTE(review): "padding_mask" presumably marks padded pixels with
        # True — confirm against the dataset mapper.
        gates = []
        for item in batched_inputs:
            padding_mask = item["padding_mask"].to(self.device)
            inverted_mask = ~padding_mask
            num_pixels = inverted_mask.shape[0] * inverted_mask.shape[1]
            inverted_ok = ((inverted_mask.sum() / num_pixels) > 0.25).float()
            padding_ok = ((padding_mask.sum() / num_pixels) > 0.25).float()
            gates.append(inverted_ok * padding_ok)

        downsampled_images = F.avg_pool2d(
            padded_images.tensor.float(), kernel_size=4, stride=4, padding=0
        )

        similarities = []
        for ds_image, gate in zip(downsampled_images, gates):
            # NOTE(review): the [2, 1, 0] channel flip presumably converts BGR
            # input to the RGB order expected by rgb2lab — confirm.
            rgb_hwc = ds_image[[2, 1, 0]].byte().permute(1, 2, 0).cpu().numpy()
            lab_chw = torch.as_tensor(
                color.rgb2lab(rgb_hwc), device=ds_image.device, dtype=torch.float32
            ).permute(2, 0, 1)
            similarity = get_images_color_similarity(lab_chw.unsqueeze(0), 3, 2)
            similarities.append(similarity * float(gate))
        return similarities

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:
                   * "image": Tensor, image in (C, H, W) format.
                   * "instances": per-region ground truth
                   * "padding_mask": boolean mask, read in training mode only
                   * Other information that's included in the original dicts, such as:
                     "height", "width" (int): the output resolution of the model (may be different
                     from input resolution), used in inference.
        Returns:
            In training mode, a dict of weighted losses. Otherwise:
            list[dict]:
                each dict has the results for one image. The dict contains the following keys:

                * "sem_seg":
                    A Tensor that represents the
                    per-pixel segmentation predicted by the head.
                    The prediction has shape KxHxW that represents the logits of
                    each class for each pixel.
                * "panoptic_seg":
                    A tuple that represent panoptic output
                    panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
                    segments_info (list[dict]): Describe each segment in `panoptic_seg`.
                        Each dict contains keys "id", "category_id", "isthing".
                * "instances":
                    The output of `instance_inference`.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]

        if self.training:
            # Computed from the raw images, i.e. before normalization.
            images_lab_sim = self._lab_color_similarity(images, batched_inputs)

        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, self.size_divisibility)

        features = self.backbone(images.tensor)
        outputs = self.sem_seg_head(features)

        if self.training:
            # mask classification target
            if "instances" in batched_inputs[0]:
                gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
                targets = self.prepare_targets(gt_instances, images)
            else:
                targets = None

            # bipartite matching-based loss
            losses = self.criterion(outputs, targets, images_lab_sim)

            weight_dict = self.criterion.weight_dict
            for loss_name in list(losses.keys()):
                if loss_name not in weight_dict:
                    # remove this loss if not specified in `weight_dict`
                    losses.pop(loss_name)
                    continue
                losses[loss_name] *= weight_dict[loss_name]
            return losses

        mask_cls_results = outputs["pred_logits"]
        # upsample masks to the padded input resolution
        mask_pred_results = F.interpolate(
            outputs["pred_masks"],
            size=(images.tensor.shape[-2], images.tensor.shape[-1]),
            mode="bilinear",
            align_corners=False,
        )
        del outputs

        processed_results = []
        per_image_inputs = zip(
            mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
        )
        for mask_cls_result, mask_pred_result, input_per_image, image_size in per_image_inputs:
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            result = {}
            processed_results.append(result)

            if self.sem_seg_postprocess_before_inference:
                mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
                    mask_pred_result, image_size, height, width
                )
                mask_cls_result = mask_cls_result.to(mask_pred_result)

            # semantic segmentation inference
            if self.semantic_on:
                sem_seg = retry_if_cuda_oom(self.semantic_inference)(
                    mask_cls_result, mask_pred_result
                )
                if not self.sem_seg_postprocess_before_inference:
                    sem_seg = retry_if_cuda_oom(sem_seg_postprocess)(
                        sem_seg, image_size, height, width
                    )
                result["sem_seg"] = sem_seg

            # panoptic segmentation inference
            if self.panoptic_on:
                result["panoptic_seg"] = retry_if_cuda_oom(self.panoptic_inference)(
                    mask_cls_result, mask_pred_result
                )

            # instance segmentation inference
            if self.instance_on:
                result["instances"] = retry_if_cuda_oom(self.instance_inference)(
                    mask_cls_result, mask_pred_result
                )

        return processed_results

    def prepare_targets(self, targets, images):
        """
        Zero-pad the ground-truth masks to the padded batch resolution.

        Args:
            targets: iterable of per-image ground truth with `gt_masks` and
                `gt_classes` attributes.
            images: the padded image batch; the last two dimensions of its
                `.tensor` give the padded size.

        Returns:
            list[dict]: one dict per image with the keys "labels" and "masks".
        """
        h_pad, w_pad = images.tensor.shape[-2:]
        new_targets = []
        for targets_per_image in targets:
            gt_masks = targets_per_image.gt_masks
            num_masks, mask_h, mask_w = gt_masks.shape[0], gt_masks.shape[1], gt_masks.shape[2]
            # The masks occupy the top-left corner; the remainder stays zero.
            padded_masks = torch.zeros(
                (num_masks, h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device
            )
            padded_masks[:, :mask_h, :mask_w] = gt_masks
            new_targets.append(
                {
                    "labels": targets_per_image.gt_classes,
                    "masks": padded_masks,
                }
            )
        return new_targets

    def semantic_inference(self, mask_cls, mask_pred):
        """
        Combine per-query class probabilities and mask probabilities.

        The last class column of `mask_cls` is dropped after the softmax, and
        the result sums, over the queries, each class probability times the
        sigmoid mask.
        """
        class_probs = F.softmax(mask_cls, dim=-1)[..., :-1]
        mask_probs = mask_pred.sigmoid()
        return torch.einsum("qc,qhw->chw", class_probs, mask_probs)

    def panoptic_inference(self, mask_cls, mask_pred):
        """
        Build a panoptic segmentation from per-query class logits and mask logits.

        Returns:
            tuple: `panoptic_seg`, an int32 (H, W) tensor of segment ids with 0
            where no segment was assigned, and `segments_info`, a list of dicts
            with the keys "id", "isthing" and "category_id".
        """
        scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
        mask_pred = mask_pred.sigmoid()

        # Drop queries predicting the extra last class and low-score queries.
        keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold)
        cur_scores = scores[keep]
        cur_classes = labels[keep]
        cur_masks = mask_pred[keep]

        cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks

        h, w = cur_masks.shape[-2:]
        panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
        segments_info = []

        if cur_masks.shape[0] == 0:
            # We didn't detect any mask :(
            return panoptic_seg, segments_info

        # Every pixel goes to the query with the highest score-weighted probability.
        cur_mask_ids = cur_prob_masks.argmax(0)
        thing_ids = self.metadata.thing_dataset_id_to_contiguous_id.values()
        stuff_memory_list = {}
        current_segment_id = 0

        for k in range(cur_classes.shape[0]):
            pred_class = cur_classes[k].item()
            isthing = pred_class in thing_ids

            assigned = cur_mask_ids == k
            confident = cur_masks[k] >= 0.5
            mask_area = assigned.sum().item()
            original_area = confident.sum().item()
            mask = assigned & confident

            if mask_area <= 0 or original_area <= 0 or mask.sum().item() <= 0:
                continue
            if mask_area / original_area < self.overlap_threshold:
                continue

            class_id = int(pred_class)
            if not isthing:
                # merge stuff regions: all segments of one stuff class share an id
                if class_id in stuff_memory_list:
                    panoptic_seg[mask] = stuff_memory_list[class_id]
                    continue
                stuff_memory_list[class_id] = current_segment_id + 1

            current_segment_id += 1
            panoptic_seg[mask] = current_segment_id
            segments_info.append(
                {
                    "id": current_segment_id,
                    "isthing": bool(isthing),
                    "category_id": class_id,
                }
            )

        return panoptic_seg, segments_info

    def instance_inference(self, mask_cls, mask_pred):
        """
        Select the top-scoring (query, class) pairs as instance predictions.

        Returns:
            Instances: with the fields `pred_masks`, `pred_boxes`, `scores` and
            `pred_classes`. Each score is the class probability multiplied by
            the average mask probability inside the binarized mask.
        """
        # mask_pred is already processed to have the same shape as original input
        image_size = mask_pred.shape[-2:]
        num_classes = self.sem_seg_head.num_classes

        # [Q, K]
        scores = F.softmax(mask_cls, dim=-1)[:, :-1]
        class_ids = torch.arange(num_classes, device=self.device)
        labels = class_ids.unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1)

        scores_per_image, topk_indices = scores.flatten(0, 1).topk(
            self.test_topk_per_image, sorted=False
        )
        labels_per_image = labels[topk_indices]
        # Convert flattened (query, class) indices back to query indices.
        query_indices = topk_indices // num_classes
        mask_pred = mask_pred[query_indices]

        # if this is panoptic segmentation, we only keep the "thing" classes
        if self.panoptic_on:
            keep = torch.zeros_like(scores_per_image).bool()
            for i, lab in enumerate(labels_per_image):
                keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values()

            scores_per_image = scores_per_image[keep]
            labels_per_image = labels_per_image[keep]
            mask_pred = mask_pred[keep]

        result = Instances(image_size)
        # Binarize the mask logits (before sigmoid) at 0.
        binary_masks = mask_pred > 0
        result.pred_masks = binary_masks.float()
        # Boxes are derived from the binary masks (this is slow).
        result.pred_boxes = BitMasks(binary_masks).get_bounding_boxes()

        # calculate average mask prob inside each binarized mask
        foreground = result.pred_masks.flatten(1)
        mask_probs = mask_pred.sigmoid().flatten(1)
        mask_scores_per_image = (mask_probs * foreground).sum(1) / (foreground.sum(1) + 1e-6)

        result.scores = scores_per_image * mask_scores_per_image
        result.pred_classes = labels_per_image
        return result
================================================
FILE: mask2former/modeling/__init__.py
================================================
from .backbone.swin import D2SwinTransformer
from .pixel_decoder.fpn import BasePixelDecoder
from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder
from .meta_arch.mask_former_head import MaskFormerHead
from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead
================================================
FILE: mask2former/modeling/backbone/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mask2former/modeling/backbone/__init__.py.new
================================================
================================================
FILE: mask2former/modeling/backbone/swin.py
================================================
# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu, Yutong Lin, Yixuan Wei
# --------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/mmseg/models/backbones/swin_transformer.py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
class Mlp(nn.Module):
    """Multilayer perceptron: linear -> activation -> dropout -> linear -> dropout.

    Args:
        in_features (int): size of the input features.
        hidden_features (int, optional): size of the hidden layer; falls back to
            `in_features` when not given.
        out_features (int, optional): size of the output features; falls back to
            `in_features` when not given.
        act_layer: callable building the activation module. Default: nn.GELU
        drop (float): dropout probability used after both linear layers.
            Default: 0.0
    """

    def __init__(
        self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0
    ):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        # A single dropout module is shared by both stages.
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
def window_partition(x, window_size):
    """
    Split a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    batch, height, width, channels = x.shape
    rows = height // window_size
    cols = width // window_size

    # (B, rows, ws, cols, ws, C) -> (B, rows, cols, ws, ws, C)
    grid = x.view(batch, rows, window_size, cols, window_size, channels)
    grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
    return grid.view(-1, window_size, window_size, channels)
def window_reverse(windows, window_size, H, W):
    """
    Reassemble square windows into a feature map.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    # The batch size is recovered from the total number of windows.
    windows_per_image = H * W / window_size / window_size
    B = int(windows.shape[0] / windows_per_image)

    rows = H // window_size
    cols = W // window_size
    # (B, rows, cols, ws, ws, C) -> (B, rows, ws, cols, ws, C)
    grid = windows.view(B, rows, cols, window_size, window_size, -1)
    grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
    return grid.view(B, H, W, -1)
class WindowAttention(nn.Module):
    """Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        self.scale = qk_scale or (dim // num_heads) ** -0.5

        win_h, win_w = window_size[0], window_size[1]

        # Learnable bias table with one row per possible relative offset:
        # (2*Wh-1) * (2*Ww-1), nH
        num_relative_positions = (2 * win_h - 1) * (2 * win_w - 1)
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(num_relative_positions, num_heads)
        )

        # For every pair of tokens inside a window, precompute the row of the
        # bias table that corresponds to their relative offset.
        grid = torch.stack(
            torch.meshgrid([torch.arange(win_h), torch.arange(win_w)])
        )  # 2, Wh, Ww
        grid_flat = torch.flatten(grid, 1)  # 2, Wh*Ww
        offsets = grid_flat[:, :, None] - grid_flat[:, None, :]  # 2, Wh*Ww, Wh*Ww
        offsets = offsets.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        offsets[:, :, 0] += win_h - 1  # shift to start from 0
        offsets[:, :, 1] += win_w - 1
        offsets[:, :, 0] *= 2 * win_w - 1  # row-major flattening of (dh, dw)
        self.register_buffer("relative_position_index", offsets.sum(-1))  # Wh*Ww, Wh*Ww

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=0.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """Forward function.

        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        heads = self.num_heads

        # (3, B_, nH, N, C // nH)
        qkv = self.qkv(x).reshape(B_, N, 3, heads, C // heads).permute(2, 0, 3, 1, 4)
        # make torchscript happy (cannot use tensor as tuple)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q * self.scale) @ k.transpose(-2, -1)

        tokens = self.window_size[0] * self.window_size[1]
        bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        bias = bias.view(tokens, tokens, -1)  # Wh*Ww, Wh*Ww, nH
        bias = bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + bias.unsqueeze(0)

        if mask is not None:
            # Add the per-window mask, broadcast over the batch and the heads.
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, heads, N, N)

        attn = self.attn_drop(self.softmax(attn))

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        return self.proj_drop(self.proj(x))
class SwinTransformerBlock(nn.Module):
    """Single Swin block: (shifted-)window attention followed by an MLP.

    Both sub-layers are pre-normalised and wrapped in residual connections
    with stochastic depth.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA; 0 disables the cyclic shift.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Forwarded to the attention module to override its default scale.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(
        self,
        dim,
        num_heads,
        window_size=7,
        shift_size=0,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        # attention sub-layer
        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        # stochastic depth is only instantiated when it can actually drop something
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        # feed-forward sub-layer
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )

        # spatial size of the token grid; must be assigned before forward() is called
        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C), where H and W are read
                from ``self.H`` / ``self.W``.
            mask_matrix: Attention mask for cyclic shift; only used when
                ``shift_size > 0``.
        """
        batch, seq_len, channels = x.shape
        height, width = self.H, self.W
        assert seq_len == height * width, "input feature has wrong size"

        win = self.window_size
        use_shift = self.shift_size > 0
        residual = x

        feat = self.norm1(x).view(batch, height, width, channels)

        # pad the bottom/right edges so both sides are multiples of the window size
        pad_right = (win - width % win) % win
        pad_bottom = (win - height % win) % win
        feat = F.pad(feat, (0, 0, 0, pad_right, 0, pad_bottom))
        padded_h, padded_w = feat.shape[1], feat.shape[2]

        # cyclic shift (SW-MSA only)
        if use_shift:
            feat = torch.roll(feat, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        attn_mask = mask_matrix if use_shift else None

        # partition into windows: nW*B, window_size*window_size, C
        windows = window_partition(feat, win).view(-1, win * win, channels)

        # W-MSA/SW-MSA
        windows = self.attn(windows, mask=attn_mask)

        # merge windows back into a B H' W' C map
        windows = windows.view(-1, win, win, channels)
        feat = window_reverse(windows, win, padded_h, padded_w)

        # undo the cyclic shift
        if use_shift:
            feat = torch.roll(feat, shifts=(self.shift_size, self.shift_size), dims=(1, 2))

        # strip the padding added above
        if pad_right > 0 or pad_bottom > 0:
            feat = feat[:, :height, :width, :].contiguous()
        feat = feat.view(batch, height * width, channels)

        # residual connections around attention and FFN
        out = residual + self.drop_path(feat)
        return out + self.drop_path(self.mlp(self.norm2(out)))
class PatchMerging(nn.Module):
    """Patch Merging Layer.

    Concatenates every 2x2 neighbourhood of tokens along the channel axis
    (4*C), normalises it, and projects it to 2*C with a bias-free linear layer.

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.

        Returns:
            Tensor of size (B, ceil(H/2)*ceil(W/2), 2*C).
        """
        batch, seq_len, channels = x.shape
        assert seq_len == H * W, "input feature has wrong size"

        grid = x.view(batch, H, W, channels)

        # zero-pad one row/column when a side is odd so it splits into 2x2 cells
        if H % 2 == 1 or W % 2 == 1:
            grid = F.pad(grid, (0, 0, 0, W % 2, 0, H % 2))

        # Channel order is (row 0, col 0), (row 1, col 0), (row 0, col 1),
        # (row 1, col 1); the reduction weights depend on this order.
        corners = [grid[:, row::2, col::2, :] for col in (0, 1) for row in (0, 1)]
        merged = torch.cat(corners, -1)  # B H/2 W/2 4*C
        merged = merged.view(batch, -1, 4 * channels)  # B H/2*W/2 4*C

        return self.reduction(self.norm(merged))
class BasicLayer(nn.Module):
    """A basic Swin Transformer layer for one stage.

    The stage alternates regular and shifted window blocks (even-indexed
    blocks use no shift, odd-indexed blocks shift by ``window_size // 2``)
    and optionally ends with a downsampling layer.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        num_heads (int): Number of attention head.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Forwarded to each block to override the default qk scale.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate.
            A sequence is indexed per block; a scalar is shared by all blocks. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(
        self,
        dim,
        depth,
        num_heads,
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        norm_layer=nn.LayerNorm,
        downsample=None,
        use_checkpoint=False,
    ):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # Accept tuples as well as lists for a per-block schedule; previously a
        # tuple was forwarded whole to the block and failed on `drop_path > 0.0`.
        per_block_drop_path = isinstance(drop_path, (list, tuple))

        # build blocks
        self.blocks = nn.ModuleList(
            [
                SwinTransformerBlock(
                    dim=dim,
                    num_heads=num_heads,
                    window_size=window_size,
                    shift_size=0 if (i % 2 == 0) else window_size // 2,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop,
                    attn_drop=attn_drop,
                    drop_path=drop_path[i] if per_block_drop_path else drop_path,
                    norm_layer=norm_layer,
                )
                for i in range(depth)
            ]
        )

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def _build_attn_mask(self, H, W, device):
        """Build the additive SW-MSA mask for an (H, W) token grid."""
        # The mask is defined on the grid padded up to a multiple of the window size.
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = torch.zeros((1, Hp, Wp, 1), device=device)  # 1 Hp Wp 1

        # Three bands per axis; each of the 9 regions gets its own id.
        bands = (
            slice(0, -self.window_size),
            slice(-self.window_size, -self.shift_size),
            slice(-self.shift_size, None),
        )
        cnt = 0
        for h in bands:
            for w in bands:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(
            img_mask, self.window_size
        )  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)

        # Token pairs with different region ids get -100, same-region pairs get 0.
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
            attn_mask == 0, float(0.0)
        )
        return attn_mask

    def forward(self, x, H, W):
        """Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.

        Returns:
            Tuple ``(x, H, W, x_next, H_next, W_next)``: the stage output with its
            resolution, followed by the downsampled feature with its resolution
            (identical to the first three entries when there is no downsample layer).
        """
        # calculate attention mask for SW-MSA
        attn_mask = self._build_attn_mask(H, W, x.device)

        for blk in self.blocks:
            # blocks read the spatial size from attributes rather than arguments
            blk.H, blk.W = H, W
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, attn_mask)
            else:
                x = blk(x, attn_mask)

        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        return x, H, W, x, H, W
class PatchEmbed(nn.Module):
    """Image to Patch Embedding.

    Projects non-overlapping patches with a strided convolution and optionally
    normalises the resulting tokens.

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)
        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def forward(self, x):
        """Embed a (B, C, H, W) image batch into a (B, embed_dim, Wh, Ww) map."""
        _, _, height, width = x.size()
        patch_h, patch_w = self.patch_size[0], self.patch_size[1]

        # pad right, then bottom, so both sides divide evenly into patches
        leftover_w = width % patch_w
        if leftover_w != 0:
            x = F.pad(x, (0, patch_w - leftover_w))
        leftover_h = height % patch_h
        if leftover_h != 0:
            x = F.pad(x, (0, 0, 0, patch_h - leftover_h))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is None:
            return x

        # the norm acts on the channel axis, so go through a (B, Wh*Ww, C) layout
        out_h, out_w = x.size(2), x.size(3)
        tokens = self.norm(x.flatten(2).transpose(1, 2))
        return tokens.transpose(1, 2).view(-1, self.embed_dim, out_h, out_w)
class SwinTransformer(nn.Module):
    """Swin Transformer backbone.

    A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
    https://arxiv.org/pdf/2103.14030

    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (Sequence[int]): Depths of each Swin Transformer stage.
        num_heads (Sequence[int]): Number of attention head of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Forwarded to every stage to override the default qk scale.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(
        self,
        pretrain_img_size=224,
        patch_size=4,
        in_chans=3,
        embed_dim=96,
        depths=(2, 2, 6, 2),
        num_heads=(3, 6, 12, 24),
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.2,
        norm_layer=nn.LayerNorm,
        ape=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        use_checkpoint=False,
    ):
        super().__init__()

        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None,
        )

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [
                pretrain_img_size[0] // patch_size[0],
                pretrain_img_size[1] // patch_size[1],
            ]

            self.absolute_pos_embed = nn.Parameter(
                torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
            )
            trunc_normal_(self.absolute_pos_embed, std=0.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule: rates grow linearly from 0 to
        # drop_path_rate across all blocks of all stages
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            first_block = sum(depths[:i_layer])
            last_block = sum(depths[: i_layer + 1])
            layer = BasicLayer(
                dim=int(embed_dim * 2 ** i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[first_block:last_block],
                norm_layer=norm_layer,
                # every stage but the last halves the resolution
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                use_checkpoint=use_checkpoint,
            )
            self.layers.append(layer)

        # channel width doubles at every stage
        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f"norm{i_layer}"
            self.add_module(layer_name, layer)

        self._freeze_stages()

    def _freeze_stages(self):
        """Put the frozen parts in eval mode and stop their gradients."""
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.requires_grad = False

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            # frozen_stages == k freezes the first k - 1 entries of self.layers
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.

        With ``pretrained=None`` every ``nn.Linear`` gets truncated-normal
        weights (std 0.02) and zero bias, and every ``nn.LayerNorm`` gets unit
        weight and zero bias. Previously the initialiser was defined but never
        applied, so this method did nothing.

        Args:
            pretrained (str, optional): Path to pre-trained weights. This method
                does not load checkpoints; when a value is given the weights
                are left untouched and a warning is emitted.
                Defaults to None.
        """

        def _init_weights(m):
            if isinstance(m, nn.Linear):
                trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.bias, 0)
                nn.init.constant_(m.weight, 1.0)

        if pretrained is None:
            self.apply(_init_weights)
        else:
            import warnings

            warnings.warn(
                "SwinTransformer.init_weights does not load checkpoints; "
                "weights are left unchanged, load them separately."
            )

    def forward(self, x):
        """Forward function.

        Returns:
            dict mapping ``"res{i + 2}"`` to a (B, C_i, H_i, W_i) feature map for
            every stage index ``i`` in ``out_indices``.
        """
        x = self.patch_embed(x)

        Wh, Ww = x.size(2), x.size(3)
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(
                self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
            )
            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
        else:
            x = x.flatten(2).transpose(1, 2)
        x = self.pos_drop(x)

        outs = {}
        for i in range(self.num_layers):
            layer = self.layers[i]
            # x_out is the pre-downsample stage output; x feeds the next stage
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)

            if i in self.out_indices:
                norm_layer = getattr(self, f"norm{i}")
                x_out = norm_layer(x_out)

                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
                outs["res{}".format(i + 2)] = out

        return outs

    def train(self, mode=True):
        """Convert the model into training mode while keeping frozen layers frozen."""
        super(SwinTransformer, self).train(mode)
        self._freeze_stages()
@BACKBONE_REGISTRY.register()
class D2SwinTransformer(SwinTransformer, Backbone):
    """SwinTransformer configured from ``cfg.MODEL.SWIN`` and registered as a backbone."""

    def __init__(self, cfg, input_shape):
        """Build the backbone from config.

        Args:
            cfg: config node; every hyper-parameter is read from ``cfg.MODEL.SWIN``.
            input_shape: accepted but not used by this constructor.
        """
        swin_cfg = cfg.MODEL.SWIN

        super().__init__(
            pretrain_img_size=swin_cfg.PRETRAIN_IMG_SIZE,
            patch_size=swin_cfg.PATCH_SIZE,
            in_chans=3,
            embed_dim=swin_cfg.EMBED_DIM,
            depths=swin_cfg.DEPTHS,
            num_heads=swin_cfg.NUM_HEADS,
            window_size=swin_cfg.WINDOW_SIZE,
            mlp_ratio=swin_cfg.MLP_RATIO,
            qkv_bias=swin_cfg.QKV_BIAS,
            qk_scale=swin_cfg.QK_SCALE,
            drop_rate=swin_cfg.DROP_RATE,
            attn_drop_rate=swin_cfg.ATTN_DROP_RATE,
            drop_path_rate=swin_cfg.DROP_PATH_RATE,
            norm_layer=nn.LayerNorm,
            ape=swin_cfg.APE,
            patch_norm=swin_cfg.PATCH_NORM,
            use_checkpoint=swin_cfg.USE_CHECKPOINT,
        )

        self._out_features = cfg.MODEL.SWIN.OUT_FEATURES

        # Four stages named res2..res5: strides 4, 8, 16, 32 with the
        # matching per-stage channel widths.
        self._out_feature_strides = {f"res{stage + 2}": 4 * 2 ** stage for stage in range(4)}
        self._out_feature_channels = {
            f"res{stage + 2}": self.num_features[stage] for stage in range(4)
        }

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
        Returns:
            dict[str->Tensor]: names and the corresponding features, restricted
            to the names listed in ``self._out_features``
        """
        assert (
            x.dim() == 4
        ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        features = super().forward(x)
        return {name: feat for name, feat in features.items() if name in self._out_features}

    def output_shape(self):
        """Return a ShapeSpec (channels, stride) for every requested output feature."""
        specs = {}
        for name in self._out_features:
            specs[name] = ShapeSpec(
                channels=self._out_feature_channels[name],
                stride=self._out_feature_strides[name],
            )
        return specs

    @property
    def size_divisibility(self):
        return 32
================================================
FILE: mask2former/modeling/criterion.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
MaskFormer criterion.
"""
import logging
import torch
import torch.nn.functional as F
from torch import nn
from detectron2.utils.comm import get_world_size
from detectron2.projects.point_rend.point_features import (
get_uncertain_point_coords_with_randomness,
point_sample,
)
from ..utils.misc import is_dist_avail_and_initialized, nested_tensor_from_tensor_list
def unfold_wo_center(x, kernel_size, dilation):
    """Gather each pixel's neighbours in a dilated window, excluding the pixel itself.

    Args:
        x: 4-D tensor (N, C, H, W).
        kernel_size: odd side length of the square neighbourhood.
        dilation: spacing between sampled neighbours.

    Returns:
        Tensor of shape (N, C, kernel_size**2 - 1, H, W).
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    batch, channels, height, width = x.size(0), x.size(1), x.size(2), x.size(3)

    # SAME padding keeps the unfolded map at the input resolution
    same_pad = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    patches = F.unfold(x, kernel_size=kernel_size, padding=same_pad, dilation=dilation)
    patches = patches.reshape(batch, channels, -1, height, width)

    # the centre tap sits in the middle of the flattened window; drop it
    center = kernel_size ** 2 // 2
    return torch.cat((patches[:, :, :center], patches[:, :, center + 1:]), dim=2)
def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation):
    """Negative log-probability that a pixel and each of its neighbours agree.

    Args:
        mask_logits: 4-D tensor of mask logits (N, C, H, W); only channel 0
            is returned.
        pairwise_size: neighbourhood kernel size passed to ``unfold_wo_center``.
        pairwise_dilation: neighbourhood dilation passed to ``unfold_wo_center``.

    Returns:
        Tensor (N, pairwise_size**2 - 1, H, W) of ``-log P(same label)``.
    """
    assert mask_logits.dim() == 4

    log_fg = F.logsigmoid(mask_logits)
    log_bg = F.logsigmoid(-mask_logits)

    neighbour_log_fg = unfold_wo_center(
        log_fg, kernel_size=pairwise_size, dilation=pairwise_dilation
    )
    neighbour_log_bg = unfold_wo_center(
        log_bg, kernel_size=pairwise_size, dilation=pairwise_dilation
    )

    # P(same) = p_i * p_j + (1 - p_i) * (1 - p_j), evaluated in log space
    # to avoid numerical instability
    log_both_fg = log_fg[:, :, None] + neighbour_log_fg
    log_both_bg = log_bg[:, :, None] + neighbour_log_bg

    # log-sum-exp with the larger term factored out
    peak = torch.max(log_both_fg, log_both_bg)
    log_agree = torch.log(
        torch.exp(log_both_fg - peak) + torch.exp(log_both_bg - peak)
    ) + peak

    # loss = -log(prob)
    return -log_agree[:, 0]
def get_incoherent_mask(input_masks, sfact):
    """Flag pixels that change when the mask is downsampled and upsampled again.

    Args:
        input_masks: mask tensor whose last two dims are (h, w); cast to float.
        sfact: integer downsampling factor.

    Returns:
        Float tensor of 0/1, 1 where the round trip differs from the input
        by more than 0.01.
    """
    height, width = input_masks.shape[-2], input_masks.shape[-1]
    full_res = input_masks.float()

    coarse = F.interpolate(full_res, (height // sfact, width // sfact), mode='bilinear')
    restored = F.interpolate(coarse, (height, width), mode='bilinear')

    residual = (full_res - restored).abs()
    return (residual > 0.01).float()
def dice_coefficient(x, target):
    """Per-instance dice loss with squared terms in the denominator.

    Both inputs are flattened to (n_inst, -1), with n_inst taken from
    ``x.size(0)``.

    Returns:
        Tensor of shape (n_inst,) holding ``1 - 2 * <x, t> / (|x|^2 + |t|^2 + eps)``.
    """
    eps = 1e-5
    count = x.size(0)

    pred = x.reshape(count, -1)
    ref = target.reshape(count, -1)

    overlap = (pred * ref).sum(dim=1)
    energy = (pred ** 2.0).sum(dim=1) + (ref ** 2.0).sum(dim=1) + eps
    return 1. - (2 * overlap / energy)
def compute_project_term(mask_scores, gt_bitmasks):
    """Dice loss between the max-projections of predictions and targets.

    Both tensors are reduced with ``max`` over dim 3 and over dim 2 (keeping
    the reduced dim); the two dice losses are summed and averaged over
    instances.
    """
    per_axis = []
    for axis in (3, 2):
        projected_scores = mask_scores.max(dim=axis, keepdim=True)[0]
        projected_gt = gt_bitmasks.max(dim=axis, keepdim=True)[0]
        per_axis.append(dice_coefficient(projected_scores, projected_gt))

    return (per_axis[0] + per_axis[1]).mean()
def dice_loss(
        inputs: torch.Tensor,
        targets: torch.Tensor,
        num_masks: float,
    ):
    """
    DICE loss on sigmoid probabilities, summed over masks and divided by num_masks.
    Args:
        inputs: A float tensor of logits; everything after dim 0 is flattened.
                The predictions for each example.
        targets: A float tensor matching the flattened inputs. Stores the binary
                 classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
        num_masks: Normalisation constant for the summed loss.
    """
    probs = inputs.sigmoid().flatten(1)
    overlap = (probs * targets).sum(-1)
    mass = probs.sum(-1) + targets.sum(-1)
    # the +1 terms smooth the ratio
    per_mask = 1 - (2 * overlap + 1) / (mass + 1)
    return per_mask.sum() / num_masks


dice_loss_jit = torch.jit.script(
    dice_loss
)  # type: torch.jit.ScriptModule
def sigmoid_ce_loss(
        inputs: torch.Tensor,
        targets: torch.Tensor,
        num_masks: float,
    ):
    """
    Binary cross-entropy on logits, averaged over dim 1 and normalised by num_masks.
    Args:
        inputs: A float tensor of logits.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
        num_masks: Normalisation constant for the summed loss.
    Returns:
        Loss tensor
    """
    per_element = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    per_mask = per_element.mean(1)
    return per_mask.sum() / num_masks


sigmoid_ce_loss_jit = torch.jit.script(
    sigmoid_ce_loss
)  # type: torch.jit.ScriptModule
def calculate_uncertainty(logits):
    """
    Score uncertainty as the negated L1 distance between each logit and 0.0,
    so logits closest to zero score highest.
    Args:
        logits (Tensor): A tensor of shape (R, 1, ...); the second dimension
            must have size 1. The values are logits.
    Returns:
        scores (Tensor): A tensor of the same shape that contains uncertainty scores with
            the most uncertain locations having the highest uncertainty score.
    """
    assert logits.shape[1] == 1
    magnitudes = torch.abs(logits.clone())
    return -magnitudes
class SetCriterion(nn.Module):
    """Loss module: classification loss plus projection / pairwise mask losses.

    The process happens in two steps:
        1) the matcher assigns ground-truth targets to the outputs of the model
        2) each matched ground-truth / prediction pair is supervised
    """

    def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses,
                 num_points, oversample_ratio, importance_sample_ratio):
        """Create the criterion.
        Parameters:
            num_classes: number of object categories, omitting the special no-object category
            matcher: module able to compute a matching between targets and proposals
            weight_dict: dict containing as key the names of the losses and as values their relative weight.
            eos_coef: relative classification weight applied to the no-object category
            losses: list of all the losses to be applied. See get_loss for list of available losses.
            num_points: number of points sampled by the point-based ``loss_masks``.
            oversample_ratio: oversampling ratio forwarded to the point sampler.
            importance_sample_ratio: importance sampling ratio forwarded to the point sampler.
        """
        super().__init__()
        self.num_classes = num_classes
        self.matcher = matcher
        self.weight_dict = weight_dict
        self.eos_coef = eos_coef
        self.losses = losses

        # per-class CE weights; the last entry is the no-object class
        empty_weight = torch.ones(self.num_classes + 1)
        empty_weight[-1] = self.eos_coef
        self.register_buffer("empty_weight", empty_weight)

        # pointwise mask loss parameters
        self.num_points = num_points
        self.oversample_ratio = oversample_ratio
        self.importance_sample_ratio = importance_sample_ratio

        # 3x3 Laplacian kernel; kept as an attribute for compatibility, the
        # losses below do not read it.
        self.laplacian_kernel = torch.tensor(
            [-1, -1, -1, -1, 8, -1, -1, -1, -1], dtype=torch.float32
        ).reshape(1, 1, 3, 3).requires_grad_(False)

        # counter driving the linear warm-up of the pairwise loss
        self.register_buffer("_iter", torch.zeros([1]))
        self._warmup_iters = 1000  # 20000

    def loss_labels(self, outputs, targets, indices, num_masks):
        """Classification loss (NLL)
        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
        """
        assert "pred_logits" in outputs
        src_logits = outputs["pred_logits"].float()

        idx = self._get_src_permutation_idx(indices)
        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
        # every unmatched query defaults to the no-object class
        target_classes = torch.full(
            src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
        )
        target_classes[idx] = target_classes_o

        loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
        return {"loss_ce": loss_ce}

    def loss_masks_proj(self, outputs, targets, indices, num_masks, images_lab_sim):
        """Projection loss and pairwise-affinity loss on the matched masks.

        Returns a dict with "loss_mask" (projection dice term) and "loss_bound"
        (pairwise term, scaled by 0.25 and a linear warm-up factor).

        NOTE: ``self._iter`` advances on every call, and ``forward`` calls this
        once for the main output and once per auxiliary output, so the warm-up
        counter advances several times per ``forward``.
        """
        assert "pred_masks" in outputs

        self._iter += 1

        src_idx = self._get_src_permutation_idx(indices)
        tgt_idx = self._get_tgt_permutation_idx(indices)
        src_masks = outputs["pred_masks"]
        src_masks = src_masks[src_idx]
        masks = [t["masks"] for t in targets]
        # TODO use valid to mask invalid areas due to padding in loss
        target_masks, _ = nested_tensor_from_tensor_list(masks).decompose()
        target_masks = target_masks.to(src_masks)
        target_masks = target_masks[tgt_idx]

        # gather the similarity entry of the batch element each matched mask belongs to
        batch_ids = src_idx[0].tolist()
        if len(batch_ids) > 0:
            images_lab_sim = torch.cat([images_lab_sim[ind] for ind in batch_ids])

        # N x 1 x H x W
        src_masks = src_masks[:, None]
        target_masks = target_masks[:, None]

        # bring the targets to the prediction resolution
        target_masks = F.interpolate(
            target_masks, (src_masks.shape[-2], src_masks.shape[-1]), mode='bilinear'
        )

        if src_masks.shape[0] > 0:
            loss_prj_term = compute_project_term(src_masks.sigmoid(), target_masks)

            pairwise_losses = compute_pairwise_term(src_masks, 3, 2)

            # The unused boundary ("incoherent") mask that was computed here on
            # every call has been removed; it never entered the loss.
            # Pairs count only where similarity is >= 0.3, weighted by the target mask.
            weights = (images_lab_sim >= 0.3).float() * target_masks.float()
            loss_pairwise = (
                (pairwise_losses * weights).sum() / weights.sum().clamp(min=1.0)
            ) * 0.25

            warmup_factor = min(self._iter.item() / float(self._warmup_iters), 1.0)
            loss_pairwise = loss_pairwise * warmup_factor
        else:
            # no matches: zero losses that stay attached to the graph
            loss_prj_term = src_masks.sum() * 0.
            loss_pairwise = src_masks.sum() * 0.

        losses = {
            "loss_mask": loss_prj_term,
            "loss_bound": loss_pairwise,
        }

        del src_masks
        del target_masks
        return losses

    def loss_masks(self, outputs, targets, indices, num_masks):
        """Compute the losses related to the masks: the sigmoid CE loss and the dice loss.
        targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]

        This loss is not registered in ``get_loss``'s dispatch table.
        """
        assert "pred_masks" in outputs

        src_idx = self._get_src_permutation_idx(indices)
        tgt_idx = self._get_tgt_permutation_idx(indices)
        src_masks = outputs["pred_masks"]
        src_masks = src_masks[src_idx]
        masks = [t["masks"] for t in targets]
        # TODO use valid to mask invalid areas due to padding in loss
        target_masks, _ = nested_tensor_from_tensor_list(masks).decompose()
        target_masks = target_masks.to(src_masks)
        target_masks = target_masks[tgt_idx]

        # No need to upsample predictions as we are using normalized coordinates :)
        # N x 1 x H x W
        src_masks = src_masks[:, None]
        target_masks = target_masks[:, None]

        with torch.no_grad():
            # sample point_coords
            point_coords = get_uncertain_point_coords_with_randomness(
                src_masks,
                lambda logits: calculate_uncertainty(logits),
                self.num_points,
                self.oversample_ratio,
                self.importance_sample_ratio,
            )
            # get gt labels
            point_labels = point_sample(
                target_masks,
                point_coords,
                align_corners=False,
            ).squeeze(1)

        point_logits = point_sample(
            src_masks,
            point_coords,
            align_corners=False,
        ).squeeze(1)

        losses = {
            "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks),
            "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks),
        }

        del src_masks
        del target_masks
        return losses

    def _get_src_permutation_idx(self, indices):
        """Return (batch_idx, src_idx) index tensors for the matched predictions."""
        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_tgt_permutation_idx(self, indices):
        """Return (batch_idx, tgt_idx) index tensors for the matched targets."""
        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    def get_loss(self, loss, outputs, targets, indices, num_masks, images_lab_sim):
        """Dispatch to the loss named ``loss`` ('labels' or 'masks')."""
        loss_map = {
            'labels': self.loss_labels,
            'masks': self.loss_masks_proj,
        }
        assert loss in loss_map, f"do you really want to compute {loss} loss?"
        if loss == 'masks':
            # only the mask loss consumes the similarity maps
            return loss_map[loss](outputs, targets, indices, num_masks, images_lab_sim)
        return loss_map[loss](outputs, targets, indices, num_masks)

    def forward(self, outputs, targets, images_lab_sim):
        """This performs the loss computation.
        Parameters:
             outputs: dict of tensors, see the output specification of the model for the format
             targets: list of dicts, such that len(targets) == batch_size.
                      The expected keys in each dict depends on the losses applied, see each loss' doc
             images_lab_sim: per-batch-element similarity data, indexed by batch
                      index and forwarded to the 'masks' loss
        Returns:
             dict of named losses; losses on auxiliary output ``i`` get the suffix ``_{i}``.
        """
        outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}

        # Retrieve the matching between the outputs of the last layer and the targets
        indices = self.matcher(outputs_without_aux, targets)

        # Compute the average number of target masks across all nodes, for normalization purposes
        num_masks = sum(len(t["labels"]) for t in targets)
        num_masks = torch.as_tensor(
            [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device
        )
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_masks)
        num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_masks, images_lab_sim))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if "aux_outputs" in outputs:
            for i, aux_outputs in enumerate(outputs["aux_outputs"]):
                indices = self.matcher(aux_outputs, targets)
                for loss in self.losses:
                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks, images_lab_sim)
                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
                    losses.update(l_dict)

        return losses

    def __repr__(self):
        head = "Criterion " + self.__class__.__name__
        body = [
            "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)),
            "losses: {}".format(self.losses),
            "weight_dict: {}".format(self.weight_dict),
            "num_classes: {}".format(self.num_classes),
            "eos_coef: {}".format(self.eos_coef),
            "num_points: {}".format(self.num_points),
            "oversample_ratio: {}".format(self.oversample_ratio),
            "importance_sample_ratio: {}".format(self.importance_sample_ratio),
        ]
        _repr_indent = 4
        lines = [head] + [" " * _repr_indent + line for line in body]
        return "\n".join(lines)
================================================
FILE: mask2former/modeling/matcher.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
"""
Modules to compute the matching cost and solve the corresponding LSAP.
"""
import torch
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from torch import nn
from torch.cuda.amp import autocast
from detectron2.projects.point_rend.point_features import point_sample
from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, generalized_multi_box_iou
def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
    """
    Pairwise DICE loss between every prediction and every target.
    Args:
        inputs: A float tensor; everything after dim 0 is flattened.
                The predictions for each example. No sigmoid is applied here,
                the values are used as given.
        targets: A float tensor of shape (m, c) matching the flattened inputs.
                 Stores the binary classification label for each element
                (0 for the negative class and 1 for the positive class).
    Returns:
        Tensor of shape (n, m) with the loss of each (prediction, target) pair.
    """
    flat = inputs.flatten(1)
    overlap = torch.einsum("nc,mc->nm", flat, targets)
    mass = flat.sum(-1)[:, None] + targets.sum(-1)[None, :]
    # the +1 terms smooth the ratio
    return 1 - (2 * overlap + 1) / (mass + 1)


batch_dice_loss_jit = torch.jit.script(
    batch_dice_loss
)  # type: torch.jit.ScriptModule
def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
    """
    Pairwise binary cross-entropy between every prediction and every target.
    Args:
        inputs: A float tensor of shape (n, c).
                The predictions for each example, passed directly to
                ``F.binary_cross_entropy`` (i.e. treated as probabilities).
        targets: A float tensor of shape (m, c). Stores the binary
                 classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
    Returns:
        Loss tensor of shape (n, m), averaged over the c elements.
    """
    num_elements = inputs.shape[1]

    # cost of predicting each element if its label were 1, resp. 0
    cost_if_fg = F.binary_cross_entropy(
        inputs, torch.ones_like(inputs), reduction="none"
    )
    cost_if_bg = F.binary_cross_entropy(
        inputs, torch.zeros_like(inputs), reduction="none"
    )

    # pick the matching cost per element via the target labels
    fg_part = torch.einsum("nc,mc->nm", cost_if_fg, targets)
    bg_part = torch.einsum("nc,mc->nm", cost_if_bg, (1 - targets))
    total = fg_part + bg_part

    return total / num_elements


batch_sigmoid_ce_loss_jit = torch.jit.script(
    batch_sigmoid_ce_loss
)  # type: torch.jit.ScriptModule
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
    """
    Fill the tight bounding rectangle of every mask with ones.

    Despite the name, this does not return box coordinates: each mask's
    axis-aligned bounding rectangle (inclusive of the last occupied row and
    column) is set to 1 and the mask tensor itself is returned.

    Args:
        masks (Tensor[N, H, W]): masks to transform where N is the number of masks
            and (H, W) are the spatial dimensions. Modified in place.

    Returns:
        Tensor[N, H, W]: the same tensor object as ``masks``. Masks with no
        non-zero element, and empty inputs, are left unchanged.
    """
    if masks.numel() == 0:
        return masks

    for index, mask in enumerate(masks):
        ys, xs = torch.where(mask != 0)
        if ys.numel() == 0:
            continue

        top, bottom = int(ys.min()), int(ys.max())
        left, right = int(xs.min()), int(xs.max())
        # +1 makes the slice include the last occupied row/column; the former
        # `min:max` slice left them out of the filled rectangle.
        masks[index, top:bottom + 1, left:right + 1] = 1.0

    return masks
def masks_to_boxes_cc(masks: torch.Tensor) -> torch.Tensor:
    """
    Compute normalised corner boxes around the non-zero region of each mask.

    Each box is ``(x1, y1, x2, y2)`` built from the min/max non-zero pixel indices,
    with x divided by the mask width and y by the mask height. A mask that spans a
    single row or column therefore yields a zero-extent box, and a mask with no
    non-zero pixel keeps the all-zero box.

    Args:
        masks (Tensor[N, H, W]): masks to transform where N is the number of masks
            and (H, W) are the spatial dimensions.

    Returns:
        Tensor[N, 4]: float bounding boxes on the same device as ``masks``
        (shape ``[0, 4]`` when ``masks`` has no elements).
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device, dtype=torch.float)

    num_masks = masks.shape[0]
    height_f = float(masks.shape[1])
    width_f = float(masks.shape[2])

    boxes = torch.zeros((num_masks, 4), device=masks.device, dtype=torch.float)
    for i, mask in enumerate(masks):
        ys, xs = torch.where(mask != 0)
        if xs.numel() == 0 or ys.numel() == 0:
            continue
        boxes[i] = torch.stack(
            (
                xs.min() / width_f,
                ys.min() / height_f,
                xs.max() / width_f,
                ys.max() / height_f,
            )
        )

    return boxes
class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network

    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).

    The matching cost used here is built from the class probability and from boxes
    derived from the masks (box distance + generalized IoU). ``cost_mask``,
    ``cost_dice`` and ``num_points`` are stored but do not enter the cost.
    """

    def __init__(
        self,
        cost_class: float = 1,
        cost_mask: float = 1,
        cost_dice: float = 1,
        num_points: int = 0,
        cost_bbox: float = 5.0,
        cost_giou: float = 2.0,
    ):
        """Creates the matcher

        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_mask: Stored for reference only; not used by the box-based matching cost
            cost_dice: Stored for reference only; not used by the box-based matching cost
            num_points: Stored for reference only; not used by the box-based matching cost
            cost_bbox: This is the relative weight of the distance between mask-derived boxes
            cost_giou: This is the relative weight of the generalized IoU between mask-derived boxes
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_mask = cost_mask
        self.cost_dice = cost_dice
        self.cost_giou = cost_giou
        self.cost_bbox = cost_bbox

        assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0"

        self.num_points = num_points

    @torch.no_grad()
    def memory_efficient_forward(self, outputs, targets):
        """More memory-friendly matching (one batch element at a time)."""
        bs = outputs["pred_logits"].shape[0]

        indices = []

        # Iterate through batch size
        for b in range(bs):
            out_prob = outputs["pred_logits"][b].softmax(-1)  # [num_queries, num_classes]
            tgt_ids = targets[b]["labels"]

            # Compute the classification cost. Contrary to the loss, we don't use the NLL,
            # but approximate it in 1 - proba[target class].
            # The 1 is a constant that doesn't change the matching, it can be omitted.
            cost_class = -out_prob[:, tgt_ids]

            out_mask = outputs["pred_masks"][b]  # [num_queries, H_pred, W_pred]
            # Binarise the predictions before extracting their boxes.
            out_mask_box = masks_to_boxes_cc((out_mask.sigmoid() > 0.5).float())

            # gt masks are already padded when preparing target
            tgt_mask = targets[b]["masks"].to(out_mask)
            tgt_mask_box = masks_to_boxes_cc(tgt_mask)

            with autocast(enabled=False):
                cost_bbox = torch.cdist(out_mask_box, tgt_mask_box)
                cost_giou = -generalized_box_iou(out_mask_box, tgt_mask_box)

            # Diagnostics: surface NaN costs before they reach the assignment solver.
            if torch.isnan(cost_bbox).any():
                print('cost_bbox:', cost_bbox)

            if torch.isnan(cost_giou).any():
                print('cost_giou:', cost_giou)

            # Every term is already [num_queries, num_targets]; no reshape is needed.
            # (Reshaping with -1 fails on a sample that has zero targets.)
            C = (
                self.cost_bbox * cost_bbox
                + self.cost_class * cost_class
                + self.cost_giou * cost_giou
            )

            indices.append(linear_sum_assignment(C.cpu()))

        return [
            (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
            for i, j in indices
        ]

    @torch.no_grad()
    def forward(self, outputs, targets):
        """Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        return self.memory_efficient_forward(outputs, targets)

    def __repr__(self, _repr_indent=4):
        head = "Matcher " + self.__class__.__name__
        body = [
            "cost_class: {}".format(self.cost_class),
            "cost_mask: {}".format(self.cost_mask),
            "cost_dice: {}".format(self.cost_dice),
            "cost_bbox: {}".format(self.cost_bbox),
            "cost_giou: {}".format(self.cost_giou),
        ]
        lines = [head] + [" " * _repr_indent + line for line in body]
        return "\n".join(lines)
================================================
FILE: mask2former/modeling/meta_arch/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mask2former/modeling/meta_arch/__init__.py.new
================================================
================================================
FILE: mask2former/modeling/meta_arch/mask_former_head.py
================================================
import logging
from copy import deepcopy
from typing import Callable, Dict, List, Optional, Tuple, Union
import fvcore.nn.weight_init as weight_init
from torch import nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import Conv2d, ShapeSpec, get_norm
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder
from ..pixel_decoder.fpn import build_pixel_decoder
@SEM_SEG_HEADS_REGISTRY.register()
class MaskFormerHead(nn.Module):
    """Segmentation head that feeds pixel-decoder outputs into a transformer predictor."""

    _version = 2

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        """Checkpoint hook for weights saved before format version 2.

        The key-renaming rule is disabled in this head, so ``state_dict`` is left
        unchanged and the conversion warning is never triggered.
        """
        version = local_metadata.get("version", None)
        if version is not None and version >= 2:
            return

        log = logging.getLogger(__name__)
        # Stays True unless a key is actually renamed (do not warn if train from scratch).
        untouched = True
        for old_key in list(state_dict.keys()):
            new_key = old_key
            # Disabled renaming rule, kept for reference:
            #   if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
            #       newk = k.replace(prefix, prefix + "pixel_decoder.")
            if new_key == old_key:
                continue
            state_dict[new_key] = state_dict[old_key]
            del state_dict[old_key]
            untouched = False

        if not untouched:
            log.warning(
                f"Weight format of {self.__class__.__name__} have changed! "
                "Please upgrade your models. Applying automatic conversion now ..."
            )

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        num_classes: int,
        pixel_decoder: nn.Module,
        loss_weight: float = 1.0,
        ignore_value: int = -1,
        # extra parameters
        transformer_predictor: nn.Module,
        transformer_in_feature: str,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            num_classes: number of classes to predict
            pixel_decoder: the pixel decoder module
            loss_weight: loss weight
            ignore_value: category id to be ignored during training.
            transformer_predictor: the transformer decoder that makes prediction
            transformer_in_feature: input feature name to the transformer_predictor
        """
        super().__init__()
        # Feature names ordered from the smallest to the largest stride.
        by_stride = sorted(input_shape.items(), key=lambda item: item[1].stride)
        self.in_features = [name for name, _ in by_stride]

        self.ignore_value = ignore_value
        self.common_stride = 4
        self.loss_weight = loss_weight

        self.pixel_decoder = pixel_decoder
        self.predictor = transformer_predictor
        self.transformer_in_feature = transformer_in_feature

        self.num_classes = num_classes

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Translate ``cfg`` into the keyword arguments of ``__init__``."""
        in_feature = cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE

        # figure out in_channels to transformer predictor
        if in_feature in ("transformer_encoder", "multi_scale_pixel_decoder"):
            # "multi_scale_pixel_decoder" is the maskformer2 setting
            predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
        elif in_feature == "pixel_embedding":
            predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
        else:
            predictor_in_channels = input_shape[in_feature].channels

        selected_shapes = {
            k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
        }

        # The pixel decoder is built before the transformer decoder, as before.
        kwargs = {
            "input_shape": selected_shapes,
            "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
            "pixel_decoder": build_pixel_decoder(cfg, input_shape),
            "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
            "transformer_in_feature": in_feature,
        }
        kwargs["transformer_predictor"] = build_transformer_decoder(
            cfg,
            predictor_in_channels,
            mask_classification=True,
        )
        return kwargs

    def forward(self, features, mask=None):
        """Run the head; identical to calling ``layers``."""
        return self.layers(features, mask)

    def layers(self, features, mask=None):
        """Decode ``features`` and return the predictor output for the configured input."""
        (
            mask_features,
            transformer_encoder_features,
            multi_scale_features,
        ) = self.pixel_decoder.forward_features(features)

        source = self.transformer_in_feature
        if source == "multi_scale_pixel_decoder":
            return self.predictor(multi_scale_features, mask_features, mask)

        if source == "transformer_encoder":
            assert (
                transformer_encoder_features is not None
            ), "Please use the TransformerEncoderPixelDecoder."
            return self.predictor(transformer_encoder_features, mask_features, mask)

        if source == "pixel_embedding":
            return self.predictor(mask_features, mask_features, mask)

        # Any other value names one of the backbone feature maps directly.
        return self.predictor(features[source], mask_features, mask)
================================================
FILE: mask2former/modeling/meta_arch/per_pixel_baseline.py
================================================
import logging
from typing import Callable, Dict, List, Optional, Tuple, Union
import fvcore.nn.weight_init as weight_init
from torch import nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import Conv2d, ShapeSpec, get_norm
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
from ..transformer_decoder.maskformer_transformer_decoder import StandardTransformerDecoder
from ..pixel_decoder.fpn import build_pixel_decoder
@SEM_SEG_HEADS_REGISTRY.register()
class PerPixelBaselineHead(nn.Module):
    """Per-pixel classification head: pixel decoder followed by a 1x1 conv classifier."""

    _version = 2

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        """Rename keys of pre-version-2 checkpoints so they live under ``pixel_decoder.``.

        Keys containing "sem_seg_head" that do not start with ``prefix + "predictor"``
        get ``prefix`` replaced by ``prefix + "pixel_decoder."``; ``state_dict`` is
        edited in place and a warning is logged if anything was renamed.
        """
        version = local_metadata.get("version", None)
        if version is not None and version >= 2:
            return

        log = logging.getLogger(__name__)
        # Do not warn if train from scratch
        renamed_any = False
        for old_key in list(state_dict.keys()):
            needs_move = "sem_seg_head" in old_key and not old_key.startswith(
                prefix + "predictor"
            )
            if not needs_move:
                continue
            new_key = old_key.replace(prefix, prefix + "pixel_decoder.")
            if new_key == old_key:
                continue
            state_dict[new_key] = state_dict[old_key]
            del state_dict[old_key]
            renamed_any = True

        if renamed_any:
            log.warning(
                f"Weight format of {self.__class__.__name__} have changed! "
                "Please upgrade your models. Applying automatic conversion now ..."
            )

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        num_classes: int,
        pixel_decoder: nn.Module,
        loss_weight: float = 1.0,
        ignore_value: int = -1,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            num_classes: number of classes to predict
            pixel_decoder: the pixel decoder module
            loss_weight: loss weight
            ignore_value: category id to be ignored during training.
        """
        super().__init__()
        # Feature names ordered from the smallest to the largest stride.
        by_stride = sorted(input_shape.items(), key=lambda item: item[1].stride)
        self.in_features = [name for name, _ in by_stride]

        self.ignore_value = ignore_value
        self.common_stride = 4
        self.loss_weight = loss_weight

        self.pixel_decoder = pixel_decoder
        # 1x1 conv mapping mask features to per-class logits.
        self.predictor = Conv2d(
            self.pixel_decoder.mask_dim, num_classes, kernel_size=1, stride=1, padding=0
        )
        weight_init.c2_msra_fill(self.predictor)

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Translate ``cfg`` into the keyword arguments of ``__init__``."""
        head_cfg = cfg.MODEL.SEM_SEG_HEAD
        selected_shapes = {k: v for k, v in input_shape.items() if k in head_cfg.IN_FEATURES}
        return {
            "input_shape": selected_shapes,
            "ignore_value": head_cfg.IGNORE_VALUE,
            "num_classes": head_cfg.NUM_CLASSES,
            "pixel_decoder": build_pixel_decoder(cfg, input_shape),
            "loss_weight": head_cfg.LOSS_WEIGHT,
        }

    def forward(self, features, targets=None):
        """
        Returns:
            In training, returns (None, dict of losses)
            In inference, returns (CxHxW logits, {})
        """
        logits = self.layers(features)
        if self.training:
            return None, self.losses(logits, targets)

        # Inference: upsample the logits by the common stride.
        logits = F.interpolate(
            logits, scale_factor=self.common_stride, mode="bilinear", align_corners=False
        )
        return logits, {}

    def layers(self, features):
        """Return the classifier output computed on the pixel decoder's mask features."""
        mask_features, _, _ = self.pixel_decoder.forward_features(features)
        return self.predictor(mask_features)

    def losses(self, predictions, targets):
        """Cross-entropy between upsampled ``predictions`` and ``targets``, scaled by ``loss_weight``."""
        # Cast to float first: https://github.com/pytorch/pytorch/issues/48163
        upsampled = F.interpolate(
            predictions.float(),
            scale_factor=self.common_stride,
            mode="bilinear",
            align_corners=False,
        )
        ce = F.cross_entropy(
            upsampled, targets, reduction="mean", ignore_index=self.ignore_value
        )
        return {"loss_sem_seg": ce * self.loss_weight}
@SEM_SEG_HEADS_REGISTRY.register()
class PerPixelBaselinePlusHead(PerPixelBaselineHead):
    """Per-pixel baseline whose conv classifier is replaced by a transformer predictor."""

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        """Rename keys of pre-version-2 checkpoints so they live under ``pixel_decoder.``.

        Same rule as the parent class, but every candidate rename is also logged at
        debug level.
        """
        version = local_metadata.get("version", None)
        if version is not None and version >= 2:
            return

        log = logging.getLogger(__name__)
        # Do not warn if train from scratch
        renamed_any = False
        for old_key in list(state_dict.keys()):
            needs_move = "sem_seg_head" in old_key and not old_key.startswith(
                prefix + "predictor"
            )
            if not needs_move:
                continue
            new_key = old_key.replace(prefix, prefix + "pixel_decoder.")
            log.debug(f"{old_key} ==> {new_key}")
            if new_key == old_key:
                continue
            state_dict[new_key] = state_dict[old_key]
            del state_dict[old_key]
            renamed_any = True

        if renamed_any:
            log.warning(
                f"Weight format of {self.__class__.__name__} have changed! "
                "Please upgrade your models. Applying automatic conversion now ..."
            )

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        # extra parameters
        transformer_predictor: nn.Module,
        transformer_in_feature: str,
        deep_supervision: bool,
        # inherit parameters
        num_classes: int,
        pixel_decoder: nn.Module,
        loss_weight: float = 1.0,
        ignore_value: int = -1,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            transformer_predictor: the transformer decoder that makes prediction
            transformer_in_feature: input feature name to the transformer_predictor
            deep_supervision: whether or not to add supervision to the output of
                every transformer decoder layer
            num_classes: number of classes to predict
            pixel_decoder: the pixel decoder module
            loss_weight: loss weight
            ignore_value: category id to be ignored during training.
        """
        super().__init__(
            input_shape,
            num_classes=num_classes,
            pixel_decoder=pixel_decoder,
            loss_weight=loss_weight,
            ignore_value=ignore_value,
        )

        # Drop the conv classifier created by the parent before installing the transformer.
        del self.predictor

        self.predictor = transformer_predictor
        self.transformer_in_feature = transformer_in_feature
        self.deep_supervision = deep_supervision

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Extend the parent's kwargs with the transformer predictor settings."""
        ret = super().from_config(cfg, input_shape)
        in_feature = cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE
        ret["transformer_in_feature"] = in_feature

        if in_feature == "transformer_encoder":
            in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
        else:
            in_channels = input_shape[in_feature].channels

        ret["transformer_predictor"] = StandardTransformerDecoder(
            cfg, in_channels, mask_classification=False
        )
        ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
        return ret

    def forward(self, features, targets=None):
        """
        Returns:
            In training, returns (None, dict of losses)
            In inference, returns (CxHxW logits, {})
        """
        x, aux_outputs = self.layers(features)

        if not self.training:
            x = F.interpolate(
                x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
            )
            return x, {}

        losses = self.losses(x, targets)
        if self.deep_supervision:
            # One extra loss entry per auxiliary decoder output.
            for i, aux_output in enumerate(aux_outputs):
                aux_loss = self.losses(aux_output["pred_masks"], targets)
                losses["loss_sem_seg" + f"_{i}"] = aux_loss["loss_sem_seg"]
        return None, losses

    def layers(self, features):
        """Return ``(pred_masks, aux_outputs)``; ``aux_outputs`` is None without deep supervision."""
        mask_features, transformer_encoder_features, _ = self.pixel_decoder.forward_features(
            features
        )

        if self.transformer_in_feature == "transformer_encoder":
            assert (
                transformer_encoder_features is not None
            ), "Please use the TransformerEncoderPixelDecoder."
            predictor_input = transformer_encoder_features
        else:
            predictor_input = features[self.transformer_in_feature]
        predictions = self.predictor(predictor_input, mask_features)

        aux = predictions["aux_outputs"] if self.deep_supervision else None
        return predictions["pred_masks"], aux
================================================
FILE: mask2former/modeling/pixel_decoder/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mask2former/modeling/pixel_decoder/__init__.py.new
================================================
================================================
FILE: mask2former/modeling/pixel_decoder/fpn.py
================================================
import logging
import numpy as np
from typing import Callable, Dict, List, Optional, Tuple, Union
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
from torch.cuda.amp import autocast
from detectron2.config import configurable
from detectron2.layers import Conv2d, DeformConv, ShapeSpec, get_norm
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
from ..transformer_decoder.position_encoding import PositionEmbeddingSine
from ..transformer_decoder.transformer import TransformerEncoder, TransformerEncoderLayer, _get_clones, _get_activation_fn
def build_pixel_decoder(cfg, input_shape):
    """
    Build a pixel decoder from `cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME`.

    The named class is looked up in ``SEM_SEG_HEADS_REGISTRY`` and instantiated with
    ``(cfg, input_shape)``. A ``ValueError`` is raised when the resulting object has
    no callable ``forward_features`` attribute.
    """
    name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME
    decoder = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)

    if callable(getattr(decoder, "forward_features", None)):
        return decoder

    raise ValueError(
        "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. "
        f"Please implement forward_features for {name} to only return mask features."
    )
# This is a modified FPN decoder.
@SEM_SEG_HEADS_REGISTRY.register()
class BasePixelDecoder(nn.Module):
    """Top-down FPN-style decoder producing mask features and multi-scale features."""

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        conv_dim: int,
        mask_dim: int,
        norm: Optional[Union[str, Callable]] = None,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            conv_dim: number of output channels for the intermediate conv layers.
            mask_dim: number of output channels for the final conv layer.
            norm (str or callable): normalization for all conv layers
        """
        super().__init__()

        by_stride = sorted(input_shape.items(), key=lambda item: item[1].stride)
        self.in_features = [name for name, _ in by_stride]  # starting from "res2" to "res5"
        feature_channels = [spec.channels for _, spec in by_stride]

        use_bias = norm == ""
        # Settings shared by every 3x3 output conv.
        conv3x3_kwargs = dict(
            kernel_size=3, stride=1, padding=1, bias=use_bias, activation=F.relu
        )
        coarsest_level = len(self.in_features) - 1

        lateral_convs = []
        output_convs = []
        for level, in_channels in enumerate(feature_channels):
            layer_name = "layer_{}".format(level + 1)

            if level == coarsest_level:
                # Largest-stride level: no lateral branch, the conv reads the input directly.
                output_norm = get_norm(norm, conv_dim)
                output_conv = Conv2d(in_channels, conv_dim, norm=output_norm, **conv3x3_kwargs)
                weight_init.c2_xavier_fill(output_conv)
                self.add_module(layer_name, output_conv)

                lateral_convs.append(None)
                output_convs.append(output_conv)
                continue

            lateral_norm = get_norm(norm, conv_dim)
            output_norm = get_norm(norm, conv_dim)

            lateral_conv = Conv2d(
                in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm
            )
            output_conv = Conv2d(conv_dim, conv_dim, norm=output_norm, **conv3x3_kwargs)
            weight_init.c2_xavier_fill(lateral_conv)
            weight_init.c2_xavier_fill(output_conv)
            self.add_module("adapter_{}".format(level + 1), lateral_conv)
            self.add_module(layer_name, output_conv)

            lateral_convs.append(lateral_conv)
            output_convs.append(output_conv)

        # Place convs into top-down order (from low to high resolution)
        # to make the top-down computation in forward clearer.
        # These are plain lists; the modules themselves were registered via add_module.
        self.lateral_convs = lateral_convs[::-1]
        self.output_convs = output_convs[::-1]

        self.mask_dim = mask_dim
        self.mask_features = Conv2d(
            conv_dim,
            mask_dim,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        weight_init.c2_xavier_fill(self.mask_features)

        self.maskformer_num_feature_levels = 3  # always use 3 scales

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Translate ``cfg`` into the keyword arguments of ``__init__``."""
        head_cfg = cfg.MODEL.SEM_SEG_HEAD
        return {
            "input_shape": {
                k: v for k, v in input_shape.items() if k in head_cfg.IN_FEATURES
            },
            "conv_dim": head_cfg.CONVS_DIM,
            "mask_dim": head_cfg.MASK_DIM,
            "norm": head_cfg.NORM,
        }

    def forward_features(self, features):
        """Run the top-down pass.

        Returns:
            ``(mask_features, None, multi_scale_features)`` where the last item holds
            the outputs of the first ``maskformer_num_feature_levels`` levels visited.
        """
        multi_scale_features = []
        # Reverse feature maps into top-down order (from low to high resolution)
        levels = zip(self.in_features[::-1], self.lateral_convs, self.output_convs)
        for name, lateral_conv, output_conv in levels:
            x = features[name]
            if lateral_conv is None:
                y = output_conv(x)
            else:
                cur_fpn = lateral_conv(x)
                # Following FPN implementation, we use nearest upsampling here
                upsampled = F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
                y = output_conv(cur_fpn + upsampled)

            if len(multi_scale_features) < self.maskformer_num_feature_levels:
                multi_scale_features.append(y)

        return self.mask_features(y), None, multi_scale_features

    def forward(self, features, targets=None):
        """Log a warning, then delegate to ``forward_features``."""
        logging.getLogger(__name__).warning(
            "Calling forward() may cause unpredicted behavior of PixelDecoder module."
        )
        return self.forward_features(features)
class TransformerEncoderOnly(nn.Module):
    """Transformer encoder applied to a single NxCxHxW feature map."""

    def __init__(
        self,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        super().__init__()

        layer = TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
        )
        # A final LayerNorm is only added in the pre-norm configuration.
        final_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(layer, num_encoder_layers, final_norm)

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        """Xavier-initialise every parameter that has more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def forward(self, src, mask, pos_embed):
        """Encode ``src`` and return a tensor with the same NxCxHxW shape."""
        bs, c, h, w = src.shape

        # flatten NxCxHxW to HWxNxC
        tokens = src.flatten(2).permute(2, 0, 1)
        pos_tokens = pos_embed.flatten(2).permute(2, 0, 1)
        padding_mask = None if mask is None else mask.flatten(1)

        memory = self.encoder(tokens, src_key_padding_mask=padding_mask, pos=pos_tokens)
        return memory.permute(1, 2, 0).view(bs, c, h, w)
# This is a modified FPN decoder with extra Transformer encoder that processes the lowest-resolution feature map.
@SEM_SEG_HEADS_REGISTRY.register()
class TransformerEncoderPixelDecoder(BasePixelDecoder):
    """``BasePixelDecoder`` with a transformer encoder on the largest-stride feature map."""

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        transformer_dropout: float,
        transformer_nheads: int,
        transformer_dim_feedforward: int,
        transformer_enc_layers: int,
        transformer_pre_norm: bool,
        conv_dim: int,
        mask_dim: int,
        norm: Optional[Union[str, Callable]] = None,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            transformer_dropout: dropout probability in transformer
            transformer_nheads: number of heads in transformer
            transformer_dim_feedforward: dimension of feedforward network
            transformer_enc_layers: number of transformer encoder layers
            transformer_pre_norm: whether to use pre-layernorm or not
            conv_dim: number of output channels for the intermediate conv layers.
            mask_dim: number of output channels for the final conv layer.
            norm (str or callable): normalization for all conv layers
        """
        super().__init__(input_shape, conv_dim=conv_dim, mask_dim=mask_dim, norm=norm)

        by_stride = sorted(input_shape.items(), key=lambda item: item[1].stride)
        self.in_features = [name for name, _ in by_stride]  # starting from "res2" to "res5"
        feature_channels = [spec.channels for _, spec in by_stride]
        num_levels = len(self.in_features)

        # 1x1 projection of the largest-stride feature map to the transformer width.
        self.input_proj = Conv2d(feature_channels[num_levels - 1], conv_dim, kernel_size=1)
        weight_init.c2_xavier_fill(self.input_proj)

        self.transformer = TransformerEncoderOnly(
            d_model=conv_dim,
            dropout=transformer_dropout,
            nhead=transformer_nheads,
            dim_feedforward=transformer_dim_feedforward,
            num_encoder_layers=transformer_enc_layers,
            normalize_before=transformer_pre_norm,
        )
        self.pe_layer = PositionEmbeddingSine(conv_dim // 2, normalize=True)

        # update layer: the top output conv now reads conv_dim channels (the encoder
        # output) instead of the raw backbone channels.
        replacement_conv = Conv2d(
            conv_dim,
            conv_dim,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=(norm == ""),
            norm=get_norm(norm, conv_dim),
            activation=F.relu,
        )
        weight_init.c2_xavier_fill(replacement_conv)

        top_layer_name = "layer_{}".format(num_levels)
        delattr(self, top_layer_name)
        self.add_module(top_layer_name, replacement_conv)
        self.output_convs[0] = replacement_conv

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Extend the parent's kwargs with the transformer encoder settings."""
        ret = super().from_config(cfg, input_shape)
        mask_former_cfg = cfg.MODEL.MASK_FORMER
        ret["transformer_dropout"] = mask_former_cfg.DROPOUT
        ret["transformer_nheads"] = mask_former_cfg.NHEADS
        ret["transformer_dim_feedforward"] = mask_former_cfg.DIM_FEEDFORWARD
        # a separate config
        ret["transformer_enc_layers"] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS
        ret["transformer_pre_norm"] = mask_former_cfg.PRE_NORM
        return ret

    def forward_features(self, features):
        """Run the top-down pass with the transformer applied at the first level.

        Returns:
            ``(mask_features, transformer_encoder_features, multi_scale_features)``.
        """
        multi_scale_features = []
        # Reverse feature maps into top-down order (from low to high resolution)
        levels = zip(self.in_features[::-1], self.lateral_convs, self.output_convs)
        for name, lateral_conv, output_conv in levels:
            x = features[name]
            if lateral_conv is None:
                projected = self.input_proj(x)
                pos = self.pe_layer(x)
                encoded = self.transformer(projected, None, pos)
                y = output_conv(encoded)
                # save intermediate feature as input to Transformer decoder
                transformer_encoder_features = encoded
            else:
                cur_fpn = lateral_conv(x)
                # Following FPN implementation, we use nearest upsampling here
                upsampled = F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
                y = output_conv(cur_fpn + upsampled)

            if len(multi_scale_features) < self.maskformer_num_feature_levels:
                multi_scale_features.append(y)

        return self.mask_features(y), transformer_encoder_features, multi_scale_features

    def forward(self, features, targets=None):
        """Log a warning, then delegate to ``forward_features``."""
        logging.getLogger(__name__).warning(
            "Calling forward() may cause unpredicted behavior of PixelDecoder module."
        )
        return self.forward_features(features)
================================================
FILE: mask2former/modeling/pixel_decoder/msdeformattn.py
================================================
import logging
import numpy as np
from typing import Callable, Dict, List, Optional, Tuple, Union
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
from torch.cuda.amp import autocast
from detectron2.config import configurable
from detectron2.layers import Conv2d, ShapeSpec, get_norm
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
from ..transformer_decoder.position_encoding import PositionEmbeddingSine
from ..transformer_decoder.transformer import _get_clones, _get_activation_fn
from .ops.modules import MSDeformAttn
# MSDeformAttn Transformer encoder in deformable detr
class MSDeformAttnTransformerEncoderOnly(nn.Module):
    """Multi-scale deformable-attention encoder over a list of feature maps."""

    def __init__(self, d_model=256, nhead=8,
                 num_encoder_layers=6, dim_feedforward=1024, dropout=0.1,
                 activation="relu",
                 num_feature_levels=4, enc_n_points=4,
                 ):
        super().__init__()

        self.d_model = d_model
        self.nhead = nhead

        layer = MSDeformAttnTransformerEncoderLayer(
            d_model, dim_feedforward, dropout, activation,
            num_feature_levels, nhead, enc_n_points,
        )
        self.encoder = MSDeformAttnTransformerEncoder(layer, num_encoder_layers)

        # One learned embedding per feature level, added to the positional encoding.
        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise weights: Xavier for matrices, module-specific reset for MSDeformAttn."""
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)
        for module in self.modules():
            if isinstance(module, MSDeformAttn):
                module._reset_parameters()
        normal_(self.level_embed)

    def get_valid_ratio(self, mask):
        """Return, per sample, the (width, height) fraction of entries where ``mask`` is False.

        The counts are taken along the first column and the first row of ``mask``.
        """
        _, height, width = mask.shape
        valid_rows = torch.sum(~mask[:, :, 0], 1)
        valid_cols = torch.sum(~mask[:, 0, :], 1)
        ratio_h = valid_rows.float() / height
        ratio_w = valid_cols.float() / width
        return torch.stack([ratio_w, ratio_h], -1)

    def forward(self, srcs, pos_embeds):
        """Flatten all levels, run the encoder and return ``(memory, spatial_shapes, level_start_index)``."""
        # All-False masks: every position of every level is treated as valid.
        masks = [
            torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
            for x in srcs
        ]

        # prepare input for encoder
        flat_srcs = []
        flat_masks = []
        flat_pos = []
        shapes = []
        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
            _, _, h, w = src.shape
            shapes.append((h, w))

            flat_pos.append(
                pos_embed.flatten(2).transpose(1, 2) + self.level_embed[lvl].view(1, 1, -1)
            )
            flat_srcs.append(src.flatten(2).transpose(1, 2))
            flat_masks.append(mask.flatten(1))

        src_flatten = torch.cat(flat_srcs, 1)
        mask_flatten = torch.cat(flat_masks, 1)
        lvl_pos_embed_flatten = torch.cat(flat_pos, 1)

        spatial_shapes = torch.as_tensor(shapes, dtype=torch.long, device=src_flatten.device)
        # Offset of each level's first token inside the concatenated sequence.
        tokens_per_level = spatial_shapes.prod(1)
        level_start_index = torch.cat(
            (spatial_shapes.new_zeros((1, )), tokens_per_level.cumsum(0)[:-1])
        )
        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)

        # encoder
        memory = self.encoder(
            src_flatten,
            spatial_shapes,
            level_start_index,
            valid_ratios,
            lvl_pos_embed_flatten,
            mask_flatten,
        )

        return memory, spatial_shapes, level_start_index
class MSDeformAttnTransformerEncoderLayer(nn.Module):
    """One encoder layer: deformable self-attention followed by a feed-forward block."""

    def __init__(self,
                 d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4):
        super().__init__()

        # self attention
        self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # ffn
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, src):
        """Feed-forward block with residual connection, followed by LayerNorm."""
        hidden = self.activation(self.linear1(src))
        update = self.linear2(self.dropout2(hidden))
        return self.norm2(src + self.dropout3(update))

    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None):
        """Apply self-attention (queries carry ``pos``, values do not) and then the FFN."""
        # self attention
        query = self.with_pos_embed(src, pos)
        attended = self.self_attn(
            query, reference_points, src, spatial_shapes, level_start_index, padding_mask
        )
        src = self.norm1(src + self.dropout1(attended))

        # ffn
        return self.forward_ffn(src)
class MSDeformAttnTransformerEncoder(nn.Module):
    """Stack of cloned encoder layers sharing one set of reference points."""

    def __init__(self, encoder_layer, num_layers):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        """Build reference points at the cell centres of every level.

        For each ``(H_, W_)`` in ``spatial_shapes`` the centres ``0.5 ... size - 0.5``
        are divided by ``valid_ratio * size``; the concatenated ``(x, y)`` points are
        then multiplied by ``valid_ratios`` for every level.
        """
        per_level = []
        for lvl, (H_, W_) in enumerate(spatial_shapes):
            row_centres = torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device)
            col_centres = torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)
            grid_y, grid_x = torch.meshgrid(row_centres, col_centres)

            norm_y = grid_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
            norm_x = grid_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
            per_level.append(torch.stack((norm_x, norm_y), -1))

        points = torch.cat(per_level, 1)
        # Add a level axis and rescale by each level's valid ratio.
        return points[:, :, None] * valid_ratios[:, None]

    def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):
        """Pass ``src`` through every layer in order and return the final output."""
        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)

        output = src
        for layer in self.layers:
            output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask)
        return output
@SEM_SEG_HEADS_REGISTRY.register()
class MSDeformAttnPixelDecoder(nn.Module):
    """Pixel decoder built on a multi-scale deformable-attention encoder.

    Selected backbone features are projected to ``conv_dim`` channels and
    passed through ``MSDeformAttnTransformerEncoderOnly``. Extra FPN-style
    levels are then added on top of the encoder output for the
    highest-resolution features, and a 1x1 convolution produces the mask
    features from the last (finest) level.
    """
    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        transformer_dropout: float,
        transformer_nheads: int,
        transformer_dim_feedforward: int,
        transformer_enc_layers: int,
        conv_dim: int,
        mask_dim: int,
        norm: Optional[Union[str, Callable]] = None,
        # deformable transformer encoder args
        transformer_in_features: List[str],
        common_stride: int,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            transformer_dropout: dropout probability in transformer
            transformer_nheads: number of heads in transformer
            transformer_dim_feedforward: dimension of feedforward network
            transformer_enc_layers: number of transformer encoder layers
            conv_dim: number of output channels for the intermediate conv layers.
            mask_dim: number of output channels for the final conv layer.
            norm (str or callable): normalization for all conv layers
            transformer_in_features: names of the entries of ``input_shape``
                that are fed to the deformable transformer encoder
            common_stride: target stride; together with the smallest
                transformer input stride it determines how many extra FPN
                levels are built
        """
        super().__init__()
        # Pick the transformer inputs before `input_shape` is rebound to a sorted list below.
        transformer_input_shape = {
            k: v for k, v in input_shape.items() if k in transformer_in_features
        }
        # this is the input shape of pixel decoder (sorted by increasing stride)
        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
        self.in_features = [k for k, v in input_shape]  # starting from "res2" to "res5"
        self.feature_strides = [v.stride for k, v in input_shape]
        self.feature_channels = [v.channels for k, v in input_shape]
        # this is the input shape of transformer encoder (could use fewer features than the pixel decoder)
        transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: x[1].stride)
        self.transformer_in_features = [k for k, v in transformer_input_shape]  # starting from "res2" to "res5"
        transformer_in_channels = [v.channels for k, v in transformer_input_shape]
        self.transformer_feature_strides = [v.stride for k, v in transformer_input_shape]  # to decide extra FPN layers
        self.transformer_num_feature_levels = len(self.transformer_in_features)
        # One 1x1 conv + GroupNorm projection per transformer input level.
        if self.transformer_num_feature_levels > 1:
            input_proj_list = []
            # from low resolution to high resolution (res5 -> res2)
            for in_channels in transformer_in_channels[::-1]:
                input_proj_list.append(nn.Sequential(
                    nn.Conv2d(in_channels, conv_dim, kernel_size=1),
                    nn.GroupNorm(32, conv_dim),
                ))
            self.input_proj = nn.ModuleList(input_proj_list)
        else:
            self.input_proj = nn.ModuleList([
                nn.Sequential(
                    nn.Conv2d(transformer_in_channels[-1], conv_dim, kernel_size=1),
                    nn.GroupNorm(32, conv_dim),
                )])
        # Initialise only the conv (index 0) of each projection; GroupNorm keeps its defaults.
        for proj in self.input_proj:
            nn.init.xavier_uniform_(proj[0].weight, gain=1)
            nn.init.constant_(proj[0].bias, 0)
        self.transformer = MSDeformAttnTransformerEncoderOnly(
            d_model=conv_dim,
            dropout=transformer_dropout,
            nhead=transformer_nheads,
            dim_feedforward=transformer_dim_feedforward,
            num_encoder_layers=transformer_enc_layers,
            num_feature_levels=self.transformer_num_feature_levels,
        )
        # Positional embedding is built with half of conv_dim as its size argument.
        N_steps = conv_dim // 2
        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
        self.mask_dim = mask_dim
        # use 1x1 conv instead
        self.mask_features = Conv2d(
            conv_dim,
            mask_dim,
            kernel_size=1,
            stride=1,
            padding=0,
        )
        weight_init.c2_xavier_fill(self.mask_features)
        self.maskformer_num_feature_levels = 3  # always use 3 scales
        self.common_stride = common_stride
        # extra fpn levels: one per factor of two between the finest
        # transformer stride and the common stride
        stride = min(self.transformer_feature_strides)
        self.num_fpn_levels = int(np.log2(stride) - np.log2(self.common_stride))
        lateral_convs = []
        output_convs = []
        # Convs carry a bias only when no normalization is configured.
        use_bias = norm == ""
        for idx, in_channels in enumerate(self.feature_channels[:self.num_fpn_levels]):
            lateral_norm = get_norm(norm, conv_dim)
            output_norm = get_norm(norm, conv_dim)
            lateral_conv = Conv2d(
                in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm
            )
            output_conv = Conv2d(
                conv_dim,
                conv_dim,
                kernel_size=3,
                stride=1,
                padding=1,
                bias=use_bias,
                norm=output_norm,
                activation=F.relu,
            )
            weight_init.c2_xavier_fill(lateral_conv)
            weight_init.c2_xavier_fill(output_conv)
            # Register under explicit names; the plain lists below do not register modules.
            self.add_module("adapter_{}".format(idx + 1), lateral_conv)
            self.add_module("layer_{}".format(idx + 1), output_conv)
            lateral_convs.append(lateral_conv)
            output_convs.append(output_conv)
        # Place convs into top-down order (from low to high resolution)
        # to make the top-down computation in forward clearer.
        self.lateral_convs = lateral_convs[::-1]
        self.output_convs = output_convs[::-1]
    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Translate ``cfg`` into the keyword arguments of ``__init__``.

        Only entries of ``input_shape`` listed in
        ``cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES`` are kept. The feed-forward
        width is fixed to 1024 instead of being read from the config.
        """
        ret = {}
        ret["input_shape"] = {
            k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
        }
        ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
        ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM
        ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
        ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
        # cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD is intentionally not used here.
        ret["transformer_dim_feedforward"] = 1024  # use 1024 for deformable transformer encoder
        ret[
            "transformer_enc_layers"
        ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS  # a separate config
        ret["transformer_in_features"] = cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES
        ret["common_stride"] = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE
        return ret
    @autocast(enabled=False)
    def forward_features(self, features):
        """Run the encoder and the extra FPN levels on ``features``.

        Args:
            features: mapping indexed by feature name; entries named in
                ``self.transformer_in_features`` and the first
                ``self.num_fpn_levels`` names of ``self.in_features`` are read.

        Returns:
            A 3-tuple: the mask features computed from the last entry of
            ``out``, the first entry of ``out``, and a list holding at most
            ``self.maskformer_num_feature_levels`` leading entries of ``out``.
        """
        srcs = []
        pos = []
        # Reverse feature maps into top-down order (from low to high resolution)
        for idx, f in enumerate(self.transformer_in_features[::-1]):
            x = features[f].float()  # deformable detr does not support half precision
            srcs.append(self.input_proj[idx](x))
            pos.append(self.pe_layer(x))
        y, spatial_shapes, level_start_index = self.transformer(srcs, pos)
        bs = y.shape[0]
        # Token count of each level: difference of consecutive start indices,
        # with the last level running to the end of the sequence.
        split_size_or_sections = [None] * self.transformer_num_feature_levels
        for i in range(self.transformer_num_feature_levels):
            if i < self.transformer_num_feature_levels - 1:
                split_size_or_sections[i] = level_start_index[i + 1] - level_start_index[i]
            else:
                split_size_or_sections[i] = y.shape[1] - level_start_index[i]
        y = torch.split(y, split_size_or_sections, dim=1)
        out = []
        multi_scale_features = []
        num_cur_levels = 0
        # Fold each level's token sequence back into a 2D feature map.
        for i, z in enumerate(y):
            out.append(z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], spatial_shapes[i][1]))
        # append `out` with extra FPN levels
        # Reverse feature maps into top-down order (from low to high resolution)
        for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]):
            x = features[f].float()
            lateral_conv = self.lateral_convs[idx]
            output_conv = self.output_convs[idx]
            cur_fpn = lateral_conv(x)
            # Resize the previous output to the lateral feature's spatial size
            # with bilinear interpolation before adding it.
            y = cur_fpn + F.interpolate(out[-1], size=cur_fpn.shape[-2:], mode="bilinear", align_corners=False)
            y = output_conv(y)
            out.append(y)
        # Keep only the first `maskformer_num_feature_levels` maps as multi-scale features.
        for o in out:
            if num_cur_levels < self.maskformer_num_feature_levels:
                multi_scale_features.append(o)
                num_cur_levels += 1
        return self.mask_features(out[-1]), out[0], multi_scale_features
================================================
FILE: mask2former/modeling/pixel_decoder/ops/functions/__init__.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from .ms_deform_attn_func import MSDeformAttnFunction
================================================
FILE: mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
try:
    import MultiScaleDeformableAttention as MSDA
except ModuleNotFoundError as e:
    # The compiled op is required by this module. Re-raise with build
    # instructions and keep the original import failure as the explicit cause
    # so the traceback still shows which module could not be found.
    info_string = (
        "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
        "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
        "\t`sh make.sh`\n"
    )
    raise ModuleNotFoundError(info_string) from e
class MSDeformAttnFunction(Function):
    """Autograd wrapper around the compiled ``MSDA`` forward/backward ops."""

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        """Call ``MSDA.ms_deform_attn_forward`` and stash the inputs for backward."""
        ctx.im2col_step = im2col_step
        op_inputs = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        output = MSDA.ms_deform_attn_forward(*op_inputs, ctx.im2col_step)
        ctx.save_for_backward(*op_inputs)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Return gradients for value, sampling locations and attention weights.

        The two shape/index tensors and ``im2col_step`` receive ``None``.
        """
        saved_inputs = ctx.saved_tensors
        grad_value, grad_sampling_loc, grad_attn_weight = MSDA.ms_deform_attn_backward(
            *saved_inputs, grad_output, ctx.im2col_step
        )
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    """Pure-PyTorch multi-scale deformable attention (debug / test reference).

    The compiled CUDA op should be used instead for real workloads.

    Args:
        value: tensor unpacked as ``(N, S, M, D)``.
        value_spatial_shapes: iterable of ``(H, W)`` pairs, one per level;
            the ``H * W`` products are used to split dim 1 of ``value``.
        sampling_locations: tensor unpacked as ``(N, Lq, M, L, P, 2)``; mapped
            from ``[0, 1]`` to ``grid_sample``'s ``[-1, 1]`` via ``2 * x - 1``.
        attention_weights: tensor reshaped from ``(N, Lq, M, L, P)``.

    Returns:
        Contiguous tensor of shape ``(N, Lq, M * D)``.
    """
    batch, _, heads, head_dim = value.shape
    _, num_queries, heads, num_levels, num_points, _ = sampling_locations.shape

    level_values = value.split([h * w for h, w in value_spatial_shapes], dim=1)
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # (N, H*W, M, D) -> (N, H*W, M*D) -> (N, M*D, H*W) -> (N*M, D, H, W)
        feature_map = level_values[level].flatten(2).transpose(1, 2).reshape(batch * heads, head_dim, h, w)
        # (N, Lq, M, P, 2) -> (N, M, Lq, P, 2) -> (N*M, Lq, P, 2)
        level_grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # -> (N*M, D, Lq, P)
        sampled_per_level.append(
            F.grid_sample(feature_map, level_grid,
                          mode='bilinear', padding_mode='zeros', align_corners=False)
        )

    # (N, Lq, M, L, P) -> (N, M, Lq, L, P) -> (N*M, 1, Lq, L*P)
    weights = attention_weights.transpose(1, 2).reshape(batch * heads, 1, num_queries, num_levels * num_points)
    # Stack levels next to the point axis and merge them: (N*M, D, Lq, L*P)
    stacked = torch.stack(sampled_per_level, dim=-2).flatten(-2)
    output = (stacked * weights).sum(-1).view(batch, heads * head_dim, num_queries)
    return output.transpose(1, 2).contiguous()
================================================
FILE: mask2former/modeling/pixel_decoder/ops/make.sh
================================================
#!/usr/bin/env bash
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
# Build the extension defined by setup.py and install it into the active
# Python environment. Must be run from the directory containing setup.py.
python setup.py build install
================================================
FILE: mask2former/modeling/pixel_decoder/ops/modules/__init__.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from .ms_deform_attn import MSDeformAttn
================================================
FILE: mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import warnings
import math
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_
from ..functions import MSDeformAttnFunction
from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch
def _is_power_of_2(n):
    """Return True when ``n`` is a positive power of two.

    Raises:
        ValueError: if ``n`` is not an ``int`` or is negative.
    """
    if not isinstance(n, int) or n < 0:
        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
    if n == 0:
        return False
    # A power of two has a single set bit, so clearing the lowest set bit gives 0.
    return (n & (n - 1)) == 0
class MSDeformAttn(nn.Module):
    """Multi-scale deformable attention layer.

    Each query predicts, per head and per feature level, ``n_points`` sampling
    offsets and softmax-normalised attention weights. The projected values are
    sampled at those locations and combined, then passed through an output
    projection.
    """

    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
        """
        Multi-Scale Deformable Attention Module
        :param d_model      hidden dimension
        :param n_levels     number of feature levels
        :param n_heads      number of attention heads
        :param n_points     number of sampling points per attention head per feature level
        :raises ValueError  if d_model is not divisible by n_heads
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
        _d_per_head = d_model // n_heads
        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_head):
            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
                          "which is more efficient in our CUDA implementation.")

        # Batch chunk size handed to the compiled op.
        self.im2col_step = 128

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the projections.

        Sampling-offset weights are zeroed and the bias is set so that each
        head starts with a distinct direction (angles evenly spaced on the
        circle, scaled so the largest component is 1), with the i-th point
        placed ``i + 1`` steps along that direction. Attention-weight
        parameters are zeroed; value/output projections use Xavier-uniform
        weights with zero bias.
        """
        constant_(self.sampling_offsets.weight.data, 0.)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.)
        constant_(self.attention_weights.bias.data, 0.)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)

    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
        r"""
        :param query                       (N, Length_{query}, C)
        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements

        :return output                     (N, Length_{query}, C)
        :raises ValueError                 if the last dim of reference_points is neither 2 nor 4
        """
        N, Len_q, _ = query.shape
        N, Len_in, _ = input_flatten.shape
        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in

        value = self.value_proj(input_flatten)
        if input_padding_mask is not None:
            # Padded positions must not contribute to the sampled values.
            value = value.masked_fill(input_padding_mask[..., None], float(0))
        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
        # Softmax is taken jointly over all levels and points of a head.
        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
        # N, Len_q, n_heads, n_levels, n_points, 2
        if reference_points.shape[-1] == 2:
            # Offsets are divided by (W, H) of each level before being added.
            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            # Offsets are scaled by half of the reference box size (w, h).
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                                 + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
        else:
            raise ValueError(
                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
        try:
            output = MSDeformAttnFunction.apply(
                value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
        except Exception:
            # CPU (or any failure of the compiled op): fall back to the pure
            # PyTorch reference. `Exception` rather than a bare `except` so
            # KeyboardInterrupt / SystemExit are not swallowed.
            output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
        # # For FLOPs calculation only
        # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
        output = self.output_proj(output)
        return output
================================================
FILE: mask2former/modeling/pixel_decoder/ops/setup.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
import os
import glob
import torch
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
# Packages this extension depends on.
# NOTE(review): this list is not passed to the setup() call in this file.
requirements = ["torch", "torchvision"]
def get_extensions():
    """Describe the ``MultiScaleDeformableAttention`` extension module.

    Sources are collected from ``src/*.cpp``, ``src/cpu/*.cpp`` and
    ``src/cuda/*.cu`` next to this file.

    Returns:
        A one-element list with the ``CUDAExtension`` to build.

    Raises:
        NotImplementedError: if ``CUDA_HOME`` is unset, or if CUDA is neither
            available nor forced through the ``FORCE_CUDA`` environment
            variable.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(here, "src")

    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))

    # Force cuda since torch ask for a device, not if cuda is in fact available.
    cuda_wanted = os.environ.get('FORCE_CUDA') or torch.cuda.is_available()
    if not (cuda_wanted and CUDA_HOME is not None):
        if CUDA_HOME is None:
            raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
        raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')

    sources = [
        os.path.join(extensions_dir, path)
        for path in main_file + source_cpu + source_cuda
    ]
    nvcc_flags = [
        "-DCUDA_HAS_FP16=1",
        "-D__CUDA_NO_HALF_OPERATORS__",
        "-D__CUDA_NO_HALF_CONVERSIONS__",
        "-D__CUDA_NO_HALF2_OPERATORS__",
    ]
    cuda_extension = CUDAExtension(
        "MultiScaleDeformableAttention",
        sources,
        include_dirs=[extensions_dir],
        define_macros=[("WITH_CUDA", None)],
        extra_compile_args={"cxx": [], "nvcc": nvcc_flags},
    )
    return [cuda_extension]
# Package definition: builds the single extension returned by get_extensions()
# using torch's BuildExtension as the build_ext command.
setup(
    name="MultiScaleDeformableAttention",
    version="1.0",
    author="Weijie Su",
    url="https://github.com/fundamentalvision/Deformable-DETR",
    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
    # Skip the "configs" and "tests" packages when collecting packages.
    packages=find_packages(exclude=("configs", "tests",)),
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
================================================
FILE: mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include
#include
#include
// CPU forward entry point for multi-scale deformable attention.
// No CPU kernel is provided: the body unconditionally raises through
// AT_ERROR, so every parameter is unused.
at::Tensor
ms_deform_attn_cpu_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    AT_ERROR("Not implement on cpu");
}
// CPU backward entry point for multi-scale deformable attention.
// No CPU kernel is provided: the body unconditionally raises through
// AT_ERROR, so every parameter is unused.
std::vector
ms_deform_attn_cpu_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{
    AT_ERROR("Not implement on cpu");
}
================================================
FILE: mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include
// Declaration of the CPU forward entry point (same parameter list as the
// CUDA forward function).
at::Tensor
ms_deform_attn_cpu_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step);
// Declaration of the CPU backward entry point; takes the forward inputs plus
// the gradient of the forward output.
std::vector
ms_deform_attn_cpu_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step);
================================================
FILE: mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include
#include "cuda/ms_deform_im2col_cuda.cuh"
#include
#include
#include
#include
// Forward pass of multi-scale deformable attention on CUDA.
//
// All five tensor arguments must be contiguous CUDA tensors. The batch is
// processed in chunks of `min(batch, im2col_step)` samples; each chunk is
// handed to ms_deformable_im2col_cuda on the current CUDA stream.
//
// Dimensions are read as: value = (batch, spatial_size, num_heads, channels),
// spatial_shapes.size(0) = num_levels, sampling_loc.size(1) = num_query,
// sampling_loc.size(4) = num_point.
//
// Returns a tensor viewed as (batch, num_query, num_heads * channels).
at::Tensor ms_deform_attn_cuda_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    // The kernel indexes raw pointers, so inputs must be contiguous.
    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");

    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");

    const int batch = value.size(0);
    const int spatial_size = value.size(1);
    const int num_heads = value.size(2);
    const int channels = value.size(3);

    const int num_levels = spatial_shapes.size(0);

    const int num_query = sampling_loc.size(1);
    const int num_point = sampling_loc.size(4);

    // Chunk size: never larger than the batch itself.
    const int im2col_step_ = std::min(batch, im2col_step);

    // NOTE(review): the condition requires batch to be a multiple of
    // im2col_step_, while the message text states it the other way round.
    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);

    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());

    const int batch_n = im2col_step_;
    // View of `output` with the batch split into (num_chunks, chunk_size).
    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
    // Element counts of one sample, used as pointer strides between chunks.
    auto per_value_size = spatial_size * num_heads * channels;
    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
    for (int n = 0; n < batch/im2col_step_; ++n)
    {
        // Destination slice for chunk n; shares storage with `output`.
        auto columns = output_n.select(0, n);
        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
                value.data() + n * im2col_step_ * per_value_size,
                spatial_shapes.data(),
                level_start_index.data(),
                sampling_loc.data() + n * im2col_step_ * per_sample_loc_size,
                attn_weight.data() + n * im2col_step_ * per_attn_weight_size,
                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
                columns.data());

        }));
    }

    // Merge the head and channel dimensions for the caller.
    output = output.view({batch, num_query, num_heads*channels});

    return output;
}
// Backward pass of multi-scale deformable attention on CUDA.
//
// All six tensor arguments must be contiguous CUDA tensors. The batch is
// processed in chunks of `min(batch, im2col_step)` samples; each chunk is
// handed to ms_deformable_col2im_cuda on the current CUDA stream, which
// writes into the zero-initialised gradient buffers.
//
// Returns {grad_value, grad_sampling_loc, grad_attn_weight}, each with the
// same shape as the corresponding input.
std::vector ms_deform_attn_cuda_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{

    // The kernel indexes raw pointers, so inputs must be contiguous.
    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
    AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");

    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
    AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");

    const int batch = value.size(0);
    const int spatial_size = value.size(1);
    const int num_heads = value.size(2);
    const int channels = value.size(3);

    const int num_levels = spatial_shapes.size(0);

    const int num_query = sampling_loc.size(1);
    const int num_point = sampling_loc.size(4);

    // Chunk size: never larger than the batch itself.
    const int im2col_step_ = std::min(batch, im2col_step);

    // NOTE(review): the condition requires batch to be a multiple of
    // im2col_step_, while the message text states it the other way round.
    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);

    // Gradient buffers start at zero and are filled chunk by chunk below.
    auto grad_value = at::zeros_like(value);
    auto grad_sampling_loc = at::zeros_like(sampling_loc);
    auto grad_attn_weight = at::zeros_like(attn_weight);

    const int batch_n = im2col_step_;
    // Element counts of one sample, used as pointer strides between chunks.
    auto per_value_size = spatial_size * num_heads * channels;
    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
    // View of `grad_output` with the batch split into (num_chunks, chunk_size).
    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});

    for (int n = 0; n < batch/im2col_step_; ++n)
    {
        // Incoming gradient slice for chunk n.
        auto grad_output_g = grad_output_n.select(0, n);
        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
                                    grad_output_g.data(),
                                    value.data() + n * im2col_step_ * per_value_size,
                                    spatial_shapes.data(),
                                    level_start_index.data(),
                                    sampling_loc.data() + n * im2col_step_ * per_sample_loc_size,
                                    attn_weight.data() + n * im2col_step_ * per_attn_weight_size,
                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
                                    grad_value.data() +  n * im2col_step_ * per_value_size,
                                    grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size,
                                    grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size);

        }));
    }

    return {
        grad_value, grad_sampling_loc, grad_attn_weight
    };
}
================================================
FILE: mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include
// Declaration of the CUDA forward entry point.
// Inputs, all passed by const reference: the value tensor, the per-level
// spatial shapes, the per-level start indices, the sampling locations and the
// attention weights, plus an integer im2col_step. Returns a single at::Tensor.
// NOTE(review): the definition is not part of this header; im2col_step is
// presumably the number of batch elements processed per kernel launch, as in
// the backward pass -- confirm against the .cu implementation.
at::Tensor ms_deform_attn_cuda_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step);
// Declaration of the CUDA backward entry point.
// Takes the same five tensors and im2col_step as the forward declaration plus
// grad_output (the gradient flowing in from the forward result) and returns a
// std::vector of results.
// NOTE(review): the element type of std::vector is missing in this copy
// (presumably at::Tensor), and the returned entries are presumably the
// gradients w.r.t. value, sampling_loc and attn_weight in that order --
// confirm against the definition.
std::vector ms_deform_attn_cuda_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step);
================================================
FILE: mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh
================================================
/*!
**************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************
* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
* Copyright (c) 2018 Microsoft
**************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include
#include
#include
#include
#include
#include
// Grid-stride loop header: declares `int i`, starting at this thread's global
// index (blockIdx.x * blockDim.x + threadIdx.x) and advancing by the total
// number of launched threads (blockDim.x * gridDim.x) while i < n.
// Use as `CUDA_KERNEL_LOOP(index, n) { ... }` inside a kernel.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// Threads per block used by the forward launcher, and the upper bound on the
// per-block thread count chosen by the backward launcher.
const int CUDA_NUM_THREADS = 1024;
// Number of blocks of `num_threads` threads needed to cover N work items,
// i.e. N / num_threads rounded up.
inline int GET_BLOCKS(const int N, const int num_threads)
{
  // Push N up to the next multiple boundary, then count whole groups.
  const int padded_count = N + num_threads - 1;
  const int block_count = padded_count / num_threads;
  return block_count;
}
// Bilinearly samples one channel of one head from a feature map.
//
// bottom_data is indexed as ((y * width + x) * nheads + m) * channels + c, as
// the stride computations below show. (h, w) is the fractional sampling
// position in pixel coordinates, m the head index and c the channel index.
// Returns the weighted sum of the four neighbouring samples; any neighbour
// that lies outside the map contributes zero.
//
// Only h_low >= 0 / h_high <= height - 1 (and the same for w) are checked
// here, so the caller must already guarantee -1 < h < height and
// -1 < w < width, as the kernels in this file do before calling.
// NOTE(review): the template parameter list is missing in this copy;
// scalar_t is presumably its type parameter -- confirm against upstream.
template
__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c)
{
// Integer corners surrounding (h, w).
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
// Fractional offsets from the low corner and their complements.
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
// Element strides for one step in x and one step in y.
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
// Offset of (head m, channel c) within one pixel.
const int base_ptr = m * channels + c;
// v1: (h_low, w_low)
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
}
// v2: (h_low, w_high)
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
}
// v3: (h_high, w_low)
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
}
// v4: (h_high, w_high)
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
}
// Bilinear weights of the four corners.
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
// Backward of one bilinear sample, writing the per-sample gradients with
// plain stores.
//
// For the sample at fractional position (h, w), head m, channel c:
//  - atomically adds (corner weight * top_grad * attn_weight) to grad_value at
//    each in-range corner;
//  - OVERWRITES *grad_attn_weight with top_grad * (interpolated value);
//  - OVERWRITES grad_sampling_loc[0] / [1] with the gradient w.r.t. the
//    sampling location in w / h. The factors `width` and `height` account for
//    the callers computing w = loc_w * width - 0.5 and h = loc_h * height - 0.5.
//
// Because the last three outputs are stored rather than accumulated, the
// callers in this file pass per-thread scratch slots and reduce afterwards.
// The caller must guarantee -1 < h < height and -1 < w < width.
// NOTE(review): the template parameter list is missing in this copy;
// scalar_t is presumably its type parameter -- confirm against upstream.
template
__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c,
const scalar_t &top_grad,
const scalar_t &attn_weight,
scalar_t* &grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
// Integer corners surrounding (h, w) and the fractional offsets.
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
// Element strides: one step in x, one step in y.
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
// Gradient reaching the interpolated value (before corner weighting).
const scalar_t top_grad_value = top_grad * attn_weight;
// Accumulate d(val)/dh and d(val)/dw from the in-range corners.
scalar_t grad_h_weight = 0, grad_w_weight = 0;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
atomicAdd(grad_value+ptr1, w1*top_grad_value);
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
atomicAdd(grad_value+ptr2, w2*top_grad_value);
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
atomicAdd(grad_value+ptr3, w3*top_grad_value);
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
atomicAdd(grad_value+ptr4, w4*top_grad_value);
}
// Interpolated value, needed for the attention-weight gradient.
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
// Plain stores: the destinations are overwritten, not accumulated.
*grad_attn_weight = top_grad * val;
*grad_sampling_loc = width * grad_w_weight * top_grad_value;
*(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
}
// Backward of one bilinear sample, accumulating every output with atomicAdd.
//
// Same computation as ms_deform_attn_col2im_bilinear, except that the
// attention-weight gradient and the two sampling-location gradients are
// atomically ADDED to their destinations instead of being stored. This lets
// several threads target the same grad_sampling_loc / grad_attn_weight entry
// directly.
// The caller must guarantee -1 < h < height and -1 < w < width.
// NOTE(review): the template parameter list is missing in this copy;
// scalar_t is presumably its type parameter -- confirm against upstream.
template
__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c,
const scalar_t &top_grad,
const scalar_t &attn_weight,
scalar_t* &grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
// Integer corners surrounding (h, w) and the fractional offsets.
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
// Element strides: one step in x, one step in y.
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
// Gradient reaching the interpolated value (before corner weighting).
const scalar_t top_grad_value = top_grad * attn_weight;
// Accumulate d(val)/dh and d(val)/dw from the in-range corners.
scalar_t grad_h_weight = 0, grad_w_weight = 0;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
atomicAdd(grad_value+ptr1, w1*top_grad_value);
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
atomicAdd(grad_value+ptr2, w2*top_grad_value);
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
atomicAdd(grad_value+ptr3, w3*top_grad_value);
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
atomicAdd(grad_value+ptr4, w4*top_grad_value);
}
// Interpolated value, needed for the attention-weight gradient.
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
// Atomic accumulation: other threads may target the same entries.
atomicAdd(grad_attn_weight, top_grad * val);
atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
}
// Forward kernel: one output element per loop index.
//
// `index` enumerates (batch, query, head, channel) with channel fastest.
// For that element the kernel walks every level and every sampling point,
// bilinearly samples data_value at the given location, scales by the
// attention weight and sums everything into data_col[index].
//
// Layout facts used below:
//  - data_spatial_shapes holds (height, width) pairs, one pair per level;
//  - data_sampling_loc holds (w, h) pairs; they are multiplied by the level's
//    width/height and shifted by -0.5 to obtain pixel coordinates;
//  - data_attn_weight has one entry per (batch, query, head, level, point).
// batch_size is accepted but not read in the body.
// NOTE(review): the template parameter list is missing in this copy;
// scalar_t is presumably its type parameter -- confirm against upstream.
template
__global__ void ms_deformable_im2col_gpu_kernel(const int n,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
// Decode index -> (b_col, q_col, m_col, c_col).
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
// Flat (batch, query, head) index; selects this element's sampling data.
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
// q_col is computed but not used afterwards.
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
scalar_t *data_col_ptr = data_col + index;
// Running offsets into the attention weights and the (w, h) location pairs.
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int qid_stride = num_heads * channels;
// Start of this batch element inside data_value.
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
scalar_t col = 0;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
// Base of this level's feature map for the current batch element.
const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
// Scale the location by the level size and shift by half a pixel.
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
// Skip samples with no in-range neighbour; they contribute zero.
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
*data_col_ptr = col;
}
}
// Backward kernel: static shared-memory scratch, serial reduction by thread 0.
//
// Each thread handles one (batch, query, head, channel) element. For every
// (level, point) it writes its own contribution to the sampling-location and
// attention-weight gradients into its slot of two __shared__ arrays sized by
// the compile-time constant blockSize. After a barrier, thread 0 sums all
// blockSize slots and stores the totals to global memory with plain writes.
// grad_value is updated with atomicAdd inside the bilinear helper.
//
// NOTE(review): thread 0 writes to the address derived from its own
// sampling_index, so the reduction is only meaningful if every thread of the
// block shares that sampling_index and blockSize equals blockDim.x --
// presumably the launcher uses one block per (batch, query, head) with
// blockDim.x == channels; confirm.
// NOTE(review): grad_sampling_loc / grad_attn_weight are advanced in place
// inside the loop body and never reset, and the barriers require all threads
// of a block to run the same number of iterations; this looks like it relies
// on each thread executing the loop body once -- confirm.
// NOTE(review): the template parameter list is missing in this copy;
// scalar_t and blockSize are presumably its parameters -- confirm upstream.
template
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
// Per-thread scratch: two entries (w, h) for the location gradient and one
// for the attention-weight gradient.
__shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
__shared__ scalar_t cache_grad_attn_weight[blockSize];
unsigned int tid = threadIdx.x;
// Decode index -> (b_col, q_col, m_col, c_col); q_col is not used afterwards.
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
// Upstream gradient for this output element.
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
// Move the output pointers to this element's first (level, point) entry.
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
// Clear this thread's slots so an out-of-range sample contributes zero.
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
// All slots must be written before thread 0 reads them.
__syncthreads();
if (tid == 0)
{
// Serial sum over every slot; the loop variable below shadows the outer tid.
scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
int sid=2;
for (unsigned int tid = 1; tid < blockSize; ++tid)
{
_grad_w += cache_grad_sampling_loc[sid];
_grad_h += cache_grad_sampling_loc[sid + 1];
_grad_a += cache_grad_attn_weight[tid];
sid += 2;
}
*grad_sampling_loc = _grad_w;
*(grad_sampling_loc + 1) = _grad_h;
*grad_attn_weight = _grad_a;
}
// Keep the slots intact until thread 0 has finished reading them.
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
// Backward kernel: static shared-memory scratch, tree reduction.
//
// Same per-thread work as the serial "blocksize_aware_reduce_v1" variant,
// but the per-thread contributions are combined with a halving tree
// reduction (s = blockSize/2, blockSize/4, ..., 1) with a barrier after each
// step. Thread 0 then stores slot 0 to global memory with plain writes.
// The halving scheme only folds in every slot when blockSize is a power of
// two; there is no handling of an odd remainder in this variant.
//
// NOTE(review): the result is written to the address derived from thread 0's
// sampling_index, so every thread of the block is presumably expected to share
// it and blockSize to equal blockDim.x -- confirm against the launcher.
// NOTE(review): grad_sampling_loc / grad_attn_weight are advanced in place
// inside the loop body and never reset, and the barriers require all threads
// of a block to run the same number of iterations; this looks like it relies
// on each thread executing the loop body once -- confirm.
// NOTE(review): the template parameter list is missing in this copy;
// scalar_t and blockSize are presumably its parameters -- confirm upstream.
template
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
// Per-thread scratch: two entries (w, h) for the location gradient and one
// for the attention-weight gradient.
__shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
__shared__ scalar_t cache_grad_attn_weight[blockSize];
unsigned int tid = threadIdx.x;
// Decode index -> (b_col, q_col, m_col, c_col); q_col is not used afterwards.
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
// Upstream gradient for this output element.
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
// Move the output pointers to this element's first (level, point) entry.
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
// Clear this thread's slots so an out-of-range sample contributes zero.
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
// All slots must be written before the reduction starts.
__syncthreads();
// Tree reduction: at each step the lower s threads absorb the upper s slots.
for (unsigned int s=blockSize/2; s>0; s>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
}
__syncthreads();
}
// Slot 0 now holds the block total.
if (tid == 0)
{
*grad_sampling_loc = cache_grad_sampling_loc[0];
*(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
*grad_attn_weight = cache_grad_attn_weight[0];
}
// Keep slot 0 intact until thread 0 has stored it.
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
// Backward kernel: dynamic shared-memory scratch, serial reduction by thread 0.
//
// Same algorithm as the static "blocksize_aware_reduce_v1" variant, but the
// scratch space comes from the dynamically sized `extern __shared__` buffer
// and the reduction length is blockDim.x. The buffer is split into
// 2 * blockDim.x scalar_t entries for the location gradients followed by
// blockDim.x entries for the attention-weight gradients, so the launch must
// supply at least 3 * blockDim.x * sizeof(scalar_t) bytes of shared memory.
//
// NOTE(review): the result is written to the address derived from thread 0's
// sampling_index, so every thread of the block is presumably expected to share
// it -- confirm against the launcher.
// NOTE(review): grad_sampling_loc / grad_attn_weight are advanced in place
// inside the loop body and never reset, and the barriers require all threads
// of a block to run the same number of iterations; this looks like it relies
// on each thread executing the loop body once -- confirm.
// NOTE(review): the template parameter list is missing in this copy;
// scalar_t is presumably its type parameter -- confirm against upstream.
template
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
// Carve the dynamic shared buffer into the two scratch arrays.
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
// Decode index -> (b_col, q_col, m_col, c_col); q_col is not used afterwards.
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
// Upstream gradient for this output element.
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
// Move the output pointers to this element's first (level, point) entry.
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
// Clear this thread's slots so an out-of-range sample contributes zero.
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
// All slots must be written before thread 0 reads them.
__syncthreads();
if (tid == 0)
{
// Serial sum over every slot; the loop variable below shadows the outer tid.
scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
int sid=2;
for (unsigned int tid = 1; tid < blockDim.x; ++tid)
{
_grad_w += cache_grad_sampling_loc[sid];
_grad_h += cache_grad_sampling_loc[sid + 1];
_grad_a += cache_grad_attn_weight[tid];
sid += 2;
}
*grad_sampling_loc = _grad_w;
*(grad_sampling_loc + 1) = _grad_h;
*grad_attn_weight = _grad_a;
}
// Keep the slots intact until thread 0 has finished reading them.
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
// Backward kernel: dynamic shared-memory scratch, tree reduction that also
// handles block sizes that are not a power of two.
//
// Scratch layout matches the other dynamic-shared-memory variants:
// 2 * blockDim.x location entries followed by blockDim.x weight entries,
// taken from the `extern __shared__` buffer. The reduction halves s each step
// while tracking the previous width in spre; when spre is odd, thread 0
// additionally folds in the one leftover slot at index 2 * s. Thread 0 then
// stores slot 0 to global memory with plain writes.
//
// NOTE(review): the result is written to the address derived from thread 0's
// sampling_index, so every thread of the block is presumably expected to share
// it -- confirm against the launcher.
// NOTE(review): grad_sampling_loc / grad_attn_weight are advanced in place
// inside the loop body and never reset, and the barriers require all threads
// of a block to run the same number of iterations; this looks like it relies
// on each thread executing the loop body once -- confirm.
// NOTE(review): the template parameter list is missing in this copy;
// scalar_t is presumably its type parameter -- confirm against upstream.
template
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
// Carve the dynamic shared buffer into the two scratch arrays.
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
// Decode index -> (b_col, q_col, m_col, c_col); q_col is not used afterwards.
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
// Upstream gradient for this output element.
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
// Move the output pointers to this element's first (level, point) entry.
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
// Clear this thread's slots so an out-of-range sample contributes zero.
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
// All slots must be written before the reduction starts.
__syncthreads();
// Tree reduction; spre is the number of live slots before this step.
for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
// Odd spre leaves one slot at index 2*s unpaired; fold it in here.
if (tid + (s << 1) < spre)
{
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
}
}
__syncthreads();
}
// Slot 0 now holds the block total.
if (tid == 0)
{
*grad_sampling_loc = cache_grad_sampling_loc[0];
*(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
*grad_attn_weight = cache_grad_attn_weight[0];
}
// Keep slot 0 intact until thread 0 has stored it.
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
// Backward kernel: dynamic shared-memory tree reduction, block totals merged
// into global memory with atomicAdd.
//
// Identical to the "shm_reduce_v2" variant up to the end of the in-block
// reduction (2 * blockDim.x location entries followed by blockDim.x weight
// entries in the `extern __shared__` buffer, odd-width handling via spre).
// The difference is the final step: thread 0 atomically ADDS the block total
// to grad_sampling_loc / grad_attn_weight instead of storing it, so several
// blocks can contribute to the same entry. The launcher selects this variant
// when channels exceeds 1024 and is a multiple of 1024.
//
// NOTE(review): correctness of the accumulation presumably depends on the
// output buffers being zero-initialised by the caller -- confirm.
// NOTE(review): grad_sampling_loc / grad_attn_weight are advanced in place
// inside the loop body and never reset, and the barriers require all threads
// of a block to run the same number of iterations; this looks like it relies
// on each thread executing the loop body once -- confirm.
// NOTE(review): the template parameter list is missing in this copy;
// scalar_t is presumably its type parameter -- confirm against upstream.
template
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
// Carve the dynamic shared buffer into the two scratch arrays.
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
// Decode index -> (b_col, q_col, m_col, c_col); q_col is not used afterwards.
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
// Upstream gradient for this output element.
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
// Move the output pointers to this element's first (level, point) entry.
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
// Clear this thread's slots so an out-of-range sample contributes zero.
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
// All slots must be written before the reduction starts.
__syncthreads();
// Tree reduction; spre is the number of live slots before this step.
for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
// Odd spre leaves one slot at index 2*s unpaired; fold it in here.
if (tid + (s << 1) < spre)
{
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
}
}
__syncthreads();
}
// Merge this block's total into global memory; other blocks may add to
// the same entries, hence atomicAdd rather than a plain store.
if (tid == 0)
{
atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
}
// Keep slot 0 intact until thread 0 has consumed it.
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template
__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear_gm(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
grad_sampling_loc, grad_attn_weight);
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
// Host-side launcher for the forward kernel ms_deformable_im2col_gpu_kernel.
//
// Computes the total number of output elements
// (batch_size * num_query * num_heads * channels), launches the kernel with
// that count and the caller's arguments forwarded unchanged, then checks
// cudaGetLastError(). A launch error is only printed with printf; it is not
// returned or thrown, so callers cannot detect it from this function.
//
// num_kernels and num_actual_kernels are computed from the same expression
// and therefore always equal.
// NOTE(review): the template parameter list and the <<<...>>> launch
// configuration are missing in this copy; stream, num_actual_kernels and
// num_threads are presumably consumed there -- confirm against upstream.
template
void ms_deformable_im2col_cuda(cudaStream_t stream,
const scalar_t* data_value,
const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index,
const scalar_t* data_sampling_loc,
const scalar_t* data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* data_col)
{
// One kernel work item per output element.
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
const int num_threads = CUDA_NUM_THREADS;
ms_deformable_im2col_gpu_kernel
<<>>(
num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
// Reports launch errors on stdout only; execution continues regardless.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
}
}
// Host-side launcher for the backward (col2im) pass of multi-scale deformable attention.
// Selects one of several gradient kernels based on `channels` and launches it; every
// kernel receives the same argument list. A launch error is printed, not returned.
// NOTE(review): the template parameter list, the kernels' template arguments and the
// <<<...>>> launch configurations appear empty in this copy of the file — confirm against
// the upstream source before editing.
template
void ms_deformable_col2im_cuda(cudaStream_t stream,
const scalar_t* grad_col,
const scalar_t* data_value,
const int64_t * data_spatial_shapes,
const int64_t * data_level_start_index,
const scalar_t * data_sampling_loc,
const scalar_t * data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
// min(channels, CUDA_NUM_THREADS); presumably the block size of each launch — confirm.
const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
// Product of batch, query, head and channel counts; handed to each kernel as its first argument.
const int num_kernels = batch_size * num_query * num_heads * channels;
// Same product as num_kernels; not referenced in the visible body
// (presumably consumed by the launch configurations — confirm).
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
// More than 1024 channels.
if (channels > 1024)
{
// channels is an exact multiple of 1024: use the multi-block shared-memory reduction kernel.
if ((channels & 1023) == 0)
{
ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
// Otherwise fall back to the "_gm" kernel.
else
{
ms_deformable_col2im_gpu_kernel_gm
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
}
// At most 1024 channels: power-of-two counts get a dedicated case, everything else hits default.
else{
switch(channels)
{
// Powers of two from 1 to 32 use the blocksize-aware v1 reduction kernel.
case 1:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 2:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 4:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 8:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 16:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 32:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
// Powers of two from 64 to 1024 use the blocksize-aware v2 reduction kernel.
case 64:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 128:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 256:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 512:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 1024:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
// Any other channel count up to 1024: pick the generic reduction kernel by size.
default:
// Fewer than 64 channels: generic v1 shared-memory reduction.
if (channels < 64)
{
ms_deformable_col2im_gpu_kernel_shm_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
// 64 channels or more: generic v2 shared-memory reduction.
else
{
ms_deformable_col2im_gpu_kernel_shm_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
}
}
// A failed launch is only printed; the function returns void, so callers are not notified.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
}
}
================================================
FILE: mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include "cpu/ms_deform_attn_cpu.h"
#ifdef WITH_CUDA
#include "cuda/ms_deform_attn_cuda.h"
#endif
// Device dispatcher for the forward pass of multi-scale deformable attention.
//
// When `value` lives on a CUDA device and the extension was built with WITH_CUDA,
// all arguments are forwarded unchanged to ms_deform_attn_cuda_forward and its
// result is returned. In every other situation an error is raised:
//   - CUDA tensor but no WITH_CUDA build: "Not compiled with GPU support"
//   - non-CUDA tensor:                    "Not implemented on the CPU"
at::Tensor
ms_deform_attn_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    // Ask the tensor itself for its device; Tensor::type() is deprecated.
    if (value.is_cuda())
    {
#ifdef WITH_CUDA
        return ms_deform_attn_cuda_forward(
            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
#else
        // TORCH_CHECK(false, ...) is the supported replacement for the deprecated AT_ERROR.
        TORCH_CHECK(false, "Not compiled with GPU support");
#endif
    }
    TORCH_CHECK(false, "Not implemented on the CPU");
}
std::vector
ms_deform_attn_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
if (value.type().is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_backward(
value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
================================================
FILE: mask2former/modeling/pixel_decoder/ops/src/vision.cpp
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include "ms_deform_attn.h"
// Python bindings: expose the two device dispatchers declared in ms_deform_attn.h
// under their C++ names in the extension module named by TORCH_EXTENSION_NAME.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
}
================================================
FILE: mask2former/modeling/pixel_decoder/ops/test.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import time
import torch
import torch.nn as nn
from torch.autograd import gradcheck
from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
# Problem sizes shared by the checks below. `value` is drawn with shape (N, S, M, D);
# sampling locations with shape (N, Lq, M, L, P, 2).
# NOTE(review): N/M/D/Lq/L/P presumably stand for batch, heads, channels, queries,
# levels and points — confirm against the op's documentation.
N, M, D = 1, 2, 2
Lq, L, P = 2, 2, 2
# One (H, W) pair per row, stored as a long tensor on the GPU.
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
# Exclusive running sum of H*W: a leading zero followed by the cumulative sizes of all rows but the last.
level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
# Sum of H*W over all rows of `shapes`.
S = sum([(H*W).item() for H, W in shapes])
# Seed the RNG so the random inputs of every check are reproducible.
torch.manual_seed(3)
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
    """Compare the CUDA forward op with the PyTorch reference in float64.

    Random inputs are generated on the GPU, both implementations are run on
    their double-precision copies, and the `torch.allclose` verdict plus the
    maximum absolute and relative errors are printed.
    """
    # The three draws happen in a fixed order so the seeded RNG yields the same data.
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise so the weights sum to one over the last two axes.
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2

    reference = ms_deform_attn_core_pytorch(
        value.double(),
        shapes,
        sampling_locations.double(),
        attention_weights.double(),
    )
    output_pytorch = reference.detach().cpu()

    result = MSDeformAttnFunction.apply(
        value.double(),
        shapes,
        level_start_index,
        sampling_locations.double(),
        attention_weights.double(),
        im2col_step,
    )
    output_cuda = result.detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch)
    abs_diff = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / output_pytorch.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
    """Compare the CUDA forward op with the PyTorch reference in float32.

    Same procedure as the double-precision check, but the inputs are used
    as generated and the comparison uses rtol=1e-2 and atol=1e-3.
    """
    # The three draws happen in a fixed order so the seeded RNG yields the same data.
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise so the weights sum to one over the last two axes.
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2

    reference = ms_deform_attn_core_pytorch(
        value, shapes, sampling_locations, attention_weights
    )
    output_pytorch = reference.detach().cpu()

    result = MSDeformAttnFunction.apply(
        value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step
    )
    output_cuda = result.detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
    abs_diff = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / output_pytorch.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
    """Run `torch.autograd.gradcheck` on the CUDA op and print the verdict.

    Args:
        channels: size of the last axis of the random `value` tensor.
        grad_value: whether `value` requires gradients.
        grad_sampling_loc: whether the sampling locations require gradients.
        grad_attn_weight: whether the attention weights require gradients.
    """
    # The three draws happen in a fixed order so the seeded RNG yields the same data.
    value = torch.rand(N, S, M, channels).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise so the weights sum to one over the last two axes.
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2
    func = MSDeformAttnFunction.apply

    # Gradient flags are set on the float32 tensors before the float64 copies are made.
    value.requires_grad = grad_value
    sampling_locations.requires_grad = grad_sampling_loc
    attention_weights.requires_grad = grad_attn_weight

    gradcheck_inputs = (
        value.double(),
        shapes,
        level_start_index,
        sampling_locations.double(),
        attention_weights.double(),
        im2col_step,
    )
    gradok = gradcheck(func, gradcheck_inputs)

    print(f'* {gradok} check_gradient_numerical(D={channels})')
if __name__ == '__main__':
    # Forward checks first, in double then single precision.
    check_forward_equal_with_pytorch_double()
    check_forward_equal_with_pytorch_float()

    # Gradient checks over a spread of channel counts, with all three gradients enabled.
    for channels in (30, 32, 64, 71, 1025, 2048, 3096):
        check_gradient_numerical(channels, True, True, True)
================================================
FILE: mask2former/modeling/transformer_decoder/__init__.py
================================================
from .maskformer_transformer_decoder import StandardTransformerDecoder
from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder
================================================
FILE: mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
import logging
import fvcore.nn.weight_init as weight_init
from typing import Optional
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import Conv2d
from .position_encoding import PositionEmbeddingSine
from .maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY
class SelfAttentionLayer(nn.Module):
    """Multi-head self-attention with a residual connection and one LayerNorm.

    `normalize_before` selects between the pre-norm path (`forward_pre`) and
    the post-norm path (`forward_post`). The activation function is resolved
    and stored, but neither attention path in this class calls it.
    """

    def __init__(self, d_model, nhead, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter that has more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Return `tensor + pos`, or `tensor` itself when `pos` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt,
                     tgt_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        """Post-norm path: attend, add the residual, then normalise."""
        # Queries and keys share the positional embedding; values do not get it.
        qk = self.with_pos_embed(tgt, query_pos)
        attended = self.self_attn(
            qk, qk, value=tgt,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask,
        )[0]
        return self.norm(tgt + self.dropout(attended))

    def forward_pre(self, tgt,
                    tgt_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        """Pre-norm path: normalise, attend, then add the un-normalised residual."""
        normed = self.norm(tgt)
        qk = self.with_pos_embed(normed, query_pos)
        attended = self.self_attn(
            qk, qk, value=normed,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask,
        )[0]
        return tgt + self.dropout(attended)

    def forward(self, tgt,
                tgt_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Dispatch to the pre-norm or post-norm path according to `normalize_before`."""
        path = self.forward_pre if self.normalize_before else self.forward_post
        return path(tgt, tgt_mask, tgt_key_padding_mask, query_pos)
class CrossAttentionLayer(nn.Module):
    """Multi-head cross-attention with a residual connection and one LayerNorm.

    Queries come from `tgt`, keys and values from `memory`. `normalize_before`
    selects between the pre-norm path (`forward_pre`) and the post-norm path
    (`forward_post`). The activation function is resolved and stored, but
    neither attention path in this class calls it.
    """

    def __init__(self, d_model, nhead, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter that has more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Return `tensor + pos`, or `tensor` itself when `pos` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt, memory,
                     memory_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        """Post-norm path: attend to `memory`, add the residual, then normalise."""
        # Positional embeddings are added to queries and keys only; values stay raw.
        queries = self.with_pos_embed(tgt, query_pos)
        keys = self.with_pos_embed(memory, pos)
        attended = self.multihead_attn(
            query=queries,
            key=keys,
            value=memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )[0]
        return self.norm(tgt + self.dropout(attended))

    def forward_pre(self, tgt, memory,
                    memory_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        """Pre-norm path: normalise `tgt`, attend to `memory`, add the raw residual."""
        normed = self.norm(tgt)
        queries = self.with_pos_embed(normed, query_pos)
        keys = self.with_pos_embed(memory, pos)
        attended = self.multihead_attn(
            query=queries,
            key=keys,
            value=memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )[0]
        return tgt + self.dropout(attended)

    def forward(self, tgt, memory,
                memory_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Dispatch to the pre-norm or post-norm path according to `normalize_before`."""
        path = self.forward_pre if self.normalize_before else self.forward_post
        return path(tgt, memory, memory_mask,
                    memory_key_padding_mask, pos, query_pos)
class FFNLayer(nn.Module):
    """Two-layer feed-forward block with a residual connection and one LayerNorm.

    `normalize_before` selects between the pre-norm path (`forward_pre`) and
    the post-norm path (`forward_post`).
    """

    def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        # Feed-forward network: d_model -> dim_feedforward -> d_model.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm = nn.LayerNorm(d_model)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter that has more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Return `tensor + pos`, or `tensor` itself when `pos` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt):
        """Post-norm path: feed-forward, add the residual, then normalise."""
        hidden = self.activation(self.linear1(tgt))
        update = self.linear2(self.dropout(hidden))
        return self.norm(tgt + self.dropout(update))

    def forward_pre(self, tgt):
        """Pre-norm path: normalise, feed-forward, then add the raw residual."""
        hidden = self.activation(self.linear1(self.norm(tgt)))
        update = self.linear2(self.dropout(hidden))
        return tgt + self.dropout(update)

    def forward(self, tgt):
        """Dispatch to the pre-norm or post-norm path according to `normalize_before`."""
        path = self.forward_pre if self.normalize_before else self.forward_post
        return path(tgt)
def _get_activation_fn(activation):
    """Return the functional activation named by *activation*.

    Args:
        activation: one of "relu", "gelu" or "glu".

    Returns:
        The matching function from `torch.nn.functional`.

    Raises:
        RuntimeError: if *activation* is not one of the supported names.
    """
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    # The message lists every accepted name, including "glu".
    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
class MLP(nn.Module):
    """Stack of `num_layers` linear layers with ReLU between them (no ReLU after the last)."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Every intermediate layer has width hidden_dim.
        hidden_dims = [hidden_dim] * (num_layers - 1)
        in_dims = [input_dim] + hidden_dims
        out_dims = hidden_dims + [output_dim]
        self.layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(in_dims, out_dims)]
        )

    def forward(self, x):
        last_index = self.num_layers - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            if index < last_index:
                x = F.relu(x)
        return x
@TRANSFORMER_DECODER_REGISTRY.register()
class MultiScaleMaskedTransformerDecoder(nn.Module):
    """Masked-attention transformer decoder over three feature levels.

    Each decoder layer runs cross-attention (restricted by a mask derived from
    the previous mask prediction), then self-attention, then a feed-forward
    block. Class and mask predictions are produced before the first layer and
    after every layer.
    """

    _version = 2

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        """Rename legacy `static_query` keys to `query_feat` for checkpoints older than version 2."""
        version = local_metadata.get("version", None)
        if version is not None and version >= 2:
            return

        logger = logging.getLogger(__name__)
        # Stay silent when nothing is renamed (e.g. training from scratch).
        renamed_any = False
        for old_key in list(state_dict.keys()):
            if "static_query" not in old_key:
                continue
            new_key = old_key.replace("static_query", "query_feat")
            state_dict[new_key] = state_dict[old_key]
            del state_dict[old_key]
            renamed_any = True

        if renamed_any:
            logger.warning(
                f"Weight format of {self.__class__.__name__} have changed! "
                "Please upgrade your models. Applying automatic conversion now ..."
            )

    @configurable
    def __init__(
        self,
        in_channels,
        mask_classification=True,
        *,
        num_classes: int,
        hidden_dim: int,
        num_queries: int,
        nheads: int,
        dim_feedforward: int,
        dec_layers: int,
        pre_norm: bool,
        mask_dim: int,
        enforce_input_project: bool,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            in_channels: channels of the input features
            mask_classification: whether to add mask classifier or not
                (must be True; anything else fails an assertion)
            num_classes: number of classes
            hidden_dim: Transformer feature dimension
            num_queries: number of queries
            nheads: number of heads
            dim_feedforward: feature dimension in feedforward network
            dec_layers: number of Transformer decoder layers
            pre_norm: whether to use pre-LayerNorm or not
            mask_dim: mask feature dimension
            enforce_input_project: add input project 1x1 conv even if input
                channels and hidden dim is identical
        """
        super().__init__()

        assert mask_classification, "Only support mask classification model"
        self.mask_classification = mask_classification

        # Sine positional encoding; each half of the encoding gets hidden_dim // 2 features.
        N_steps = hidden_dim // 2
        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)

        # Transformer decoder layers.
        self.num_heads = nheads
        self.num_layers = dec_layers
        self.transformer_self_attention_layers = nn.ModuleList()
        self.transformer_cross_attention_layers = nn.ModuleList()
        self.transformer_ffn_layers = nn.ModuleList()

        attention_kwargs = dict(
            d_model=hidden_dim,
            nhead=nheads,
            dropout=0.0,
            normalize_before=pre_norm,
        )
        for _ in range(self.num_layers):
            self.transformer_self_attention_layers.append(
                SelfAttentionLayer(**attention_kwargs)
            )
            self.transformer_cross_attention_layers.append(
                CrossAttentionLayer(**attention_kwargs)
            )
            self.transformer_ffn_layers.append(
                FFNLayer(
                    d_model=hidden_dim,
                    dim_feedforward=dim_feedforward,
                    dropout=0.0,
                    normalize_before=pre_norm,
                )
            )

        self.decoder_norm = nn.LayerNorm(hidden_dim)

        self.num_queries = num_queries
        # Learnable query features.
        self.query_feat = nn.Embedding(num_queries, hidden_dim)
        # Learnable query positional embedding.
        self.query_embed = nn.Embedding(num_queries, hidden_dim)

        # Level embedding (always 3 scales).
        self.num_feature_levels = 3
        self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)

        # Per-level input projection: a 1x1 conv when needed, otherwise an identity.
        self.input_proj = nn.ModuleList()
        needs_projection = in_channels != hidden_dim or enforce_input_project
        for _ in range(self.num_feature_levels):
            if needs_projection:
                projection = Conv2d(in_channels, hidden_dim, kernel_size=1)
                self.input_proj.append(projection)
                weight_init.c2_xavier_fill(projection)
            else:
                self.input_proj.append(nn.Sequential())

        # Output heads.
        if self.mask_classification:
            self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)

    @classmethod
    def from_config(cls, cfg, in_channels, mask_classification):
        """Translate a config node into the keyword arguments of `__init__`."""
        mask_former = cfg.MODEL.MASK_FORMER
        ret = {
            "in_channels": in_channels,
            "mask_classification": mask_classification,
            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
            "hidden_dim": mask_former.HIDDEN_DIM,
            "num_queries": mask_former.NUM_OBJECT_QUERIES,
            # Transformer parameters:
            "nheads": mask_former.NHEADS,
            "dim_feedforward": mask_former.DIM_FEEDFORWARD,
        }

        # NOTE: the learnable query features are supervised too, so one
        # prediction is made before any decoder layer runs. One layer is
        # subtracted here so that the total number of predictions (and hence
        # losses) still matches the configured DEC_LAYERS.
        assert cfg.MODEL.MASK_FORMER.DEC_LAYERS >= 1
        ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS - 1
        ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
        ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ

        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM

        return ret

    def forward(self, x, mask_features, mask = None):
        """Decode queries against the multi-scale features `x`.

        Returns a dict with the final `pred_logits` and `pred_masks` plus
        `aux_outputs`, the predictions of all earlier stages.
        """
        # x is a list of multi-scale features.
        assert len(x) == self.num_feature_levels
        src, pos, size_list = [], [], []

        # The padding mask is intentionally unused; it does not affect performance.
        del mask

        for i in range(self.num_feature_levels):
            feature = x[i]
            size_list.append(feature.shape[-2:])
            level_pos = self.pe_layer(feature, None).flatten(2)
            level_src = (
                self.input_proj[i](feature).flatten(2)
                + self.level_embed.weight[i][None, :, None]
            )

            # Flattened NxCxHW -> HWxNxC.
            pos.append(level_pos.permute(2, 0, 1))
            src.append(level_src.permute(2, 0, 1))

        bs = src[0].shape[1]

        # QxNxC
        query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
        output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)

        predictions_class = []
        predictions_mask = []

        # Prediction heads on the learnable query features, before any decoder layer.
        outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(
            output, mask_features, attn_mask_target_size=size_list[0]
        )
        predictions_class.append(outputs_class)
        predictions_mask.append(outputs_mask)

        for i in range(self.num_layers):
            level_index = i % self.num_feature_levels
            # A query whose mask blocks every position would attend to nothing; unblock it entirely.
            attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False

            # Cross-attention comes first.
            output = self.transformer_cross_attention_layers[i](
                output, src[level_index],
                memory_mask=attn_mask,
                memory_key_padding_mask=None,  # no masking on padded regions
                pos=pos[level_index], query_pos=query_embed
            )

            output = self.transformer_self_attention_layers[i](
                output, tgt_mask=None,
                tgt_key_padding_mask=None,
                query_pos=query_embed
            )

            # FFN
            output = self.transformer_ffn_layers[i](output)

            # The new attention mask is sized for the level the next layer will attend to.
            next_level = (i + 1) % self.num_feature_levels
            outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(
                output, mask_features, attn_mask_target_size=size_list[next_level]
            )
            predictions_class.append(outputs_class)
            predictions_mask.append(outputs_mask)

        assert len(predictions_class) == self.num_layers + 1

        aux_outputs = self._set_aux_loss(
            predictions_class if self.mask_classification else None, predictions_mask
        )
        return {
            'pred_logits': predictions_class[-1],
            'pred_masks': predictions_mask[-1],
            'aux_outputs': aux_outputs,
        }

    def forward_prediction_heads(self, output, mask_features, attn_mask_target_size):
        """Produce class logits, mask logits and the boolean attention mask for the next layer."""
        decoder_output = self.decoder_norm(output).transpose(0, 1)
        outputs_class = self.class_embed(decoder_output)
        mask_embed = self.mask_embed(decoder_output)
        outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)

        # NOTE: the mask prediction is of higher resolution than the attention target.
        # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW]
        resized = F.interpolate(
            outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False
        )
        per_head = (
            resized.sigmoid()
            .flatten(2)
            .unsqueeze(1)
            .repeat(1, self.num_heads, 1, 1)
            .flatten(0, 1)
        )
        # Must be bool: True marks positions that are NOT allowed to attend,
        # False leaves the position unchanged.
        attn_mask = (per_head < 0.5).bool()
        attn_mask = attn_mask.detach()

        return outputs_class, outputs_mask, attn_mask

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_seg_masks):
        # Workaround for torchscript, which does not support dictionaries with
        # non-homogeneous values (e.g. a dict holding both a Tensor and a list).
        if not self.mask_classification:
            return [{"pred_masks": masks} for masks in outputs_seg_masks[:-1]]
        return [
            {"pred_logits": logits, "pred_masks": masks}
            for logits, masks in zip(outputs_class[:-1], outputs_seg_masks[:-1])
        ]
================================================
FILE: mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import Conv2d
from detectron2.utils.registry import Registry
from .position_encoding import PositionEmbeddingSine
from .transformer import Transformer
# Registry of transformer decoder classes; build_transformer_decoder() looks entries up
# by the name stored in cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME.
TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE")
TRANSFORMER_DECODER_REGISTRY.__doc__ = """
Registry for transformer module in MaskFormer.
"""
def build_transformer_decoder(cfg, in_channels, mask_classification=True):
    """
    Build the transformer decoder registered under
    `cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME`.

    The registered entry is called with `(cfg, in_channels, mask_classification)`
    and its result is returned.
    """
    decoder_factory = TRANSFORMER_DECODER_REGISTRY.get(
        cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME
    )
    return decoder_factory(cfg, in_channels, mask_classification)
@TRANSFORMER_DECODER_REGISTRY.register()
class StandardTransformerDecoder(nn.Module):
    """Single-scale transformer decoder that predicts class logits and mask logits per query."""

    @configurable
    def __init__(
        self,
        in_channels,
        mask_classification=True,
        *,
        num_classes: int,
        hidden_dim: int,
        num_queries: int,
        nheads: int,
        dropout: float,
        dim_feedforward: int,
        enc_layers: int,
        dec_layers: int,
        pre_norm: bool,
        deep_supervision: bool,
        mask_dim: int,
        enforce_input_project: bool,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            in_channels: channels of the input features
            mask_classification: whether to add mask classifier or not
            num_classes: number of classes
            hidden_dim: Transformer feature dimension
            num_queries: number of queries
            nheads: number of heads
            dropout: dropout in Transformer
            dim_feedforward: feature dimension in feedforward network
            enc_layers: number of Transformer encoder layers
            dec_layers: number of Transformer decoder layers
            pre_norm: whether to use pre-LayerNorm or not
            deep_supervision: whether to add supervision to every decoder layers
            mask_dim: mask feature dimension
            enforce_input_project: add input project 1x1 conv even if input
                channels and hidden dim is identical
        """
        super().__init__()

        self.mask_classification = mask_classification

        # Sine positional encoding; each half of the encoding gets hidden_dim // 2 features.
        N_steps = hidden_dim // 2
        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)

        transformer = Transformer(
            d_model=hidden_dim,
            dropout=dropout,
            nhead=nheads,
            dim_feedforward=dim_feedforward,
            num_encoder_layers=enc_layers,
            num_decoder_layers=dec_layers,
            normalize_before=pre_norm,
            return_intermediate_dec=deep_supervision,
        )

        self.num_queries = num_queries
        self.transformer = transformer
        # From here on, use the width reported by the transformer itself.
        hidden_dim = transformer.d_model

        self.query_embed = nn.Embedding(num_queries, hidden_dim)

        # 1x1 conv projection when needed, otherwise an identity.
        needs_projection = in_channels != hidden_dim or enforce_input_project
        if needs_projection:
            self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1)
            weight_init.c2_xavier_fill(self.input_proj)
        else:
            self.input_proj = nn.Sequential()
        self.aux_loss = deep_supervision

        # Output heads.
        if self.mask_classification:
            self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)

    @classmethod
    def from_config(cls, cfg, in_channels, mask_classification):
        """Translate a config node into the keyword arguments of `__init__`."""
        mask_former = cfg.MODEL.MASK_FORMER
        sem_seg_head = cfg.MODEL.SEM_SEG_HEAD
        return {
            "in_channels": in_channels,
            "mask_classification": mask_classification,
            "num_classes": sem_seg_head.NUM_CLASSES,
            "hidden_dim": mask_former.HIDDEN_DIM,
            "num_queries": mask_former.NUM_OBJECT_QUERIES,
            # Transformer parameters:
            "nheads": mask_former.NHEADS,
            "dropout": mask_former.DROPOUT,
            "dim_feedforward": mask_former.DIM_FEEDFORWARD,
            "enc_layers": mask_former.ENC_LAYERS,
            "dec_layers": mask_former.DEC_LAYERS,
            "pre_norm": mask_former.PRE_NORM,
            "deep_supervision": mask_former.DEEP_SUPERVISION,
            "enforce_input_project": mask_former.ENFORCE_INPUT_PROJ,
            "mask_dim": sem_seg_head.MASK_DIM,
        }

    def forward(self, x, mask_features, mask=None):
        """Run the transformer on `x` and project the decoder states to logits and masks.

        Returns a dict with `pred_masks`, plus `pred_logits` when mask
        classification is enabled and `aux_outputs` when deep supervision is on.
        """
        if mask is not None:
            # Resize the mask to the spatial size of x and convert it back to bool.
            resized = F.interpolate(mask[None].float(), size=x.shape[-2:])
            mask = resized.to(torch.bool)[0]
        pos = self.pe_layer(x, mask)

        hs, _ = self.transformer(self.input_proj(x), mask, self.query_embed.weight, pos)

        out = {}
        if self.mask_classification:
            outputs_class = self.class_embed(hs)
            out["pred_logits"] = outputs_class[-1]

        if not self.aux_loss:
            # Only the last decoder state is used.
            # [bs, queries, embed]
            mask_embed = self.mask_embed(hs[-1])
            out["pred_masks"] = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
            return out

        # [l, bs, queries, embed]
        mask_embed = self.mask_embed(hs)
        outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features)
        out["pred_masks"] = outputs_seg_masks[-1]
        out["aux_outputs"] = self._set_aux_loss(
            outputs_class if self.mask_classification else None, outputs_seg_masks
        )
        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_seg_masks):
        # Workaround for torchscript, which does not support dictionaries with
        # non-homogeneous values (e.g. a dict holding both a Tensor and a list).
        if not self.mask_classification:
            return [{"pred_masks": masks} for masks in outputs_seg_masks[:-1]]
        return [
            {"pred_logits": logits, "pred_masks": masks}
            for logits, masks in zip(outputs_class[:-1], outputs_seg_masks[:-1])
        ]
class MLP(nn.Module):
    """Plain feed-forward network: ``num_layers`` linear layers with ReLU in between."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer widths from input to output; consecutive pairs define each Linear.
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])
        )

    def forward(self, x):
        last = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            # No activation after the final projection.
            if idx < last:
                x = F.relu(x)
        return x
================================================
FILE: mask2former/modeling/transformer_decoder/position_encoding.py
================================================
# # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
"""
Various positional encodings for the transformer.
"""
import math
import torch
from torch import nn
class PositionEmbeddingSine(nn.Module):
    """
    Sinusoidal 2-D position encoding in the spirit of "Attention is all you need",
    extended to image grids: row and column positions are encoded separately and
    concatenated along the channel axis.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, x, mask=None):
        """Return a ``(N, 2 * num_pos_feats, H, W)`` encoding for ``x``.

        ``mask`` marks positions to ignore (True); when omitted every position
        of the ``(N, H, W)`` grid taken from ``x`` counts as valid.
        """
        if mask is None:
            grid_shape = (x.size(0), x.size(2), x.size(3))
            mask = torch.zeros(grid_shape, device=x.device, dtype=torch.bool)
        valid = ~mask
        # Running count of valid cells gives each cell its 1-based row / column index.
        y_embed = valid.cumsum(1, dtype=torch.float32)
        x_embed = valid.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        freq_index = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (freq_index // 2) / self.num_pos_feats)

        def _encode(embed):
            # sin on even channels, cos on odd channels, interleaved back together.
            angles = embed[:, :, :, None] / dim_t
            pair = (angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos())
            return torch.stack(pair, dim=4).flatten(3)

        pos_x = _encode(x_embed)
        pos_y = _encode(y_embed)
        return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)

    def __repr__(self, _repr_indent=4):
        pad = " " * _repr_indent
        fields = (
            ("num_pos_feats", self.num_pos_feats),
            ("temperature", self.temperature),
            ("normalize", self.normalize),
            ("scale", self.scale),
        )
        lines = ["Positional encoding " + self.__class__.__name__]
        lines.extend(pad + "{}: {}".format(key, value) for key, value in fields)
        return "\n".join(lines)
================================================
FILE: mask2former/modeling/transformer_decoder/transformer.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/transformer.py
"""
Transformer class.
Copy-paste from torch.nn.Transformer with modifications:
* positional encodings are passed in MHattention
* extra LN at the end of encoder is removed
* decoder returns a stack of activations from all decoding layers
"""
import copy
from typing import List, Optional
import torch
import torch.nn.functional as F
from torch import Tensor, nn
class Transformer(nn.Module):
    """Encoder-decoder transformer operating on flattened feature maps."""

    def __init__(
        self,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        num_decoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
        return_intermediate_dec=False,
    ):
        super().__init__()

        # Encoder and decoder layers share the same hyper-parameters.
        layer_args = (d_model, nhead, dim_feedforward, dropout, activation, normalize_before)

        encoder_layer = TransformerEncoderLayer(*layer_args)
        # A final encoder norm is only used in the pre-norm configuration.
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        decoder_layer = TransformerDecoderLayer(*layer_args)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec,
        )

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        """Xavier-initialise every parameter that has more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def forward(self, src, mask, query_embed, pos_embed):
        """Run encoder and decoder on an ``N x C x H x W`` feature map.

        Returns:
            tuple: decoder output with dims 1 and 2 swapped, and the encoder
            memory reshaped back to ``(N, C, H, W)``.
        """
        bs, c, h, w = src.shape
        # flatten NxCxHxW to HWxNxC
        tokens = src.flatten(2).permute(2, 0, 1)
        pos_tokens = pos_embed.flatten(2).permute(2, 0, 1)
        # One copy of the query embeddings per batch element.
        queries = query_embed.unsqueeze(1).repeat(1, bs, 1)
        padding = None if mask is None else mask.flatten(1)

        memory = self.encoder(tokens, src_key_padding_mask=padding, pos=pos_tokens)
        # The decoder input starts at zero; the queries act as its positional term.
        hs = self.decoder(
            torch.zeros_like(queries),
            memory,
            memory_key_padding_mask=padding,
            pos=pos_tokens,
            query_pos=queries,
        )
        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
class TransformerEncoder(nn.Module):
    """Stack of ``num_layers`` copies of ``encoder_layer`` with an optional final norm."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(
        self,
        src,
        mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        """Feed ``src`` through every layer in order, then through ``norm`` if set."""
        hidden = src
        for encoder_layer in self.layers:
            hidden = encoder_layer(
                hidden,
                src_mask=mask,
                src_key_padding_mask=src_key_padding_mask,
                pos=pos,
            )
        return hidden if self.norm is None else self.norm(hidden)
class TransformerDecoder(nn.Module):
    """Stack of ``num_layers`` copies of ``decoder_layer``.

    Args:
        decoder_layer: layer that is deep-copied ``num_layers`` times.
        num_layers: number of stacked layers.
        norm: optional module applied to the output (and, when
            ``return_intermediate`` is set, to every intermediate output).
        return_intermediate: if True, return the stacked outputs of all layers
            instead of only the last one.
    """

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        """Decode ``tgt`` against ``memory``.

        Returns:
            Tensor: the stack of per-layer outputs when ``return_intermediate``
            is set, otherwise the final output with a leading dim of size 1.
        """
        output = tgt
        intermediate = []

        for layer in self.layers:
            output = layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask,
                pos=pos,
                query_pos=query_pos,
            )
            if self.return_intermediate:
                # ``norm`` is optional; without it the raw layer output is kept
                # instead of calling ``None``.
                intermediate.append(output if self.norm is None else self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                # Make the last entry the very tensor that is treated as final output.
                intermediate[-1] = output

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output.unsqueeze(0)
class TransformerEncoderLayer(nn.Module):
    """Self-attention block followed by a feed-forward block.

    ``normalize_before`` selects between the pre-norm (:meth:`forward_pre`) and
    post-norm (:meth:`forward_post`) arrangement of the layer norms.
    """

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        # Feed-forward sub-network
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add ``pos`` to ``tensor`` unless ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def _feed_forward(self, x):
        # linear -> activation -> dropout -> linear
        return self.linear2(self.dropout(self.activation(self.linear1(x))))

    def forward_post(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        """Post-norm variant: each residual sum is followed by its layer norm."""
        query = key = self.with_pos_embed(src, pos)
        attended = self.self_attn(
            query, key, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
        )[0]
        src = self.norm1(src + self.dropout1(attended))
        src = self.norm2(src + self.dropout2(self._feed_forward(src)))
        return src

    def forward_pre(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        """Pre-norm variant: the layer norm is applied before each sub-block."""
        normed = self.norm1(src)
        query = key = self.with_pos_embed(normed, pos)
        attended = self.self_attn(
            query, key, value=normed, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
        )[0]
        src = src + self.dropout1(attended)
        src = src + self.dropout2(self._feed_forward(self.norm2(src)))
        return src

    def forward(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        variant = self.forward_pre if self.normalize_before else self.forward_post
        return variant(src, src_mask, src_key_padding_mask, pos)
class TransformerDecoderLayer(nn.Module):
    """Self-attention, cross-attention over ``memory`` and a feed-forward block.

    ``normalize_before`` selects between the pre-norm (:meth:`forward_pre`) and
    post-norm (:meth:`forward_post`) arrangement of the layer norms.
    """

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        # Feed-forward sub-network
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add ``pos`` to ``tensor`` unless ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def _feed_forward(self, x):
        # linear -> activation -> dropout -> linear
        return self.linear2(self.dropout(self.activation(self.linear1(x))))

    def forward_post(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        """Post-norm variant: each residual sum is followed by its layer norm."""
        # Self-attention among the queries.
        query = key = self.with_pos_embed(tgt, query_pos)
        self_attended = self.self_attn(
            query, key, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
        )[0]
        tgt = self.norm1(tgt + self.dropout1(self_attended))

        # Cross-attention from the queries to the encoder memory.
        cross_attended = self.multihead_attn(
            query=self.with_pos_embed(tgt, query_pos),
            key=self.with_pos_embed(memory, pos),
            value=memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )[0]
        tgt = self.norm2(tgt + self.dropout2(cross_attended))

        tgt = self.norm3(tgt + self.dropout3(self._feed_forward(tgt)))
        return tgt

    def forward_pre(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        """Pre-norm variant: the layer norm is applied before each sub-block."""
        # Self-attention among the queries.
        normed = self.norm1(tgt)
        query = key = self.with_pos_embed(normed, query_pos)
        self_attended = self.self_attn(
            query, key, value=normed, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
        )[0]
        tgt = tgt + self.dropout1(self_attended)

        # Cross-attention from the queries to the encoder memory.
        normed = self.norm2(tgt)
        cross_attended = self.multihead_attn(
            query=self.with_pos_embed(normed, query_pos),
            key=self.with_pos_embed(memory, pos),
            value=memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )[0]
        tgt = tgt + self.dropout2(cross_attended)

        tgt = tgt + self.dropout3(self._feed_forward(self.norm3(tgt)))
        return tgt

    def forward(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        variant = self.forward_pre if self.normalize_before else self.forward_post
        return variant(
            tgt,
            memory,
            tgt_mask,
            memory_mask,
            tgt_key_padding_mask,
            memory_key_padding_mask,
            pos,
            query_pos,
        )
def _get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
def _get_activation_fn(activation):
"""Return an activation function given a string"""
if activation == "relu":
return F.relu
if activation == "gelu":
return F.gelu
if activation == "glu":
return F.glu
raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
================================================
FILE: mask2former/test_time_augmentation.py
================================================
import copy
import logging
from itertools import count
import numpy as np
import torch
from fvcore.transforms import HFlipTransform
from torch import nn
from torch.nn.parallel import DistributedDataParallel
from detectron2.data.detection_utils import read_image
from detectron2.modeling import DatasetMapperTTA
__all__ = [
"SemanticSegmentorWithTTA",
]
class SemanticSegmentorWithTTA(nn.Module):
    """
    A SemanticSegmentor with test-time augmentation enabled.
    Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`.
    """

    def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
        """
        Args:
            cfg (CfgNode):
            model (SemanticSegmentor): a SemanticSegmentor to apply TTA on.
            tta_mapper (callable): takes a dataset dict and returns a list of
                augmented versions of the dataset dict. Defaults to
                `DatasetMapperTTA(cfg)`.
            batch_size (int): batch the augmented images into this batch size for inference.
                It is stored on the instance; inference below runs one
                augmented input at a time.
        """
        super().__init__()
        if isinstance(model, DistributedDataParallel):
            # Unwrap so that attributes of the underlying model are reachable.
            model = model.module
        self.cfg = cfg.clone()

        self.model = model

        if tta_mapper is None:
            tta_mapper = DatasetMapperTTA(cfg)
        self.tta_mapper = tta_mapper
        self.batch_size = batch_size

    def __call__(self, batched_inputs):
        """
        Same input/output format as :meth:`SemanticSegmentor.forward`
        """

        def _maybe_read_image(dataset_dict):
            # Work on a shallow copy so the caller's dict is left untouched.
            ret = copy.copy(dataset_dict)
            if "image" not in ret:
                image = read_image(ret.pop("file_name"), self.model.input_format)
                image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1)))  # CHW
                ret["image"] = image
            else:
                # The image was supplied by the caller; reuse it for the size defaults.
                image = ret["image"]
            # Fill in whichever of height / width is missing from the CHW image.
            ret.setdefault("height", image.shape[1])
            ret.setdefault("width", image.shape[2])
            return ret

        return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs]

    def _inference_one_image(self, input):
        """
        Args:
            input (dict): one dataset dict with "image" field being a CHW tensor
        Returns:
            dict: one output dict with the "sem_seg" predictions averaged over
                all augmented inputs

        Raises:
            ValueError: if the TTA mapper produced no augmented inputs.
        """
        augmented_inputs, tfms = self._get_augmented_inputs(input)

        final_predictions = None
        count_predictions = 0
        for aug_input, tfm in zip(augmented_inputs, tfms):
            count_predictions += 1
            with torch.no_grad():
                prediction = self.model([aug_input])[0].pop("sem_seg")
                # Undo a horizontal flip so that all predictions are aligned.
                if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
                    prediction = prediction.flip(dims=[2])
                if final_predictions is None:
                    final_predictions = prediction
                else:
                    final_predictions += prediction

        if final_predictions is None:
            raise ValueError("tta_mapper returned no augmented inputs")

        final_predictions = final_predictions / count_predictions
        return {"sem_seg": final_predictions}

    def _get_augmented_inputs(self, input):
        """Run the TTA mapper and split off the transforms applied to each input."""
        augmented_inputs = self.tta_mapper(input)
        tfms = [x.pop("transforms") for x in augmented_inputs]
        return augmented_inputs, tfms
================================================
FILE: mask2former/utils/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mask2former/utils/__init__.py.new
================================================
================================================
FILE: mask2former/utils/misc.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py
"""
Misc functions, including distributed helpers.
Mostly copy-paste from torchvision references.
"""
from typing import List, Optional
import torch
import torch.distributed as dist
import torchvision
from torch import Tensor
def _max_by_axis(the_list):
# type: (List[List[int]]) -> List[int]
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
class NestedTensor(object):
    """Pair of a tensor and an optional mask that travel together."""

    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors
        self.mask = mask

    def to(self, device):
        # type: (Device) -> NestedTensor # noqa
        """Return a new ``NestedTensor`` with tensors (and mask, if any) moved to ``device``."""
        moved_tensors = self.tensors.to(device)
        moved_mask = None if self.mask is None else self.mask.to(device)
        return NestedTensor(moved_tensors, moved_mask)

    def decompose(self):
        """Return the ``(tensors, mask)`` pair."""
        return self.tensors, self.mask

    def __repr__(self):
        return str(self.tensors)
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
    """Zero-pad a list of 3-D tensors to a common size and batch them.

    Returns a ``NestedTensor`` whose mask is True on padded positions and
    False where the original data lives.

    Raises:
        ValueError: if the first tensor is not 3-dimensional.
    """
    # TODO make this more general
    if tensor_list[0].ndim != 3:
        raise ValueError("not supported")

    if torchvision._is_tracing():
        # nested_tensor_from_tensor_list() does not export well to ONNX
        # call _onnx_nested_tensor_from_tensor_list() instead
        return _onnx_nested_tensor_from_tensor_list(tensor_list)

    # TODO make it support different-sized images
    max_size = _max_by_axis([list(img.shape) for img in tensor_list])
    batch_shape = [len(tensor_list)] + max_size
    b, c, h, w = batch_shape
    first = tensor_list[0]
    tensor = torch.zeros(batch_shape, dtype=first.dtype, device=first.device)
    # Start fully masked; the area covered by each image is cleared below.
    mask = torch.ones((b, h, w), dtype=torch.bool, device=first.device)
    for idx, img in enumerate(tensor_list):
        img_c, img_h, img_w = img.shape[0], img.shape[1], img.shape[2]
        tensor[idx][:img_c, :img_h, :img_w].copy_(img)
        mask[idx][:img_h, :img_w] = False
    return NestedTensor(tensor, mask)
# _onnx_nested_tensor_from_tensor_list() is an implementation of
# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
@torch.jit.unused
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
    """Tracing-friendly variant of ``nested_tensor_from_tensor_list``.

    Pads every tensor in ``tensor_list`` at the end of each of its three
    dimensions up to the per-dimension maximum, builds a matching mask that is
    True on the padded area, and returns both stacked in a ``NestedTensor``.
    """
    # Per-dimension maximum over all inputs, computed with tensor ops.
    # NOTE(review): torch.stack requires tensors, so img.shape[i] is presumably
    # a tensor here (i.e. this runs under tracing) — confirm.
    max_size = []
    for i in range(tensor_list[0].dim()):
        max_size_i = torch.max(
            torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
        ).to(torch.int64)
        max_size.append(max_size_i)
    max_size = tuple(max_size)

    # work around for
    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
    # m[: img.shape[1], :img.shape[2]] = False
    # which is not yet supported in onnx
    padded_imgs = []
    padded_masks = []
    for img in tensor_list:
        # Amount of padding needed along each of the three dimensions.
        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
        # F.pad takes (left, right) pairs starting from the last dimension.
        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
        padded_imgs.append(padded_img)

        # Mask starts as zeros over the image area and is padded with ones,
        # so after the bool cast True marks padding.
        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
        padded_masks.append(padded_mask.to(torch.bool))

    tensor = torch.stack(padded_imgs)
    mask = torch.stack(padded_masks)

    return NestedTensor(tensor, mask=mask)
def is_dist_avail_and_initialized():
    """Return True only if ``torch.distributed`` is both available and initialized."""
    return dist.is_available() and dist.is_initialized()
================================================
FILE: mask2former_video/__init__.py
================================================
from . import modeling
# config
from .config import add_maskformer2_video_config
# models
from .video_maskformer_model import VideoMaskFormer
# video
from .data_video import (
YTVISDatasetMapper,
CocoClipDatasetMapper,
YTVISEvaluator,
build_detection_train_loader,
build_detection_test_loader,
build_combined_loader,
get_detection_dataset_dicts,
)
================================================
FILE: mask2former_video/config.py
================================================
# -*- coding: utf-8 -*-
from detectron2.config import CfgNode as CN
def add_maskformer2_video_config(cfg):
    """Add the video-specific default options under ``cfg.INPUT`` (in place)."""
    inp = cfg.INPUT

    # video data
    # DataLoader
    inp.SAMPLING_FRAME_NUM = 3
    inp.SAMPLING_FRAME_RANGE = 5
    inp.SAMPLING_FRAME_SHUFFLE = True
    inp.AUGMENTATIONS = []

    # Options grouped under INPUT.PSEUDO.
    # NOTE(review): presumably used for pseudo video clips built from still
    # images — confirm against the dataset mappers.
    inp.PSEUDO = CN()
    pseudo = inp.PSEUDO
    pseudo.AUGMENTATIONS = ['rotation']
    pseudo.MIN_SIZE_TRAIN = (480, 512, 544, 576, 608, 640, 672, 704, 736, 768)
    pseudo.MAX_SIZE_TRAIN = 768
    pseudo.MIN_SIZE_TRAIN_SAMPLING = "choice_by_clip"
    pseudo.SAMPLING_FRAME_NUM = 4
    pseudo.SAMPLING_FRAME_RANGE = 20
    pseudo.CROP = CN()
    pseudo_crop = pseudo.CROP
    pseudo_crop.ENABLED = False
    pseudo_crop.TYPE = "absolute_range"
    pseudo_crop.SIZE = (384, 600)

    # LSJ
    inp.LSJ_AUG = CN()
    lsj = inp.LSJ_AUG
    lsj.ENABLED = False
    lsj.IMAGE_SIZE = 1024
    lsj.MIN_SCALE = 0.1
    lsj.MAX_SCALE = 2.0
================================================
FILE: mask2former_video/data_video/__init__.py
================================================
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper
from .build import *
from .datasets import *
from .ytvis_eval import YTVISEvaluator
================================================
FILE: mask2former_video/data_video/augmentation.py
================================================
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import numpy as np
import logging
import sys
from fvcore.transforms.transform import (
HFlipTransform,
NoOpTransform,
VFlipTransform,
)
from PIL import Image
from typing import Tuple
from detectron2.data import transforms as T
class RandomApplyClip(T.Augmentation):
    """
    Apply a wrapped augmentation with a given probability, drawing the
    apply / skip decision once per group of ``clip_frame_cnt`` calls.
    """

    def __init__(self, tfm_or_aug, prob=0.5, clip_frame_cnt=1):
        """
        Args:
            tfm_or_aug (Transform, Augmentation): the transform or augmentation
                to be applied. It can either be a `Transform` or `Augmentation`
                instance.
            prob (float): probability between 0.0 and 1.0 that
                the wrapper transformation is applied
            clip_frame_cnt (int): number of consecutive calls that share one
                random decision.
        """
        super().__init__()
        self.aug = T.augmentation._transform_to_aug(tfm_or_aug)
        assert 0.0 <= prob <= 1.0, f"Probablity must be between 0.0 and 1.0 (given: {prob})"
        self.prob = prob
        self._cnt = 0
        self.clip_frame_cnt = clip_frame_cnt

    def _decide(self):
        """Advance the per-clip counter and return the decision for this call."""
        if self._cnt % self.clip_frame_cnt == 0:
            # First call of a new group: draw a fresh decision.
            self.do = self._rand_range() < self.prob
            self._cnt = 0   # avoiding overflow
        self._cnt += 1
        return self.do

    def get_transform(self, *args):
        if not self._decide():
            return NoOpTransform()
        return self.aug.get_transform(*args)

    def __call__(self, aug_input):
        if not self._decide():
            return NoOpTransform()
        return self.aug(aug_input)
class RandomRotationClip(T.Augmentation):
    """
    Rotate the frames of a clip, drawing one angle per frame.

    At the start of every group of ``clip_frame_cnt`` calls, that many angles
    are sampled uniformly from ``angle`` and sorted; consecutive calls then
    consume them in order.
    """

    def __init__(self, angle, prob=0.5, expand=True, center=None, interp=None, clip_frame_cnt=1):
        """
        Args:
            angle (float or list[float]): a [min, max] interval from which the
                angles (in degrees) are sampled; a single number is treated as
                the degenerate interval [angle, angle].
            prob (float): probability that the sorted angles of a clip are
                traversed in reverse (descending) order.
            expand (bool): forwarded to ``T.RotationTransform``; choose if the image
                should be resized to fit the whole rotated image (default), or simply cropped
            center (list[[float, float]]): a [[minx, miny], [maxx, maxy]] relative
                interval from which to sample the center, [0, 0] being the top left of
                the image and [1, 1] the bottom right. A single [x, y] pair is treated
                as a degenerate interval.
                Default: None, which means that the center of rotation is the center of the image
                center has no effect if expand=True because it only affects shifting
            interp: forwarded to ``T.RotationTransform``.
            clip_frame_cnt (int): number of frames per clip.
        """
        super().__init__()
        if isinstance(angle, (float, int)):
            angle = (angle, angle)
        if center is not None and isinstance(center[0], (float, int)):
            center = (center, center)
        self.angle_save = None
        self.center_save = None
        self._cnt = 0
        # Stores every constructor argument as an attribute of the same name.
        self._init(locals())

    def get_transform(self, image):
        h, w = image.shape[:2]

        if self._cnt % self.clip_frame_cnt == 0:
            # New clip: draw all per-frame angles and the shared rotation centre.
            angles = np.random.uniform(self.angle[0], self.angle[1], size=self.clip_frame_cnt)
            if self.center is None:
                rel_center = None
            else:
                rel_center = (
                    np.random.uniform(self.center[0][0], self.center[1][0]),
                    np.random.uniform(self.center[0][1], self.center[1][1]),
                )
            angles = np.sort(angles)
            if self._rand_range() < self.prob:
                angles = angles[::-1]
            self.angle_save = angles
            self.center_save = rel_center
            self._cnt = 0   # avoiding overflow

        frame_angle = self.angle_save[self._cnt]
        rel_center = self.center_save
        self._cnt += 1

        abs_center = None
        if rel_center is not None:
            abs_center = (w * rel_center[0], h * rel_center[1])  # Convert to absolute coordinates

        if frame_angle % 360 == 0:
            return NoOpTransform()

        return T.RotationTransform(
            h, w, frame_angle, expand=self.expand, center=abs_center, interp=self.interp
        )
class ResizeShortestEdge(T.Augmentation):
    """
    Resize so that the shorter edge matches a sampled target size, while keeping
    the longer edge no larger than `max_size`. The target size is sampled once
    per group of ``clip_frame_cnt`` calls.
    """

    def __init__(
        self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1
    ):
        """
        Args:
            short_edge_length (list[int]): for a "range" style, a [min, max]
                interval from which to sample the shortest edge length; for a
                "choice" style, a list of shortest edge lengths to sample from.
                A single int is treated as the pair (value, value).
            max_size (int): maximum allowed longest edge length.
            sample_style (str): one of "range", "choice", "range_by_clip",
                "choice_by_clip". Only the presence of "range" in the name
                changes the sampling.
            interp: forwarded to ``T.ResizeTransform``.
            clip_frame_cnt (int): number of consecutive calls sharing one sampled size.
        """
        super().__init__()
        assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style

        self.is_range = ("range" in sample_style)
        if isinstance(short_edge_length, int):
            short_edge_length = (short_edge_length, short_edge_length)
        if self.is_range:
            assert len(short_edge_length) == 2, (
                "short_edge_length must be two values using 'range' sample style."
                f" Got {short_edge_length}!"
            )
        self._cnt = 0
        # Stores every constructor argument as an attribute of the same name.
        self._init(locals())

    def get_transform(self, image):
        if self._cnt % self.clip_frame_cnt == 0:
            if self.is_range:
                low, high = self.short_edge_length[0], self.short_edge_length[1]
                self.size = np.random.randint(low, high + 1)
            else:
                self.size = np.random.choice(self.short_edge_length)
            if self.size == 0:
                # Returns before the counter is touched, so the next call samples again.
                return NoOpTransform()

            self._cnt = 0   # avoiding overflow
        self._cnt += 1

        h, w = image.shape[:2]

        scale = self.size * 1.0 / min(h, w)
        if h < w:
            newh, neww = self.size, scale * w
        else:
            newh, neww = scale * h, self.size

        longest = max(newh, neww)
        if longest > self.max_size:
            # Shrink both edges so the longer one lands on max_size.
            shrink = self.max_size * 1.0 / longest
            newh = newh * shrink
            neww = neww * shrink

        # Round to the nearest integer pixel size.
        neww = int(neww + 0.5)
        newh = int(newh + 0.5)
        return T.ResizeTransform(h, w, newh, neww, self.interp)
class RandomFlip(T.Augmentation):
    """
    Flip the image horizontally or vertically with the given probability; the
    flip decision is drawn once per group of ``clip_frame_cnt`` calls.
    """

    def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1):
        """
        Args:
            prob (float): probability of flip.
            horizontal (boolean): whether to apply horizontal flipping
            vertical (boolean): whether to apply vertical flipping
            clip_frame_cnt (int): number of consecutive calls sharing one decision.

        Raises:
            ValueError: if both or neither of ``horizontal`` / ``vertical`` is set.
        """
        super().__init__()

        if horizontal and vertical:
            raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.")
        if not horizontal and not vertical:
            raise ValueError("At least one of horiz or vert has to be True!")
        self._cnt = 0

        # Stores every constructor argument as an attribute of the same name.
        self._init(locals())

    def get_transform(self, image):
        if self._cnt % self.clip_frame_cnt == 0:
            # First call of a new group: draw a fresh decision.
            self.do = self._rand_range() < self.prob
            self._cnt = 0   # avoiding overflow
        self._cnt += 1

        h, w = image.shape[:2]

        if not self.do:
            return NoOpTransform()
        if self.horizontal:
            return HFlipTransform(w)
        if self.vertical:
            return VFlipTransform(h)
class RandomCropClip(T.Augmentation):
    """
    Randomly crop a rectangle region out of each frame of a clip.

    The crop size is fixed per clip; the crop origin moves linearly between two
    randomly drawn positions over the ``clip_frame_cnt`` frames.
    """

    def __init__(self, crop_type: str, crop_size, clip_frame_cnt=1):
        """
        Args:
            crop_type (str): one of "relative_range", "relative", "absolute", "absolute_range".
            crop_size (tuple[float, float]): two floats, explained below.
            clip_frame_cnt (int): number of frames per clip.

        - "relative": crop a (H * crop_size[0], W * crop_size[1]) region from an input image of
          size (H, W). crop size should be in (0, 1]
        - "relative_range": uniformly sample two values from [crop_size[0], 1]
          and [crop_size[1]], 1], and use them as in "relative" crop type.
        - "absolute" crop a (crop_size[0], crop_size[1]) region from input image.
          crop_size must be smaller than the input image size.
        - "absolute_range", for an input of size (H, W), uniformly sample H_crop in
          [crop_size[0], min(H, crop_size[1])] and W_crop in [crop_size[0], min(W, crop_size[1])].
          Then crop a region (H_crop, W_crop).
        """
        # TODO style of relative_range and absolute_range are not consistent:
        # one takes (h, w) but another takes (min, max)
        super().__init__()
        assert crop_type in ["relative_range", "relative", "absolute", "absolute_range"]
        # Stores every constructor argument as an attribute of the same name.
        self._init(locals())
        self._cnt = 0

    def get_transform(self, image):
        h, w = image.shape[:2]

        if self._cnt % self.clip_frame_cnt == 0:
            # New clip: pick the crop size and two origins to move between.
            croph, cropw = self.get_crop_size((h, w))
            assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self)

            h0 = np.random.randint(h - croph + 1)
            w0 = np.random.randint(w - cropw + 1)

            h1 = np.random.randint(h0, h - croph + 1)
            w1 = np.random.randint(w0, w - cropw + 1)

            # Sorted interpolation weights give one origin per frame.
            x = np.sort(np.random.rand(self.clip_frame_cnt))
            # ``np.round`` / builtin ``int`` replace ``np.round_`` / ``np.int``,
            # which no longer exist in current NumPy releases.
            tops = np.round(h0 * x + h1 * (1 - x)).astype(int)
            lefts = np.round(w0 * x + w1 * (1 - x)).astype(int)

            # Traverse the path in the opposite direction half of the time.
            if self._rand_range() < 0.5:
                tops = tops[::-1]
                lefts = lefts[::-1]

            self.hw_save = (tops, lefts)
            self.crop_h_save, self.crop_w_save = croph, cropw
            self._cnt = 0   # avoiding overflow

        _h, _w = self.hw_save[0][self._cnt], self.hw_save[1][self._cnt]
        self._cnt += 1

        return T.CropTransform(_w, _h, self.crop_w_save, self.crop_h_save)

    def get_crop_size(self, image_size):
        """
        Args:
            image_size (tuple): height, width
        Returns:
            crop_size (tuple): height, width in absolute pixels
        Raises:
            NotImplementedError: if ``self.crop_type`` is not a known crop type.
        """
        h, w = image_size
        if self.crop_type == "relative":
            ch, cw = self.crop_size
            return int(h * ch + 0.5), int(w * cw + 0.5)
        elif self.crop_type == "relative_range":
            crop_size = np.asarray(self.crop_size, dtype=np.float32)
            ch, cw = crop_size + np.random.rand(2) * (1 - crop_size)
            return int(h * ch + 0.5), int(w * cw + 0.5)
        elif self.crop_type == "absolute":
            return (min(self.crop_size[0], h), min(self.crop_size[1], w))
        elif self.crop_type == "absolute_range":
            assert self.crop_size[0] <= self.crop_size[1]
            ch = np.random.randint(min(h, self.crop_size[0]), min(h, self.crop_size[1]) + 1)
            cw = np.random.randint(min(w, self.crop_size[0]), min(w, self.crop_size[1]) + 1)
            return ch, cw
        else:
            raise NotImplementedError("Unknown crop type {}".format(self.crop_type))
class FixedSizeCropClip(T.Augmentation):
    """
    If `crop_size` is smaller than the input image size, then it uses a random crop of
    the crop size. If `crop_size` is larger than the input image size, then it pads
    the right and the bottom of the image to the crop size if `pad` is True, otherwise
    it returns the smaller image.

    The random crop offset is drawn once per group of ``clip_frame_cnt`` calls.
    """

    def __init__(self, crop_size: Tuple[int], pad: bool = True, pad_value: float = 128.0, clip_frame_cnt=1):
        """
        Args:
            crop_size: target image (height, width).
            pad: if True, will pad images smaller than `crop_size` up to `crop_size`
            pad_value: the padding value.
            clip_frame_cnt: number of consecutive calls sharing one crop offset.
        """
        super().__init__()
        # Stores every constructor argument as an attribute of the same name.
        self._init(locals())
        self._cnt = 0

    def _get_crop(self, image: np.ndarray):
        """Return the crop transform for this frame."""
        # Compute the image scale and scaled size.
        input_size = image.shape[:2]
        output_size = self.crop_size

        # Add random crop if the image is scaled up.
        max_offset = np.subtract(input_size, output_size)
        max_offset = np.maximum(max_offset, 0)

        if self._cnt % self.clip_frame_cnt == 0:
            # First call of a new group: draw the offset used by all its frames.
            offset = np.multiply(max_offset, np.random.uniform(0.0, 1.0))
            offset = np.round(offset).astype(int)
            self.offset_save = offset
            self._cnt = 0   # avoiding overflow
        self._cnt += 1
        offset = self.offset_save
        # The transform classes are reached through ``T`` because this file
        # does not import them by name.
        return T.CropTransform(
            offset[1], offset[0], output_size[1], output_size[0], input_size[1], input_size[0]
        )

    def _get_pad(self, image: np.ndarray):
        """Return the transform padding the right / bottom up to ``crop_size``."""
        # Compute the image scale and scaled size.
        input_size = image.shape[:2]
        output_size = self.crop_size

        # Add padding if the image is scaled down.
        pad_size = np.subtract(output_size, input_size)
        pad_size = np.maximum(pad_size, 0)
        original_size = np.minimum(input_size, output_size)
        return T.PadTransform(
            0, 0, pad_size[1], pad_size[0], original_size[1], original_size[0], self.pad_value
        )

    def get_transform(self, image: np.ndarray):
        transforms = [self._get_crop(image)]
        if self.pad:
            transforms.append(self._get_pad(image))
        return T.TransformList(transforms)
class ResizeShortestEdgeClip(T.Augmentation):
    """
    Resize so that the shorter edge matches a sampled length, capping the longer
    edge at `max_size` (the image is scaled down further if the cap is exceeded).
    The target length is re-sampled only once every `clip_frame_cnt` calls.
    """

    def __init__(
        self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1
    ):
        """
        Args:
            short_edge_length (list[int]): If ``sample_style`` contains ``"range"``,
                a [min, max] interval from which to sample the shortest edge length.
                Otherwise a list of shortest edge lengths to sample from.
                A single int is treated as ``(value, value)``.
            max_size (int): maximum allowed longest edge length.
            sample_style (str): one of "range", "choice", "range_by_clip", "choice_by_clip".
            interp: interpolation argument forwarded to ``T.ResizeTransform``.
            clip_frame_cnt (int): number of consecutive calls that share one sampled size.
        """
        super().__init__()
        assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style

        self.is_range = ("range" in sample_style)
        if isinstance(short_edge_length, int):
            short_edge_length = (short_edge_length, short_edge_length)
        if self.is_range:
            assert len(short_edge_length) == 2, (
                "short_edge_length must be two values using 'range' sample style."
                f" Got {short_edge_length}!"
            )
        # Calls made since the size was last sampled.
        self._cnt = 0
        self._init(locals())

    def get_transform(self, image):
        """Return the resize transform for `image`, sampling a new size at clip start."""
        if self._cnt % self.clip_frame_cnt == 0:
            edge_options = self.short_edge_length
            if self.is_range:
                self.size = np.random.randint(edge_options[0], edge_options[1] + 1)
            else:
                self.size = np.random.choice(edge_options)
            self._cnt = 0  # avoiding overflow
            # A sampled size of 0 means "do not resize"; the counter stays at 0,
            # so the next call samples again.
            if self.size == 0:
                return NoOpTransform()
        self._cnt += 1

        h, w = image.shape[:2]
        ratio = self.size * 1.0 / min(h, w)
        if h < w:
            new_h, new_w = self.size, ratio * w
        else:
            new_h, new_w = ratio * h, self.size

        longest = max(new_h, new_w)
        if longest > self.max_size:
            # Shrink both edges so the longer one lands on max_size.
            shrink = self.max_size * 1.0 / longest
            new_h = new_h * shrink
            new_w = new_w * shrink

        # Round half up to integer pixel sizes.
        new_w = int(new_w + 0.5)
        new_h = int(new_h + 0.5)
        return T.ResizeTransform(h, w, new_h, new_w, self.interp)
class RandomFlipClip(T.Augmentation):
    """
    Flip the image horizontally or vertically with the given probability.
    The flip decision is re-drawn only once every `clip_frame_cnt` calls.
    """

    def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1):
        """
        Args:
            prob (float): probability of flip.
            horizontal (boolean): whether to apply horizontal flipping
            vertical (boolean): whether to apply vertical flipping
            clip_frame_cnt (int): number of consecutive calls that share one flip decision.

        Raises:
            ValueError: if both or neither of `horizontal` / `vertical` are set.
        """
        super().__init__()

        if horizontal and vertical:
            raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.")
        if not horizontal and not vertical:
            raise ValueError("At least one of horiz or vert has to be True!")
        # Calls made since the flip decision was last drawn.
        self._cnt = 0
        self._init(locals())

    def get_transform(self, image):
        """Return the flip (or no-op) transform for `image`."""
        if self._cnt % self.clip_frame_cnt == 0:
            self.do = self._rand_range() < self.prob
            self._cnt = 0  # avoiding overflow
        self._cnt += 1

        h, w = image.shape[:2]
        if not self.do:
            return NoOpTransform()
        if self.horizontal:
            return HFlipTransform(w)
        if self.vertical:
            return VFlipTransform(h)
def build_augmentation(cfg, is_train):
    """
    Build the list of augmentations selected by `cfg.INPUT`.

    Args:
        cfg: config node; the `INPUT.*` keys read are visible in the body.
        is_train (bool): build the training pipeline (optional crop, resize,
            optional flip, optional colour / rotation jitter) when True,
            otherwise only the test-time shortest-edge resize.

    Returns:
        list: augmentation objects in application order.
    """
    logger = logging.getLogger(__name__)
    inp = cfg.INPUT

    if not is_train:
        # Resize
        return [T.ResizeShortestEdge(inp.MIN_SIZE_TEST, inp.MAX_SIZE_TEST, "choice")]

    aug_list = []

    # Crop
    if inp.CROP.ENABLED:
        aug_list.append(T.RandomCrop(inp.CROP.TYPE, inp.CROP.SIZE))

    # Resize: "*_by_clip" sampling styles keep one size for a whole clip.
    sample_style = inp.MIN_SIZE_TRAIN_SAMPLING
    ms_clip_frame_cnt = inp.SAMPLING_FRAME_NUM if "by_clip" in sample_style else 1
    aug_list.append(
        ResizeShortestEdge(
            inp.MIN_SIZE_TRAIN, inp.MAX_SIZE_TRAIN, sample_style, clip_frame_cnt=ms_clip_frame_cnt
        )
    )

    # Flip
    flip_mode = inp.RANDOM_FLIP
    if flip_mode != "none":
        flip_clip_frame_cnt = inp.SAMPLING_FRAME_NUM if flip_mode == "flip_by_clip" else 1
        aug_list.append(
            # NOTE using RandomFlip modified for the support of flip maintenance
            RandomFlip(
                horizontal=flip_mode in ("horizontal", "flip_by_clip"),
                vertical=flip_mode == "vertical",
                clip_frame_cnt=flip_clip_frame_cnt,
            )
        )

    # Additional augmentations : brightness, contrast, saturation, rotation
    requested = inp.AUGMENTATIONS
    if "brightness" in requested:
        aug_list.append(T.RandomBrightness(0.9, 1.1))
    if "contrast" in requested:
        aug_list.append(T.RandomContrast(0.9, 1.1))
    if "saturation" in requested:
        aug_list.append(T.RandomSaturation(0.9, 1.1))
    if "rotation" in requested:
        rotation = T.RandomRotation(
            [-10, 10], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range"
        )
        aug_list.append(rotation)

    return aug_list
def _pseudo_extra_augmentations(cfg, clip_frame_cnt):
    """
    Build the optional colour / rotation augmentations for pseudo videos.

    Args:
        cfg: config node; `cfg.INPUT.PSEUDO.AUGMENTATIONS` lists the enabled
            extras ("brightness", "contrast", "saturation", "rotation").
        clip_frame_cnt (int): forwarded to `RandomRotationClip`.

    Returns:
        list: the enabled augmentations in the order brightness, contrast,
        saturation, rotation.
    """
    requested = cfg.INPUT.PSEUDO.AUGMENTATIONS
    extras = []
    if "brightness" in requested:
        extras.append(T.RandomBrightness(0.9, 1.1))
    if "contrast" in requested:
        extras.append(T.RandomContrast(0.9, 1.1))
    if "saturation" in requested:
        extras.append(T.RandomSaturation(0.9, 1.1))
    if "rotation" in requested:
        extras.append(
            RandomRotationClip(
                [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], clip_frame_cnt=clip_frame_cnt,
            )
        )
    return extras


def build_pseudo_augmentation(cfg, is_train):
    """
    Build the augmentation list used for pseudo videos.

    Training has two variants selected by `cfg.INPUT.LSJ_AUG.ENABLED`:

    * enabled: optional flip, optional colour / rotation jitter, then
      `ResizeScaleClip` + `FixedSizeCropClip` to a square `IMAGE_SIZE`;
    * disabled: optional resize-and-crop (`INPUT.PSEUDO.CROP`), shortest-edge
      resize, optional flip, optional colour / rotation jitter.

    `INPUT.RANDOM_FLIP == "none"` skips the flip in both variants. Previously
    it left `clip_frame_cnt` unassigned in the first variant and made the
    second variant construct a `RandomFlipClip` with neither direction enabled,
    which that class rejects with ValueError.

    Args:
        cfg: config node; the `INPUT.*` keys read are visible in the body.
        is_train (bool): build the training pipeline when True, otherwise only
            the test-time shortest-edge resize.

    Returns:
        list: augmentation objects in application order.
    """
    if not is_train:
        # Resize
        return [T.ResizeShortestEdge(cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST, "choice")]

    aug_list = []
    flip_mode = cfg.INPUT.RANDOM_FLIP

    if cfg.INPUT.LSJ_AUG.ENABLED:
        image_size = cfg.INPUT.LSJ_AUG.IMAGE_SIZE
        min_scale = cfg.INPUT.LSJ_AUG.MIN_SCALE
        max_scale = cfg.INPUT.LSJ_AUG.MAX_SCALE

        # Decide the clip length up front: it is needed by the rotation /
        # resize / crop augmentations below even when flipping is disabled.
        if flip_mode == "flip_by_clip":
            clip_frame_cnt = cfg.INPUT.PSEUDO.SAMPLING_FRAME_NUM
        else:
            clip_frame_cnt = 1

        if flip_mode != "none":
            aug_list.append(
                # NOTE using RandomFlip modified for the support of flip maintenance
                RandomFlipClip(
                    horizontal=(flip_mode == "horizontal") or (flip_mode == "flip_by_clip"),
                    vertical=flip_mode == "vertical",
                    clip_frame_cnt=clip_frame_cnt,
                )
            )

        # Additional augmentations : brightness, contrast, saturation, rotation
        aug_list.extend(_pseudo_extra_augmentations(cfg, clip_frame_cnt))

        aug_list.extend([
            ResizeScaleClip(
                min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size,
                clip_frame_cnt=clip_frame_cnt,
            ),
            FixedSizeCropClip(crop_size=(image_size, image_size), clip_frame_cnt=clip_frame_cnt),
        ])
    else:
        min_size = cfg.INPUT.PSEUDO.MIN_SIZE_TRAIN
        max_size = cfg.INPUT.PSEUDO.MAX_SIZE_TRAIN
        sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
        clip_frame_cnt = cfg.INPUT.PSEUDO.SAMPLING_FRAME_NUM

        # Crop
        if cfg.INPUT.PSEUDO.CROP.ENABLED:
            crop_aug = RandomApplyClip(
                T.AugmentationList([
                    ResizeShortestEdgeClip([400, 500, 600], 1333, sample_style, clip_frame_cnt=clip_frame_cnt),
                    RandomCropClip(
                        cfg.INPUT.PSEUDO.CROP.TYPE, cfg.INPUT.PSEUDO.CROP.SIZE, clip_frame_cnt=clip_frame_cnt
                    ),
                ]),
                clip_frame_cnt=clip_frame_cnt,
            )
            aug_list.append(crop_aug)

        # Resize
        aug_list.append(ResizeShortestEdgeClip(min_size, max_size, sample_style, clip_frame_cnt=clip_frame_cnt))

        # Flip: skipped for "none", because RandomFlipClip raises ValueError
        # when neither horizontal nor vertical is requested.
        if flip_mode != "none":
            aug_list.append(
                # NOTE using RandomFlip modified for the support of flip maintenance
                RandomFlipClip(
                    horizontal=(flip_mode == "horizontal") or (flip_mode == "flip_by_clip"),
                    vertical=flip_mode == "vertical",
                    clip_frame_cnt=clip_frame_cnt,
                )
            )

        # Additional augmentations : brightness, contrast, saturation, rotation
        aug_list.extend(_pseudo_extra_augmentations(cfg, clip_frame_cnt))

    return aug_list
================================================
FILE: mask2former_video/data_video/build.py
================================================
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import itertools
import logging
import torch.utils.data
from typing import Collection, Sequence
from detectron2.config import CfgNode, configurable
from detectron2.data.build import (
build_batch_data_loader,
load_proposals_into_dataset,
trivial_batch_collator,
)
from detectron2.data.catalog import DatasetCatalog
from detectron2.data.common import DatasetFromList, MapDataset
from detectron2.data.dataset_mapper import DatasetMapper
from detectron2.data.samplers import InferenceSampler, TrainingSampler
from detectron2.utils.comm import get_world_size
from .combined_loader import CombinedDataLoader, Loader
def _compute_num_images_per_worker(cfg: CfgNode):
    """
    Split `cfg.SOLVER.IMS_PER_BATCH` evenly across the distributed workers.

    Returns:
        int: `IMS_PER_BATCH // get_world_size()`.

    Raises:
        AssertionError: if the batch size is not a multiple of the world size
            or is smaller than it.
    """
    world_size = get_world_size()
    total_images = cfg.SOLVER.IMS_PER_BATCH

    divisible = total_images % world_size == 0
    assert divisible, "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        total_images, world_size
    )
    large_enough = total_images >= world_size
    assert large_enough, "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        total_images, world_size
    )
    return total_images // world_size
def filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names):
    """
    Drop records that have no annotation with `iscrowd == 0`.

    A record's "annotations" may hold annotation dicts directly or lists of
    annotation dicts; a record is kept as soon as one non-crowd annotation is
    found at either level. A missing "iscrowd" key counts as non-crowd.

    Args:
        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
        dataset_names: not used by this function.

    Returns:
        list[dict]: the same format, but filtered.
    """
    total_before = len(dataset_dicts)

    def _has_non_crowd(anns):
        for entry in anns:
            # Normalise: a bare annotation dict is handled like a one-element list.
            group = entry if isinstance(entry, list) else [entry]
            if any(item.get("iscrowd", 0) == 0 for item in group):
                return True
        return False

    kept = []
    for record in dataset_dicts:
        if _has_non_crowd(record["annotations"]):
            kept.append(record)
    total_after = len(kept)

    logging.getLogger(__name__).info(
        "Removed {} images with no usable annotations. {} images left.".format(
            total_before - total_after, total_after
        )
    )
    return kept
def get_detection_dataset_dicts(
    dataset_names, filter_empty=True, proposal_files=None
):
    """
    Load the named datasets from `DatasetCatalog` and merge them into one list.

    Args:
        dataset_names (str or list[str]): a dataset name or a list of dataset names
        filter_empty (bool): whether to filter out images without instance annotations
            (only applied when the first record has an "annotations" key)
        proposal_files (list[str]): if given, a list of object proposal files
            that match each dataset in `dataset_names`.

    Returns:
        list[dict]: a list of dicts following the standard dataset dict format.

    Raises:
        AssertionError: if no name is given, a dataset is empty, the number of
            proposal files does not match, or nothing is left after filtering.
    """
    if isinstance(dataset_names, str):
        dataset_names = [dataset_names]
    assert len(dataset_names)

    per_dataset = [DatasetCatalog.get(name) for name in dataset_names]
    for name, records in zip(dataset_names, per_dataset):
        assert len(records), "Dataset '{}' is empty!".format(name)

    if proposal_files is not None:
        assert len(dataset_names) == len(proposal_files)
        # load precomputed proposals from proposal files
        per_dataset = [
            load_proposals_into_dataset(records, proposal_file)
            for records, proposal_file in zip(per_dataset, proposal_files)
        ]

    merged = list(itertools.chain.from_iterable(per_dataset))

    # Only the first record is inspected to decide whether annotations exist.
    has_instances = "annotations" in merged[0]
    if filter_empty and has_instances:
        merged = filter_images_with_only_crowd_annotations(merged, dataset_names)

    assert len(merged), "No valid data found in {}.".format(",".join(dataset_names))
    return merged
def _train_loader_from_config(cfg, mapper, dataset_name=None, *, dataset=None, sampler=None):
    """
    Translate `cfg` into the keyword arguments of `build_detection_train_loader`.

    Missing pieces are filled in: the dataset is loaded by name, the mapper
    defaults to `DatasetMapper(cfg, True)` and the sampler to a
    `TrainingSampler` over the dataset. `cfg.DATALOADER.SAMPLER_TRAIN` is only
    logged; the sampler built here is always a `TrainingSampler`.
    """
    if dataset is None:
        proposal_files = cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None
        dataset = get_detection_dataset_dicts(
            dataset_name,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            proposal_files=proposal_files,
        )

    if mapper is None:
        mapper = DatasetMapper(cfg, True)

    if sampler is None:
        sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
        logging.getLogger(__name__).info("Using training sampler {}".format(sampler_name))
        sampler = TrainingSampler(len(dataset))

    return dict(
        dataset=dataset,
        sampler=sampler,
        mapper=mapper,
        total_batch_size=cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
# TODO can allow dataset as an iterable or IterableDataset to make this function more general
@configurable(from_config=_train_loader_from_config)
def build_detection_train_loader(
    dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0
):
    """
    Build a training dataloader for object detection.
    This interface is experimental.

    Args:
        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
            or a map-style pytorch dataset. They can be obtained by using
            :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
        mapper (callable): a callable which takes a sample (dict) from dataset and
            returns the format to be consumed by the model. Pass None to skip mapping.
            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
        sampler (torch.utils.data.sampler.Sampler or None): a sampler that
            produces indices to be applied on ``dataset``.
            Defaults to a :class:`TrainingSampler` over the dataset.
        total_batch_size (int): total batch size across all workers.
        aspect_ratio_grouping (bool): forwarded to ``build_batch_data_loader``;
            whether to group images with similar aspect ratio.
        num_workers (int): number of parallel data loading workers

    Returns:
        The loader produced by ``build_batch_data_loader``.
    """
    # Wrap plain lists so they behave like a map-style dataset.
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)

    sampler = TrainingSampler(len(dataset)) if sampler is None else sampler
    assert isinstance(sampler, torch.utils.data.sampler.Sampler)

    return build_batch_data_loader(
        dataset,
        sampler,
        total_batch_size,
        aspect_ratio_grouping=aspect_ratio_grouping,
        num_workers=num_workers,
    )
def build_combined_loader(cfg: CfgNode, loaders: Collection[Loader], ratios: Sequence[float]):
    """
    Wrap several loaders in a `CombinedDataLoader`.

    The combined loader's batch size is the per-worker image count derived
    from `cfg`; `ratios` are the sampling weights, one per loader.
    """
    per_worker_batch = _compute_num_images_per_worker(cfg)
    combined = CombinedDataLoader(loaders, per_worker_batch, ratios)
    return combined
def _test_loader_from_config(cfg, dataset_name, mapper=None):
    """
    Translate `cfg` into the keyword arguments of `build_detection_test_loader`.

    Uses the given `dataset_name` argument (instead of the names in cfg), because the
    standard practice is to evaluate each test set individually (not combining them).
    """
    if cfg.MODEL.LOAD_PROPOSALS:
        # Pick the proposal file at the same position as the dataset in DATASETS.TEST.
        position = list(cfg.DATASETS.TEST).index(dataset_name)
        proposal_files = [cfg.DATASETS.PROPOSAL_FILES_TEST[position]]
    else:
        proposal_files = None

    dataset = get_detection_dataset_dicts(
        [dataset_name],
        filter_empty=False,
        proposal_files=proposal_files,
    )
    if mapper is None:
        mapper = DatasetMapper(cfg, False)
    return {"dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS}
@configurable(from_config=_test_loader_from_config)
def build_detection_test_loader(dataset, *, mapper, num_workers=0):
    """
    Similar to `build_detection_train_loader`, but uses a batch size of 1.
    This interface is experimental.

    Args:
        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
            or a map-style pytorch dataset. They can be obtained by using
            :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model. Pass None to skip mapping.
            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
        num_workers (int): number of parallel data loading workers

    Returns:
        DataLoader: a torch DataLoader, that loads the given detection
        dataset, with test-time transformation and batching.

    Examples:
    ::
        data_loader = build_detection_test_loader(
            DatasetRegistry.get("my_test"),
            mapper=DatasetMapper(...))

        # or, instantiate with a CfgNode:
        data_loader = build_detection_test_loader(cfg, "my_test")
    """
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)

    # Always use 1 image per worker during inference since this is the
    # standard when reporting inference time in papers.
    one_per_batch = torch.utils.data.sampler.BatchSampler(
        InferenceSampler(len(dataset)), 1, drop_last=False
    )
    return torch.utils.data.DataLoader(
        dataset,
        num_workers=num_workers,
        batch_sampler=one_per_batch,
        collate_fn=trivial_batch_collator,
    )
================================================
FILE: mask2former_video/data_video/combined_loader.py
================================================
import random
from collections import deque
from typing import Any, Collection, Deque, Iterable, Iterator, List, Sequence
# A loader is any iterable; each item it yields is itself iterated as a batch of samples.
Loader = Iterable[Any]


def _pooled_next(iterator: Iterator[Any], pool: Deque[Any]):
    """
    Return the next single sample of one loader.

    Args:
        iterator: iterator over the loader; each `next()` yields a batch of samples.
        pool: buffer of samples already fetched from this loader but not yet
            handed out. It is refilled with one batch when empty.

    Raises:
        StopIteration: if `pool` is empty and `iterator` is exhausted.
    """
    if not pool:
        pool.extend(next(iterator))
    return pool.popleft()


class CombinedDataLoader:
    """
    Combines data loaders using the provided sampling ratios
    """

    # Number of batches worth of loader indices drawn at once.
    BATCH_COUNT = 100

    def __init__(self, loaders: Collection[Loader], batch_size: int, ratios: Sequence[float]):
        """
        Args:
            loaders: the loaders to draw from; each yields batches of samples.
            batch_size: number of samples in every combined batch.
            ratios: sampling weights, one per loader (passed to `random.choices`).
        """
        self.loaders = loaders
        self.batch_size = batch_size
        self.ratios = ratios

    def __iter__(self) -> Iterator[List[Any]]:
        """
        Yield lists of `batch_size` samples; each sample comes from a loader
        picked at random according to `ratios`. Iteration ends when a loader
        that is needed has no more data.
        """
        iters = [iter(loader) for loader in self.loaders]
        indices = []
        # One independent buffer per loader. `[deque()] * n` would repeat a
        # reference to a single deque, so samples fetched from one loader
        # would be handed out on behalf of another.
        pool = [deque() for _ in iters]
        # infinite iterator, as in D2
        while True:
            if not indices:
                # just a buffer of indices, its size doesn't matter
                # as long as it's a multiple of batch_size
                k = self.batch_size * self.BATCH_COUNT
                indices = random.choices(range(len(self.loaders)), self.ratios, k=k)
            try:
                batch = [_pooled_next(iters[i], pool[i]) for i in indices[: self.batch_size]]
            except StopIteration:
                break
            indices = indices[self.batch_size :]
            yield batch
================================================
FILE: mask2former_video/data_video/dataset_mapper.py
================================================
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import copy
import logging
import random
import numpy as np
from typing import List, Union
import torch
from detectron2.config import configurable
from detectron2.structures import (
BitMasks,
Boxes,
BoxMode,
Instances,
)
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data import MetadataCatalog
from .augmentation import build_augmentation, build_pseudo_augmentation #build_coco_augmentation
from .datasets.ytvis import COCO_TO_YTVIS_2019, COCO_TO_YTVIS_2021
import os
from pycocotools import mask as coco_mask
__all__ = ["YTVISDatasetMapper", "CocoClipDatasetMapper"]
def seed_everything(seed):
    """
    Seed Python's `random`, NumPy and PyTorch with `seed`, and store it in the
    PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
def filter_empty_instances(instances, by_box=True, by_mask=True, box_threshold=1e-5):
    """
    Mark empty instances of an `Instances` object as invalid.

    Nothing is removed: every instance that fails an enabled emptiness check
    gets its entry in `instances.gt_ids` overwritten with -1, in place.

    Args:
        instances (Instances):
        by_box (bool): whether to treat instances with empty boxes as empty
        by_mask (bool): whether to treat instances with empty masks as empty
            (only applied when `instances` has a "gt_masks" field)
        box_threshold (float): minimum width and height to be considered non-empty

    Returns:
        Instances: the same object that was passed in.
    """
    assert by_box or by_mask

    keep = None
    if by_box:
        keep = instances.gt_boxes.nonempty(threshold=box_threshold)
    if instances.has("gt_masks") and by_mask:
        mask_ok = instances.gt_masks.nonempty()
        keep = mask_ok if keep is None else keep & mask_ok

    # No check was applicable (box check disabled and no masks present).
    if keep is None:
        return instances

    instances.gt_ids[~keep] = -1
    return instances
def _get_dummy_anno():
    """
    Return a fresh placeholder annotation: category and id are -1, the box is
    all zeros in XYXY_ABS mode and the segmentation is a single all-zero
    six-value polygon.
    """
    placeholder = dict(
        iscrowd=0,
        category_id=-1,
        id=-1,
        bbox=np.zeros(4, dtype=int),
        bbox_mode=BoxMode.XYXY_ABS,
        segmentation=[np.zeros(6)],
    )
    return placeholder
def ytvis_annotations_to_instances(annos, image_size):
    """
    Pack the per-instance annotations of one image into an :class:`Instances`.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance. Each needs "bbox", "bbox_mode",
            "category_id" and "id"; "segmentation", when present on the first
            element, must be a 2-D array for every element.
        image_size (tuple): height, width

    Returns:
        Instances:
            with fields "gt_boxes" (converted to XYXY_ABS), "gt_classes",
            "gt_ids", and "gt_masks" when segmentations are available.
    """
    target = Instances(image_size)

    xyxy_boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
    target.gt_boxes = Boxes(xyxy_boxes)

    target.gt_classes = torch.tensor([int(obj["category_id"]) for obj in annos], dtype=torch.int64)
    target.gt_ids = torch.tensor([int(obj["id"]) for obj in annos], dtype=torch.int64)

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        for segm in segms:
            # mask array
            assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                segm.ndim
            )
        # torch.from_numpy does not support array with negative stride.
        as_tensors = [torch.from_numpy(np.ascontiguousarray(segm)) for segm in segms]
        target.gt_masks = BitMasks(torch.stack(as_tensors))

    return target
def convert_coco_poly_to_mask(segmentations, height, width):
    """
    Rasterise per-instance polygon lists into one stacked mask tensor.

    Args:
        segmentations: iterable with one entry per instance; each entry is
            handed to `coco_mask.frPyObjects` together with the image size.
        height (int): mask height.
        width (int): mask width.

    Returns:
        torch.Tensor: one (height, width) mask per instance, stacked along
        dim 0; an empty uint8 tensor of shape (0, height, width) when
        `segmentations` is empty.
    """
    per_instance = []
    for polygons in segmentations:
        decoded = coco_mask.decode(coco_mask.frPyObjects(polygons, height, width))
        # Make sure there is a trailing axis to merge over.
        if len(decoded.shape) < 3:
            decoded = decoded[..., None]
        # Merge the decoded parts along the last axis.
        merged = torch.as_tensor(decoded, dtype=torch.uint8).any(dim=2)
        per_instance.append(merged)

    if not per_instance:
        return torch.zeros((0, height, width), dtype=torch.uint8)
    return torch.stack(per_instance, dim=0)
class YTVISDatasetMapper:
    """
    A callable which takes a dataset dict in YouTube-VIS Dataset format,
    and map it into a format used by the model.

    For training, a few frames around a random reference frame are sampled,
    augmented, and their annotations converted to `Instances`; for inference
    every frame is loaded and no annotations are processed.
    """

    @configurable
    def __init__(
        self,
        is_train: bool,
        is_tgt: bool,
        *,
        augmentations: List[Union[T.Augmentation, T.Transform]],
        image_format: str,
        use_instance_mask: bool = False,
        sampling_frame_num: int = 2,
        sampling_frame_range: int = 5,
        sampling_frame_shuffle: bool = False,
        num_classes: int = 40,
        src_dataset_name: str = "",
        tgt_dataset_name: str = "",
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: whether it's used in training or inference
            is_tgt: when False, a source-to-target class id mapping is built and
                applied to `gt_classes` in `__call__`
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            use_instance_mask: whether to process instance segmentation annotations, if available
                (stored only; not read elsewhere in this class)
            sampling_frame_num: number of frames sampled per video during training
            sampling_frame_range: maximum index distance from the reference frame
                for the other sampled frames
            sampling_frame_shuffle: shuffle the sampled frame indices instead of
                keeping them sorted
            num_classes: stored only; not read elsewhere in this class
            src_dataset_name: metadata name of the source dataset (used when `is_tgt` is False)
            tgt_dataset_name: metadata name of the target dataset (used when `is_tgt` is False)
        """
        # fmt: off
        self.is_train               = is_train
        self.is_tgt                 = is_tgt
        self.augmentations          = T.AugmentationList(augmentations)
        self.image_format           = image_format
        self.use_instance_mask      = use_instance_mask
        self.sampling_frame_num     = sampling_frame_num
        self.sampling_frame_range   = sampling_frame_range
        self.sampling_frame_shuffle = sampling_frame_shuffle
        self.num_classes            = num_classes

        if not is_tgt:
            self.src_metadata = MetadataCatalog.get(src_dataset_name)
            self.tgt_metadata = MetadataCatalog.get(tgt_dataset_name)
            print('tgt_dataset_name:', tgt_dataset_name)
            # NOTE(review): OVIS_TO_YTVIS_2019, OVIS_TO_YTVIS_2021, YTVIS_2019_TO_OVIS and
            # YTVIS_2021_TO_OVIS are not imported or defined in this module (only
            # COCO_TO_YTVIS_2019 / COCO_TO_YTVIS_2021 are imported), so reaching any of
            # these assignments raises NameError. Confirm whether this path is used.
            if tgt_dataset_name.startswith("ytvis_2019"):
                src2tgt = OVIS_TO_YTVIS_2019
            elif tgt_dataset_name.startswith("ytvis_2021"):
                src2tgt = OVIS_TO_YTVIS_2021
            elif tgt_dataset_name.startswith("ovis"):
                if src_dataset_name.startswith("ytvis_2019"):
                    src2tgt = YTVIS_2019_TO_OVIS
                elif src_dataset_name.startswith("ytvis_2021"):
                    src2tgt = YTVIS_2021_TO_OVIS
                else:
                    raise NotImplementedError
            else:
                raise NotImplementedError

            # Re-key the dataset-id mapping by contiguous ids: source contiguous id -> target contiguous id.
            self.src2tgt = {}
            for k, v in src2tgt.items():
                self.src2tgt[
                    self.src_metadata.thing_dataset_id_to_contiguous_id[k]
                ] = self.tgt_metadata.thing_dataset_id_to_contiguous_id[v]
        # fmt: on

        logger = logging.getLogger(__name__)
        mode = "training" if is_train else "inference"
        logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}")

    @classmethod
    def from_config(cls, cfg, is_train: bool = True, is_tgt: bool = True):
        """
        Build the constructor keyword arguments from `cfg`.

        `src_dataset_name` is not set here; the target dataset name is the last
        entry of `cfg.DATASETS.TRAIN`.
        """
        augs = build_augmentation(cfg, is_train)

        sampling_frame_num = cfg.INPUT.SAMPLING_FRAME_NUM
        sampling_frame_range = cfg.INPUT.SAMPLING_FRAME_RANGE
        sampling_frame_shuffle = cfg.INPUT.SAMPLING_FRAME_SHUFFLE

        ret = {
            "is_train": is_train,
            "is_tgt": is_tgt,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "use_instance_mask": cfg.MODEL.MASK_ON,
            "sampling_frame_num": sampling_frame_num,
            "sampling_frame_range": sampling_frame_range,
            "sampling_frame_shuffle": sampling_frame_shuffle,
            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
            "tgt_dataset_name": cfg.DATASETS.TRAIN[-1],
        }

        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one video, in YTVIS Dataset format.
                Reads "length", "annotations" and "file_names".

        Returns:
            dict: a format that builtin models in detectron2 accept. Adds
            "video_len", "frame_idx", "image" (list of CHW tensors),
            "instances" (filled only in training) and "file_names".
        """
        # TODO consider examining below deepcopy as it costs huge amount of computations.
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below

        video_length = dataset_dict["length"]
        if self.is_train:
            # Pick a reference frame, then sample the remaining frames from the
            # window of +/- sampling_frame_range around it (reference excluded).
            ref_frame = random.randrange(video_length)

            start_idx = max(0, ref_frame-self.sampling_frame_range)
            end_idx = min(video_length, ref_frame+self.sampling_frame_range + 1)

            # np.random.choice is called without `replace`, so with numpy's default
            # (sampling with replacement) the same frame index may be drawn twice.
            selected_idx = np.random.choice(
                np.array(list(range(start_idx, ref_frame)) + list(range(ref_frame+1, end_idx))),
                self.sampling_frame_num - 1,
            )
            selected_idx = selected_idx.tolist() + [ref_frame]
            selected_idx = sorted(selected_idx)
            if self.sampling_frame_shuffle:
                random.shuffle(selected_idx)
        else:
            # Inference uses every frame of the video.
            selected_idx = range(video_length)

        video_annos = dataset_dict.pop("annotations", None)
        file_names = dataset_dict.pop("file_names", None)

        if self.is_train:
            # Collect the instance ids present in the sampled frames and give each
            # a fixed slot so all frames list instances in the same order.
            _ids = set()
            for frame_idx in selected_idx:
                _ids.update([anno["id"] for anno in video_annos[frame_idx]])
            ids = dict()
            for i, _id in enumerate(_ids):
                ids[_id] = i

        dataset_dict["video_len"] = len(video_annos)
        dataset_dict["frame_idx"] = list(selected_idx)
        dataset_dict["image"] = []
        dataset_dict["instances"] = []
        dataset_dict["file_names"] = []
        for frame_idx in selected_idx:
            dataset_dict["file_names"].append(file_names[frame_idx])

            # Read image
            image = utils.read_image(file_names[frame_idx], format=self.image_format)
            utils.check_image_size(dataset_dict, image)

            aug_input = T.AugInput(image)
            transforms = self.augmentations(aug_input)
            image = aug_input.image

            image_shape = image.shape[:2]  # h, w
            # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
            # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
            # Therefore it's important to use torch.Tensor.
            dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))))

            if (video_annos is None) or (not self.is_train):
                continue

            # NOTE copy() is to prevent annotations getting changed from applying augmentations
            _frame_annos = []
            for anno in video_annos[frame_idx]:
                _anno = {}
                for k, v in anno.items():
                    _anno[k] = copy.deepcopy(v)
                _frame_annos.append(_anno)

            # USER: Implement additional transformations if you have other types of data
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in _frame_annos
                if obj.get("iscrowd", 0) == 0
            ]
            # Instances missing from this frame keep a dummy annotation (id -1) in their slot.
            sorted_annos = [_get_dummy_anno() for _ in range(len(ids))]

            for _anno in annos:
                idx = ids[_anno["id"]]
                sorted_annos[idx] = _anno
            _gt_ids = [_anno["id"] for _anno in sorted_annos]

            instances = utils.annotations_to_instances(sorted_annos, image_shape, mask_format="bitmask")
            if not self.is_tgt:
                # Map source class ids to target ids; classes without a mapping become -1.
                instances.gt_classes = torch.tensor(
                    [self.src2tgt[c] if c in self.src2tgt else -1 for c in instances.gt_classes.tolist()]
                )
            instances.gt_ids = torch.tensor(_gt_ids)
            # Sets gt_ids to -1 for instances whose box or mask is empty.
            instances = filter_empty_instances(instances)
            # if instances.has("gt_masks"):
            #     instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
            #     instances = filter_empty_instances(instances)
            if not instances.has("gt_masks"):
                instances.gt_masks = BitMasks(torch.empty((0, *image_shape)))
            dataset_dict["instances"].append(instances)

        return dataset_dict
class CocoClipDatasetMapper:
    """
    A callable which takes a COCO image which converts into multiple frames,
    and map it into a format used by the model.

    Every "frame" is produced by applying the augmentation pipeline to the same
    source image again.
    """

    @configurable
    def __init__(
        self,
        is_train: bool,
        is_tgt: bool,
        *,
        augmentations: List[Union[T.Augmentation, T.Transform]],
        image_format: str,
        sampling_frame_num: int = 2,
        sampling_frame_range: int = 5,
        src_dataset_name: str = "",
        tgt_dataset_name: str = "",
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: whether it's used in training or inference
            is_tgt: when False, a source-to-target class id mapping is built and
                applied to `gt_classes` in `__call__`
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            sampling_frame_num: number of frames generated per image
            sampling_frame_range: window size used when drawing the reported frame indices
            src_dataset_name: metadata name of the source dataset (used when `is_tgt` is False)
            tgt_dataset_name: metadata name of the target dataset (used when `is_tgt` is False)
        """
        # fmt: off
        self.is_train               = is_train
        self.is_tgt                 = is_tgt
        self.augmentations          = T.AugmentationList(augmentations)
        self.image_format           = image_format
        self.sampling_frame_num     = sampling_frame_num
        self.sampling_frame_range   = sampling_frame_range

        if not is_tgt:
            self.src_metadata = MetadataCatalog.get(src_dataset_name)
            self.tgt_metadata = MetadataCatalog.get(tgt_dataset_name)
            if tgt_dataset_name.startswith("ytvis_2019"):
                src2tgt = COCO_TO_YTVIS_2019
            elif tgt_dataset_name.startswith("ytvis_2021"):
                src2tgt = COCO_TO_YTVIS_2021
            elif tgt_dataset_name.startswith("ovis"):
                # NOTE(review): COCO_TO_OVIS is not imported or defined in this module,
                # so this branch raises NameError. Confirm whether OVIS targets are supported.
                src2tgt = COCO_TO_OVIS
            else:
                raise NotImplementedError

            # Re-key the dataset-id mapping by contiguous ids: source contiguous id -> target contiguous id.
            self.src2tgt = {}
            for k, v in src2tgt.items():
                self.src2tgt[
                    self.src_metadata.thing_dataset_id_to_contiguous_id[k]
                ] = self.tgt_metadata.thing_dataset_id_to_contiguous_id[v]
        # fmt: on

        logger = logging.getLogger(__name__)
        mode = "training" if is_train else "inference"
        logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}")

    @classmethod
    def from_config(cls, cfg, is_train: bool = True, is_tgt: bool = True):
        """
        Build the constructor keyword arguments from `cfg`.

        Target datasets use `build_augmentation`, others use
        `build_pseudo_augmentation`. The frame sampling settings come from
        `cfg.INPUT.PSEUDO`. `src_dataset_name` is not set here.
        """
        if is_tgt:
            augs = build_augmentation(cfg, is_train)
        else:
            # print('come here')
            augs = build_pseudo_augmentation(cfg, is_train)
        sampling_frame_num = cfg.INPUT.PSEUDO.SAMPLING_FRAME_NUM
        sampling_frame_range = cfg.INPUT.PSEUDO.SAMPLING_FRAME_RANGE

        ret = {
            "is_train": is_train,
            "is_tgt": is_tgt,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "sampling_frame_num": sampling_frame_num,
            "sampling_frame_range": sampling_frame_range,
            "tgt_dataset_name": cfg.DATASETS.TRAIN[-1],
        }

        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
                Reads "annotations" and "file_name".

        Returns:
            dict: a format that builtin models in detectron2 accept. Adds
            "video_len", "frame_idx", "image" (list of CHW tensors),
            "instances" (filled only in training) and "file_names".
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below

        img_annos = dataset_dict.pop("annotations", None)
        file_name = dataset_dict.pop("file_name", None)
        original_image = utils.read_image(file_name, format=self.image_format)

        if self.is_train:
            # Invent a video length in [16, 48] and sample frame indices from it.
            # The indices are only reported in the output ("video_len", "frame_idx");
            # the frames themselves all come from the one source image.
            video_length = random.randrange(16, 49)
            ref_frame = random.randrange(video_length)

            start_idx = max(0, ref_frame-self.sampling_frame_range)
            end_idx = min(video_length, ref_frame+self.sampling_frame_range + 1)

            selected_idx = np.random.choice(
                np.array(list(range(start_idx, ref_frame)) + list(range(ref_frame+1, end_idx))),
                self.sampling_frame_num - 1,
            )
            selected_idx = selected_idx.tolist() + [ref_frame]
            selected_idx = sorted(selected_idx)
        else:
            video_length = self.sampling_frame_num
            selected_idx = list(range(self.sampling_frame_num))

        dataset_dict["video_len"] = video_length
        dataset_dict["frame_idx"] = selected_idx
        dataset_dict["image"] = []
        dataset_dict["instances"] = []
        dataset_dict["file_names"] = [file_name] * self.sampling_frame_num
        for _ in range(self.sampling_frame_num):
            utils.check_image_size(dataset_dict, original_image)

            # Each iteration augments the original image afresh to create one frame.
            aug_input = T.AugInput(original_image)
            transforms = self.augmentations(aug_input)
            image = aug_input.image

            image_shape = image.shape[:2]  # h, w
            # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
            # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
            # Therefore it's important to use torch.Tensor.
            dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))))

            if (img_annos is None) or (not self.is_train):
                continue

            # Deep-copy the annotations so transforming them for this frame does
            # not affect the copies used for the other frames.
            _img_annos = []
            for anno in img_annos:
                _anno = {}
                for k, v in anno.items():
                    _anno[k] = copy.deepcopy(v)
                _img_annos.append(_anno)

            # USER: Implement additional transformations if you have other types of data
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in _img_annos
                if obj.get("iscrowd", 0) == 0
            ]
            # Instance ids are simply the positions in the non-crowd annotation list.
            _gt_ids = list(range(len(annos)))
            for idx in range(len(annos)):
                # Replace an empty segmentation with an all-zero six-value polygon.
                if len(annos[idx]["segmentation"]) == 0:
                    annos[idx]["segmentation"] = [np.array([0.0] * 6)]

            instances = utils.annotations_to_instances(annos, image_shape)
            if not self.is_tgt:
                # Map source class ids to target ids; classes without a mapping become -1.
                instances.gt_classes = torch.tensor(
                    [self.src2tgt[c] if c in self.src2tgt else -1 for c in instances.gt_classes.tolist()]
                )
            instances.gt_ids = torch.tensor(_gt_ids)
            # instances.gt_boxes = instances.gt_masks.get_bounding_boxes()  # NOTE we don't need boxes
            # Sets gt_ids to -1 for instances whose box or mask is empty.
            instances = filter_empty_instances(instances)
            h, w = instances.image_size
            if hasattr(instances, 'gt_masks'):
                # Rasterise the polygon masks into a stacked tensor.
                gt_masks = instances.gt_masks
                gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
                instances.gt_masks = gt_masks
            else:
                instances.gt_masks = torch.zeros((0, h, w), dtype=torch.uint8)
            dataset_dict["instances"].append(instances)

        return dataset_dict
================================================
FILE: mask2former_video/data_video/datasets/__init__.py
================================================
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
from . import builtin # ensure the builtin datasets are registered
# Export every public module-level name except the ``builtin`` submodule,
# which is imported above purely for its registration side effect.
__all__ = [name for name in globals() if not name.startswith("_") and "builtin" not in name]
================================================
FILE: mask2former_video/data_video/datasets/builtin.py
================================================
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import os
from .ytvis import (
register_ytvis_instances,
_get_ytvis_2019_instances_meta,
_get_ytvis_2021_instances_meta,
)
from detectron2.data.datasets.coco import register_coco_instances
from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
# Every table below maps a dataset name to a pair
# (image root, annotation json), both relative to the dataset root directory.

# COCO images registered as a training set; judging by the annotation file
# name they were converted to the YTVIS 2019 label space (TODO confirm).
_PREDEFINED_SPLITS_COCO = {
    "coco": {
        "coco_2017_train_fake": (
            "coco/train2017",
            "coco/annotations/coco2ytvis2019_train.json",
        ),
    },
}

# ==== Predefined splits for YTVIS 2019 ===========
_PREDEFINED_SPLITS_YTVIS_2019 = {
    "ytvis_2019_train": ("ytvis_2019/train/JPEGImages", "ytvis_2019/train.json"),
    "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages", "ytvis_2019/valid.json"),
    "ytvis_2019_test": ("ytvis_2019/test/JPEGImages", "ytvis_2019/test.json"),
}

# ==== Predefined splits for YTVIS 2021 ===========
_PREDEFINED_SPLITS_YTVIS_2021 = {
    "ytvis_2021_train": ("ytvis_2021/train/JPEGImages", "ytvis_2021/train.json"),
    "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages", "ytvis_2021/valid.json"),
    "ytvis_2021_test": ("ytvis_2021/test/JPEGImages", "ytvis_2021/test.json"),
}
def register_all_ytvis_2019(root):
    """Register every split listed in ``_PREDEFINED_SPLITS_YTVIS_2019``.

    Args:
        root (str): dataset root directory that the relative image and
            annotation paths of each split are joined onto.
    """
    for split_name, paths in _PREDEFINED_SPLITS_YTVIS_2019.items():
        frames_dir, anno_file = paths
        # Annotation locations containing "://" are passed through untouched
        # instead of being joined onto the root.
        anno_path = anno_file if "://" in anno_file else os.path.join(root, anno_file)
        register_ytvis_instances(
            split_name,
            _get_ytvis_2019_instances_meta(),
            anno_path,
            os.path.join(root, frames_dir),
        )
def register_all_ytvis_2021(root):
    """Register every split listed in ``_PREDEFINED_SPLITS_YTVIS_2021``.

    Args:
        root (str): dataset root directory that the relative image and
            annotation paths of each split are joined onto.
    """
    for split_name, paths in _PREDEFINED_SPLITS_YTVIS_2021.items():
        frames_dir, anno_file = paths
        # Annotation locations containing "://" are passed through untouched
        # instead of being joined onto the root.
        anno_path = anno_file if "://" in anno_file else os.path.join(root, anno_file)
        register_ytvis_instances(
            split_name,
            _get_ytvis_2021_instances_meta(),
            anno_path,
            os.path.join(root, frames_dir),
        )
def register_all_coco(root):
    """Register every COCO split listed in ``_PREDEFINED_SPLITS_COCO``.

    The outer key of the table (e.g. "coco") selects the builtin metadata,
    the inner key is the name the split is registered under.

    Args:
        root (str): dataset root directory that the relative image and
            annotation paths of each split are joined onto.
    """
    for family, splits in _PREDEFINED_SPLITS_COCO.items():
        for split_name, paths in splits.items():
            images_dir, anno_file = paths
            # Annotation locations containing "://" are passed through
            # untouched instead of being joined onto the root.
            anno_path = anno_file if "://" in anno_file else os.path.join(root, anno_file)
            register_coco_instances(
                split_name,
                _get_builtin_metadata(family),
                anno_path,
                os.path.join(root, images_dir),
            )
# Register all predefined datasets as an import side effect, but only when
# this file is loaded as a package submodule (dotted name ends in ".builtin").
if __name__.endswith(".builtin"):
    # Dataset root: $DETECTRON2_DATASETS when set, otherwise ./datasets.
    _root = os.environ.get("DETECTRON2_DATASETS", "datasets")
    register_all_ytvis_2019(_root)
    register_all_ytvis_2021(_root)
    register_all_coco(_root)
================================================
FILE: mask2former_video/data_video/datasets/ytvis.py
================================================
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import contextlib
import io
import json
import logging
import numpy as np
import os
import pycocotools.mask as mask_util
from fvcore.common.file_io import PathManager
from fvcore.common.timer import Timer
from detectron2.structures import Boxes, BoxMode, PolygonMasks
from detectron2.data import DatasetCatalog, MetadataCatalog
"""
This file contains functions to parse YTVIS dataset of
COCO-format annotations into dicts in "Detectron2 format".
"""
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Public API of this module; the category tables and the ``_get_*_meta``
# helpers are deliberately left out of star-imports.
__all__ = ["load_ytvis_json", "register_ytvis_instances"]
# Category-id translation tables. Presumably keys are COCO category ids and
# values are the matching ids in the YTVIS tables defined in this module
# (TODO confirm against the COCO category list). Several source ids may map
# to the same target id.
COCO_TO_YTVIS_2019 = {
    1: 1,
    2: 21,
    3: 6,
    4: 21,
    5: 28,
    7: 17,
    8: 29,
    9: 34,
    17: 14,
    18: 8,
    19: 18,
    21: 15,
    22: 32,
    23: 20,
    24: 30,
    25: 22,
    35: 33,
    36: 33,
    41: 5,
    42: 27,
    43: 40,
}
COCO_TO_YTVIS_2021 = {
    1: 26,
    2: 23,
    3: 5,
    4: 23,
    5: 1,
    7: 36,
    8: 37,
    9: 4,
    16: 3,
    17: 6,
    18: 9,
    19: 19,
    21: 7,
    22: 12,
    23: 2,
    24: 40,
    25: 18,
    34: 14,
    35: 31,
    36: 31,
    41: 29,
    42: 33,
    43: 34,
}
# YTVIS 2019 category table. Each entry is a dict with keys
# "color" (list of three ints), "isthing" (always 1), "id" (1..40, in table
# order) and "name". The rows below are (color, name); ids are assigned by
# position.
YTVIS_CATEGORIES_2019 = [
    {"color": list(rgb), "isthing": 1, "id": cat_id, "name": label}
    for cat_id, (rgb, label) in enumerate(
        (
            ((220, 20, 60), "person"),
            ((0, 82, 0), "giant_panda"),
            ((119, 11, 32), "lizard"),
            ((165, 42, 42), "parrot"),
            ((134, 134, 103), "skateboard"),
            ((0, 0, 142), "sedan"),
            ((255, 109, 65), "ape"),
            ((0, 226, 252), "dog"),
            ((5, 121, 0), "snake"),
            ((0, 60, 100), "monkey"),
            ((250, 170, 30), "hand"),
            ((100, 170, 30), "rabbit"),
            ((179, 0, 194), "duck"),
            ((255, 77, 255), "cat"),
            ((120, 166, 157), "cow"),
            ((73, 77, 174), "fish"),
            ((0, 80, 100), "train"),
            ((182, 182, 255), "horse"),
            ((0, 143, 149), "turtle"),
            ((174, 57, 255), "bear"),
            ((0, 0, 230), "motorbike"),
            ((72, 0, 118), "giraffe"),
            ((255, 179, 240), "leopard"),
            ((0, 125, 92), "fox"),
            ((209, 0, 151), "deer"),
            ((188, 208, 182), "owl"),
            ((145, 148, 174), "surfboard"),
            ((106, 0, 228), "airplane"),
            ((0, 0, 70), "truck"),
            ((199, 100, 0), "zebra"),
            ((166, 196, 102), "tiger"),
            ((110, 76, 0), "elephant"),
            ((133, 129, 255), "snowboard"),
            ((0, 0, 192), "boat"),
            ((183, 130, 88), "shark"),
            ((130, 114, 135), "mouse"),
            ((107, 142, 35), "frog"),
            ((0, 228, 0), "eagle"),
            ((174, 255, 243), "earless_seal"),
            ((255, 208, 186), "tennis_racket"),
        ),
        start=1,
    )
]
# YTVIS 2021 category table. Each entry is a dict with keys
# "color" (list of three ints), "isthing" (always 1), "id" (1..40, in table
# order) and "name". The rows below are (color, name); ids are assigned by
# position.
YTVIS_CATEGORIES_2021 = [
    {"color": list(rgb), "isthing": 1, "id": cat_id, "name": label}
    for cat_id, (rgb, label) in enumerate(
        (
            ((106, 0, 228), "airplane"),
            ((174, 57, 255), "bear"),
            ((255, 109, 65), "bird"),
            ((0, 0, 192), "boat"),
            ((0, 0, 142), "car"),
            ((255, 77, 255), "cat"),
            ((120, 166, 157), "cow"),
            ((209, 0, 151), "deer"),
            ((0, 226, 252), "dog"),
            ((179, 0, 194), "duck"),
            ((174, 255, 243), "earless_seal"),
            ((110, 76, 0), "elephant"),
            ((73, 77, 174), "fish"),
            ((250, 170, 30), "flying_disc"),
            ((0, 125, 92), "fox"),
            ((107, 142, 35), "frog"),
            ((0, 82, 0), "giant_panda"),
            ((72, 0, 118), "giraffe"),
            ((182, 182, 255), "horse"),
            ((255, 179, 240), "leopard"),
            ((119, 11, 32), "lizard"),
            ((0, 60, 100), "monkey"),
            ((0, 0, 230), "motorbike"),
            ((130, 114, 135), "mouse"),
            ((165, 42, 42), "parrot"),
            ((220, 20, 60), "person"),
            ((100, 170, 30), "rabbit"),
            ((183, 130, 88), "shark"),
            ((134, 134, 103), "skateboard"),
            ((5, 121, 0), "snake"),
            ((133, 129, 255), "snowboard"),
            ((188, 208, 182), "squirrel"),
            ((145, 148, 174), "surfboard"),
            ((255, 208, 186), "tennis_racket"),
            ((166, 196, 102), "tiger"),
            ((0, 80, 100), "train"),
            ((0, 0, 70), "truck"),
            ((0, 143, 149), "turtle"),
            ((0, 228, 0), "whale"),
            ((199, 100, 0), "zebra"),
        ),
        start=1,
    )
]
def _get_ytvis_2019_instances_meta():
    """Build the metadata dict for the YTVIS 2019 "thing" categories.

    Returns:
        dict with keys
        "thing_dataset_id_to_contiguous_id" (category id -> index in [0, 39]),
        "thing_classes" (category names) and "thing_colors" (category colors),
        all in the order of ``YTVIS_CATEGORIES_2019``.
    """
    things = [cat for cat in YTVIS_CATEGORIES_2019 if cat["isthing"] == 1]
    assert len(things) == 40, len(things)
    return {
        # Mapping from the dataset's own category id to a 0-based contiguous id.
        "thing_dataset_id_to_contiguous_id": {
            cat["id"]: position for position, cat in enumerate(things)
        },
        "thing_classes": [cat["name"] for cat in things],
        "thing_colors": [cat["color"] for cat in things],
    }
def _get_ytvis_2021_instances_meta():
    """Build the metadata dict for the YTVIS 2021 "thing" categories.

    Returns:
        dict with keys
        "thing_dataset_id_to_contiguous_id" (category id -> index in [0, 39]),
        "thing_classes" (category names) and "thing_colors" (category colors),
        all in the order of ``YTVIS_CATEGORIES_2021``.
    """
    things = [cat for cat in YTVIS_CATEGORIES_2021 if cat["isthing"] == 1]
    assert len(things) == 40, len(things)
    return {
        # Mapping from the dataset's own category id to a 0-based contiguous id.
        "thing_dataset_id_to_contiguous_id": {
            cat["id"]: position for position, cat in enumerate(things)
        },
        "thing_classes": [cat["name"] for cat in things],
        "thing_colors": [cat["color"] for cat in things],
    }
def load_ytvis_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None):
    """
    Load a YTVIS-format json annotation file into a list of per-video dicts.

    Args:
        json_file (str): annotation file; resolved to a local path through
            ``PathManager.get_local_path``.
        image_root (str or path-like): directory every entry of a video's
            "file_names" is joined onto.
        dataset_name (str or None): when given, the ``MetadataCatalog`` entry
            of that name receives ``thing_classes`` and
            ``thing_dataset_id_to_contiguous_id``, and the "category_id" of
            every returned object is remapped to a 0-based contiguous id.
        extra_annotation_keys (list[str] or None): annotation fields to copy
            in addition to "iscrowd", "category_id" and "id".

    Returns:
        list[dict]: one record per video (sorted by video id) with keys
        "file_names", "height", "width", "length", "video_id" and
        "annotations". "annotations" holds one list per frame; each element is
        an object dict with the copied annotation fields plus "bbox",
        "bbox_mode" (XYWH_ABS) and "segmentation".
    """
    from .ytvis_api.ytvos import YTVOS

    timer = Timer()
    json_file = PathManager.get_local_path(json_file)
    # The API object prints progress messages; keep them off stdout.
    with contextlib.redirect_stdout(io.StringIO()):
        ytvis_api = YTVOS(json_file)
    if timer.seconds() > 1:
        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))

    id_map = None
    if dataset_name is not None:
        meta = MetadataCatalog.get(dataset_name)
        cat_ids = sorted(ytvis_api.getCatIds())
        # The categories in a custom json file may not be sorted.
        categories = sorted(ytvis_api.loadCats(cat_ids), key=lambda cat: cat["id"])
        meta.thing_classes = [cat["name"] for cat in categories]

        # Category ids are always translated to contiguous 0-based ids built
        # from the "categories" field of the json. A warning is only printed
        # when the ids are not already 1..#categories and the dataset name
        # does not contain "coco".
        ids_are_one_to_n = min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)
        if not ids_are_one_to_n and "coco" not in dataset_name:
            logger.warning(
                """
Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
"""
            )
        id_map = {cat_id: contiguous for contiguous, cat_id in enumerate(cat_ids)}
        meta.thing_dataset_id_to_contiguous_id = id_map

    # sort indices for reproducible results
    vid_ids = sorted(ytvis_api.vids.keys())
    # Each video dict looks something like:
    # {'license': 1,
    #  'flickr_url': ' ',
    #  'file_names': ['ff25f55852/00000.jpg', 'ff25f55852/00005.jpg', ..., 'ff25f55852/00175.jpg'],
    #  'height': 720,
    #  'width': 1280,
    #  'length': 36,
    #  'date_captured': '2019-04-11 00:55:41.903902',
    #  'id': 2232}
    vids = ytvis_api.loadVids(vid_ids)
    anns = [ytvis_api.vidToAnns[vid_id] for vid_id in vid_ids]

    total_num_valid_anns = sum(len(video_anns) for video_anns in anns)
    total_num_anns = len(ytvis_api.anns)
    if total_num_valid_anns < total_num_anns:
        logger.warning(
            f"{json_file} contains {total_num_anns} annotations, but only "
            f"{total_num_valid_anns} of them match to images in the file."
        )

    vids_anns = list(zip(vids, anns))
    logger.info("Loaded {} videos in YTVIS format from {}".format(len(vids_anns), json_file))

    ann_keys = ["iscrowd", "category_id", "id"] + (extra_annotation_keys or [])
    num_instances_without_valid_segmentation = 0
    dataset_dicts = []

    for vid_dict, anno_dict_list in vids_anns:
        record = {
            "file_names": [
                os.path.join(image_root, vid_dict["file_names"][i])
                for i in range(vid_dict["length"])
            ],
            "height": vid_dict["height"],
            "width": vid_dict["width"],
            "length": vid_dict["length"],
            "video_id": vid_dict["id"],
        }
        video_id = record["video_id"]

        per_frame_objs = []
        for frame_idx in range(record["length"]):
            objs_in_frame = []
            for anno in anno_dict_list:
                assert anno["video_id"] == video_id

                boxes = anno.get("bboxes", None)
                segms = anno.get("segmentations", None)
                # Skip objects that have no box or no mask in this frame.
                if not boxes or not segms or not boxes[frame_idx] or not segms[frame_idx]:
                    continue

                segm = segms[frame_idx]
                if isinstance(segm, dict):
                    if isinstance(segm["counts"], list):
                        # convert to compressed RLE
                        segm = mask_util.frPyObjects(segm, *segm["size"])
                elif segm:
                    # filter out invalid polygons (< 3 points)
                    segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
                    if not segm:
                        num_instances_without_valid_segmentation += 1
                        continue  # ignore this instance

                obj = {key: anno[key] for key in ann_keys if key in anno}
                obj["bbox"] = boxes[frame_idx]
                obj["bbox_mode"] = BoxMode.XYWH_ABS
                obj["segmentation"] = segm
                if id_map:
                    obj["category_id"] = id_map[obj["category_id"]]
                objs_in_frame.append(obj)
            per_frame_objs.append(objs_in_frame)

        record["annotations"] = per_frame_objs
        dataset_dicts.append(record)

    if num_instances_without_valid_segmentation > 0:
        logger.warning(
            "Filtered out {} instances without valid segmentation. ".format(
                num_instances_without_valid_segmentation
            )
            + "There might be issues in your dataset generation process. "
            "A valid polygon should be a list[float] with even length >= 6."
        )
    return dataset_dicts
def register_ytvis_instances(name, metadata, json_file, image_root):
    """
    Register a dataset in YTVIS's json annotation format for
    instance tracking.

    Args:
        name (str): the name that identifies a dataset, e.g. "ytvis_train".
        metadata (dict): extra metadata associated with this dataset. You can
            leave it as an empty dict.
        json_file (str): path to the json instance annotation file.
        image_root (str or path-like): directory which contains all the images.
    """
    path_types = (str, os.PathLike)
    for value, accepted in ((name, str), (json_file, path_types), (image_root, path_types)):
        assert isinstance(value, accepted), value

    def _load():
        # Deferred loader: the json is only parsed when the dataset is requested.
        return load_ytvis_json(json_file, image_root, name)

    # 1. register a function which returns dicts
    DatasetCatalog.register(name, _load)

    # 2. Optionally, add metadata about this dataset,
    # since they might be useful in evaluation, visualization or logging
    catalog_entry = MetadataCatalog.get(name)
    catalog_entry.set(
        json_file=json_file,
        image_root=image_root,
        evaluator_type="ytvis",
        **metadata
    )
if __name__ == "__main__":
    # Manual check of the YTVIS json loader: load a hard-coded annotation file
    # and save one visualization per frame under ./ytvis-data-vis/<video>/.
    from detectron2.utils.logger import setup_logger
    from detectron2.utils.visualizer import Visualizer
    import detectron2.data.datasets  # noqa # add pre-defined metadata
    import sys
    from PIL import Image

    # Rebinds the module-level ``logger`` that load_ytvis_json logs through.
    logger = setup_logger(name=__name__)
    #assert sys.argv[3] in DatasetCatalog.list()
    meta = MetadataCatalog.get("ytvis_2019_train")

    json_file = "./datasets/ytvis/instances_train_sub.json"
    image_root = "./datasets/ytvis/train/JPEGImages"
    records = load_ytvis_json(json_file, image_root, dataset_name="ytvis_2019_train")
    logger.info("Done loading {} samples.".format(len(records)))

    out_dir = "ytvis-data-vis"
    os.makedirs(out_dir, exist_ok=True)

    def extract_frame_dic(dic, frame_idx):
        # Deep-copy a video record and narrow its annotations to one frame.
        import copy

        per_frame = copy.deepcopy(dic)
        annotations = per_frame.get("annotations", None)
        if annotations:
            per_frame["annotations"] = annotations[frame_idx]
        return per_frame

    for video in records:
        frame_paths = video["file_names"]
        # The video name is the parent directory of the first frame.
        video_name = frame_paths[0].split('/')[-2]
        os.makedirs(os.path.join(out_dir, video_name), exist_ok=True)
        for frame_idx, frame_path in enumerate(frame_paths):
            pixels = np.array(Image.open(frame_path))
            painter = Visualizer(pixels, metadata=meta)
            drawn = painter.draw_dataset_dict(extract_frame_dic(video, frame_idx))
            target = os.path.join(out_dir, video_name, frame_path.split('/')[-1])
            drawn.save(target)
================================================
FILE: mask2former_video/data_video/datasets/ytvis_api/__init__.py
================================================
# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi
================================================
FILE: mask2former_video/data_video/datasets/ytvis_api/ytvos.py
================================================
# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi
__author__ = 'ychfan'
# Interface for accessing the YouTubeVIS dataset.
# The following API functions are defined:
# YTVOS - YTVOS api class that loads YouTubeVIS annotation file and prepare data structures.
# decodeMask - Decode binary mask M encoded via run-length encoding.
# encodeMask - Encode binary mask M using run-length encoding.
# getAnnIds - Get ann ids that satisfy given filter conditions.
# getCatIds - Get cat ids that satisfy given filter conditions.
# getImgIds - Get img ids that satisfy given filter conditions.
# loadAnns - Load anns with the specified ids.
# loadCats - Load cats with the specified ids.
# loadImgs - Load imgs with the specified ids.
# annToMask - Convert segmentation in an annotation to binary mask.
# loadRes - Load algorithm results and create API for accessing them.
# Microsoft COCO Toolbox. version 2.0
# Data, paper, and tutorials available at: http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
# Licensed under the Simplified BSD License [see bsd.txt]
import json
import time
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon
import numpy as np
import copy
import itertools
from pycocotools import mask as maskUtils
import os
from collections import defaultdict
import sys
# Major version of the running interpreter; it decides which module
# ``urlretrieve`` is imported from.
PYTHON_VERSION = sys.version_info.major
if PYTHON_VERSION == 2:
    from urllib import urlretrieve
elif PYTHON_VERSION == 3:
    from urllib.request import urlretrieve
def _isArrayLike(obj):
    """Return True when ``obj`` exposes both ``__iter__`` and ``__len__``."""
    required = ('__iter__', '__len__')
    return all(hasattr(obj, attr) for attr in required)
class YTVOS:
    """Index and query a YouTubeVIS annotation file (adapted from the COCO API).

    Attributes:
        dataset: the raw annotation dict as loaded from json.
        anns: annotation id -> annotation dict.
        cats: category id -> category dict.
        vids: video id -> video dict.
        vidToAnns: video id -> list of annotation dicts of that video.
        catToVids: category id -> list of video ids, one entry per annotation
            of that category (a video id may therefore repeat).
    """

    def __init__(self, annotation_file=None):
        """
        Constructor of the YouTubeVIS helper class for reading annotations.
        :param annotation_file (str): location of annotation file; when None
                                      an empty api object is created
        :return:
        """
        # load dataset
        self.dataset, self.anns, self.cats, self.vids = dict(), dict(), dict(), dict()
        self.vidToAnns, self.catToVids = defaultdict(list), defaultdict(list)
        if annotation_file is not None:
            print('loading annotations into memory...')
            tic = time.time()
            # Context manager so the file handle is closed deterministically.
            with open(annotation_file, 'r') as f:
                dataset = json.load(f)
            assert isinstance(dataset, dict), 'annotation file format {} not supported'.format(type(dataset))
            print('Done (t={:0.2f}s)'.format(time.time()- tic))
            self.dataset = dataset
            self.createIndex()

    def createIndex(self):
        """
        (Re)build anns, cats, vids, vidToAnns and catToVids from self.dataset.
        Sections missing from the dataset simply leave their index empty.
        :return:
        """
        # create index
        print('creating index...')
        anns, cats, vids = {}, {}, {}
        vidToAnns, catToVids = defaultdict(list), defaultdict(list)
        if 'annotations' in self.dataset:
            for ann in self.dataset['annotations']:
                vidToAnns[ann['video_id']].append(ann)
                anns[ann['id']] = ann

        if 'videos' in self.dataset:
            for vid in self.dataset['videos']:
                vids[vid['id']] = vid

        if 'categories' in self.dataset:
            for cat in self.dataset['categories']:
                cats[cat['id']] = cat

        if 'annotations' in self.dataset and 'categories' in self.dataset:
            for ann in self.dataset['annotations']:
                catToVids[ann['category_id']].append(ann['video_id'])

        print('index created!')

        # create class members
        self.anns = anns
        self.vidToAnns = vidToAnns
        self.catToVids = catToVids
        self.vids = vids
        self.cats = cats

    def info(self):
        """
        Print information about the annotation file.
        :return:
        """
        for key, value in self.dataset['info'].items():
            print('{}: {}'.format(key, value))

    def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None):
        """
        Get ann ids that satisfy given filter conditions. default skips that filter
        :param vidIds  (int array)     : get anns for given vids
               catIds  (int array)     : get anns for given cats
               areaRng (float array)   : get anns for given area range (e.g. [0 inf]),
                                         compared against ann['avg_area'], bounds exclusive
               iscrowd (boolean)       : get anns for given crowd label (False or True)
        :return: ids (int array)       : integer array of ann ids
        """
        vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        if len(vidIds) != 0:
            # Membership test first so unknown ids do not create defaultdict entries.
            lists = [self.vidToAnns[vidId] for vidId in vidIds if vidId in self.vidToAnns]
            anns = list(itertools.chain.from_iterable(lists))
        else:
            anns = self.dataset['annotations']
        if len(catIds) != 0:
            anns = [ann for ann in anns if ann['category_id'] in catIds]
        if len(areaRng) != 0:
            anns = [ann for ann in anns if areaRng[0] < ann['avg_area'] < areaRng[1]]

        if iscrowd is not None:
            ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
        else:
            ids = [ann['id'] for ann in anns]
        return ids

    def getCatIds(self, catNms=[], supNms=[], catIds=[]):
        """
        filtering parameters. default skips that filter.
        :param catNms (str array)  : get cats for given cat names
        :param supNms (str array)  : get cats for given supercategory names
        :param catIds (int array)  : get cats for given cat ids
        :return: ids (int array)   : integer array of cat ids
        """
        catNms = catNms if _isArrayLike(catNms) else [catNms]
        supNms = supNms if _isArrayLike(supNms) else [supNms]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        cats = self.dataset['categories']
        if len(catNms) != 0:
            cats = [cat for cat in cats if cat['name'] in catNms]
        if len(supNms) != 0:
            cats = [cat for cat in cats if cat['supercategory'] in supNms]
        if len(catIds) != 0:
            cats = [cat for cat in cats if cat['id'] in catIds]
        ids = [cat['id'] for cat in cats]
        return ids

    def getVidIds(self, vidIds=[], catIds=[]):
        '''
        Get vid ids that satisfy given filter conditions.
        :param vidIds (int array) : get vids for given ids
        :param catIds (int array) : get vids with all given cats
        :return: ids (int array)  : integer array of vid ids
        '''
        vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        if len(vidIds) == len(catIds) == 0:
            ids = self.vids.keys()
        else:
            ids = set(vidIds)
            for i, catId in enumerate(catIds):
                # .get() instead of indexing: looking up an unknown category
                # must not insert an empty entry into the defaultdict index.
                vids_of_cat = set(self.catToVids.get(catId, ()))
                if i == 0 and len(ids) == 0:
                    ids = vids_of_cat
                else:
                    ids &= vids_of_cat
        return list(ids)

    @staticmethod
    def _lookup(table, ids):
        """Return the entries of ``table`` for an id sequence or a single integer id.

        Returns None when ``ids`` is neither array-like nor an integer.
        """
        if _isArrayLike(ids):
            return [table[key] for key in ids]
        # numpy integers (e.g. produced by np.unique) are accepted as well.
        if isinstance(ids, (int, np.integer)):
            return [table[ids]]
        return None

    def loadAnns(self, ids=[]):
        """
        Load anns with the specified ids.
        :param ids (int array)       : integer ids specifying anns
        :return: anns (object array) : loaded ann objects
        """
        return self._lookup(self.anns, ids)

    def loadCats(self, ids=[]):
        """
        Load cats with the specified ids.
        :param ids (int array)       : integer ids specifying cats
        :return: cats (object array) : loaded cat objects
        """
        return self._lookup(self.cats, ids)

    def loadVids(self, ids=[]):
        """
        Load vids with the specified ids.
        :param ids (int array)       : integer ids specifying vid
        :return: vids (object array) : loaded vid objects
        """
        return self._lookup(self.vids, ids)

    def loadRes(self, resFile):
        """
        Load result file and return a result api object.
        :param   resFile (str)     : file name of result file; a numpy array or an
                                     already loaded list of result dicts is accepted too
        :return: res (obj)         : result api object
        """
        res = YTVOS()
        res.dataset['videos'] = list(self.dataset['videos'])

        print('Loading and preparing results...')
        tic = time.time()
        if isinstance(resFile, str) or (PYTHON_VERSION == 2 and isinstance(resFile, unicode)):  # noqa: F821
            with open(resFile) as f:
                anns = json.load(f)
        elif isinstance(resFile, np.ndarray):
            # NOTE(review): loadNumpyAnnotations is not defined on this class,
            # so this branch raises AttributeError unless a subclass provides it.
            anns = self.loadNumpyAnnotations(resFile)
        else:
            anns = resFile
        assert isinstance(anns, list), 'results in not an array of objects'
        annsVidIds = [ann['video_id'] for ann in anns]
        assert set(annsVidIds).issubset(self.getVidIds()), \
            'Results do not correspond to current coco set'
        # An empty result list is valid input: there is nothing to post-process.
        if anns and 'segmentations' in anns[0]:
            res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
            for ann_id, ann in enumerate(anns, start=1):
                ann['areas'] = []
                if 'bboxes' not in ann:
                    ann['bboxes'] = []
                for seg in ann['segmentations']:
                    # now only support compressed RLE format as segmentation results
                    if seg:
                        ann['areas'].append(maskUtils.area(seg))
                        # Only derive a box when the result did not provide one
                        # for this frame.
                        if len(ann['bboxes']) < len(ann['areas']):
                            ann['bboxes'].append(maskUtils.toBbox(seg))
                    else:
                        ann['areas'].append(None)
                        if len(ann['bboxes']) < len(ann['areas']):
                            ann['bboxes'].append(None)
                ann['id'] = ann_id
                # Average over frames with a non-zero area only.
                valid_areas = [a for a in ann['areas'] if a]
                if len(valid_areas) == 0:
                    ann['avg_area'] = 0
                else:
                    ann['avg_area'] = np.array(valid_areas).mean()
                ann['iscrowd'] = 0
        print('DONE (t={:0.2f}s)'.format(time.time()- tic))

        res.dataset['annotations'] = anns
        res.createIndex()
        return res

    def annToRLE(self, ann, frameId):
        """
        Convert the segmentation of one frame, which can be polygons or
        uncompressed RLE, to RLE. An already compressed RLE is returned as is.
        :param ann (dict)    : annotation holding 'video_id' and 'segmentations'
        :param frameId (int) : index into ann['segmentations']
        :return: rle (as produced by pycocotools)
        """
        t = self.vids[ann['video_id']]
        h, w = t['height'], t['width']
        segm = ann['segmentations'][frameId]
        if isinstance(segm, list):
            # polygon -- a single object might consist of multiple parts
            # we merge all parts into one mask rle code
            rles = maskUtils.frPyObjects(segm, h, w)
            rle = maskUtils.merge(rles)
        elif isinstance(segm['counts'], list):
            # uncompressed RLE
            rle = maskUtils.frPyObjects(segm, h, w)
        else:
            # rle
            rle = segm
        return rle

    def annToMask(self, ann, frameId):
        """
        Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
        :return: binary mask (numpy 2D array)
        """
        rle = self.annToRLE(ann, frameId)
        m = maskUtils.decode(rle)
        return m
================================================
FILE: mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py
================================================
# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi
__author__ = 'ychfan'
import numpy as np
import datetime
import time
from collections import defaultdict
from pycocotools import mask as maskUtils
import copy
class YTVOSeval:
# Interface for evaluating video instance segmentation on the YouTubeVIS dataset.
#
# The usage for YTVOSeval is as follows:
# cocoGt=..., cocoDt=... # load dataset and results
# E = YTVOSeval(cocoGt,cocoDt); # initialize YTVOSeval object
# E.params.recThrs = ...; # set parameters as desired
# E.evaluate(); # run per image evaluation
# E.accumulate(); # accumulate per image results
# E.summarize(); # display summary metrics of results
# For example usage see evalDemo.m and http://mscoco.org/.
#
# The evaluation parameters are as follows (defaults in brackets):
# imgIds - [all] N img ids to use for evaluation
# catIds - [all] K cat ids to use for evaluation
# iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation
# recThrs - [0:.01:1] R=101 recall thresholds for evaluation
# areaRng - [...] A=4 object area ranges for evaluation
# maxDets - [1 10 100] M=3 thresholds on max detections per image
# iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints'
# iouType replaced the now DEPRECATED useSegm parameter.
# useCats - [1] if true use category labels for evaluation
# Note: if useCats=0 category labels are ignored as in proposal scoring.
# Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
#
# evaluate(): evaluates detections on every image and every category and
# concats the results into the "evalImgs" with fields:
# dtIds - [1xD] id for each of the D detections (dt)
# gtIds - [1xG] id for each of the G ground truths (gt)
# dtMatches - [TxD] matching gt id at each IoU or 0
# gtMatches - [TxG] matching dt id at each IoU or 0
# dtScores - [1xD] confidence of each dt
# gtIgnore - [1xG] ignore flag for each gt
# dtIgnore - [TxD] ignore flag for each dt at each IoU
#
# accumulate(): accumulates the per-image, per-category evaluation
# results in "evalImgs" into the dictionary "eval" with fields:
# params - parameters used for evaluation
# date - date evaluation was performed
# counts - [T,R,K,A,M] parameter dimensions (see above)
# precision - [TxRxKxAxM] precision for every evaluation setting
# recall - [TxKxAxM] max recall for every evaluation setting
# Note: precision and recall==-1 for settings with no gt objects.
#
# See also coco, mask, pycocoDemo, pycocoEvalDemo
#
# Microsoft COCO Toolbox. version 2.0
# Data, paper, and tutorials available at: http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
# Licensed under the Simplified BSD License [see coco/license.txt]
def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
    '''
    Initialize the evaluator from the ground-truth and detection api objects.
    :param cocoGt: api object with ground truth annotations
    :param cocoDt: api object with detection results
    :param iouType: value forwarded to Params; a falsy value only triggers a notice
    :return: None
    '''
    if not iouType:
        print('iouType not specified. use default iouType segm')

    # inputs
    self.cocoGt = cocoGt                    # ground truth api
    self.cocoDt = cocoDt                    # detections api

    # evaluation state, filled in by later stages
    self.evalVids = defaultdict(list)       # per-video per-category evaluation results
    self.eval = {}                          # accumulated evaluation results
    self._gts = defaultdict(list)           # gt for evaluation
    self._dts = defaultdict(list)           # dt for evaluation
    self.params = Params(iouType=iouType)   # evaluation parameters
    self._paramsEval = {}                   # parameters used for the last evaluation
    self.stats = []                         # result summarization
    self.ious = {}                          # ious between all gts and dts

    # By default evaluate over every video and category of the ground truth.
    if cocoGt is not None:
        self.params.vidIds = sorted(cocoGt.getVidIds())
        self.params.catIds = sorted(cocoGt.getCatIds())
def _prepare(self):
'''
Prepare ._gts and ._dts for evaluation based on params
:return: None
'''
def _toMask(anns, coco):
# modify ann['segmentation'] by reference
for ann in anns:
for i, a in enumerate(ann['segmentations']):
if a:
rle = coco.annToRLE(ann, i)
ann['segmentations'][i] = rle
l = [a for a in ann['areas'] if a]
if len(l)==0:
ann['avg_area'] = 0
else:
ann['avg_area'] = np.array(l).mean()
p = self.params
if p.useCats:
gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds))
dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds))
else:
gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds))
dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds))
# convert ground truth to mask if iouType == 'segm'
if p.iouType == 'segm':
_toMask(gts, self.cocoGt)
_toMask(dts, self.cocoDt)
# set ignore flag
for gt in gts:
gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0
gt['ignore'] = 'iscrowd' in gt and gt['iscrowd']
if p.iouType == 'keypoints':
gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore']
self._gts = defaultdict(list) # gt for evaluation
self._dts = defaultdict(list) # dt for evaluation
for gt in gts:
self._gts[gt['video_id'], gt['category_id']].append(gt)
for dt in dts:
self._dts[dt['video_id'], dt['category_id']].append(dt)
self.evalVids = defaultdict(list) # per-image per-category evaluation results
self.eval = {} # accumulated evaluation results
def evaluate(self):
    '''
    Run per-video evaluation and store the results in self.evalImgs.

    self.evalImgs is a flat list holding one evaluateVid() result (a dict or
    None) per (category, area range, video) combination, ordered category
    first, then area range, then video; accumulate() indexes into the list
    using that ordering. The pairwise IoU / OKS matrices are cached in
    self.ious and a deep copy of the parameters used is kept in
    self._paramsEval.
    :return: None
    :raises Exception: if params.iouType is not 'segm', 'bbox' or 'keypoints'
    '''
    tic = time.time()
    print('Running per image evaluation...')
    p = self.params
    # add backward compatibility if useSegm is specified in params
    if p.useSegm is not None:
        p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
        print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
    print('Evaluate annotation type *{}*'.format(p.iouType))

    # Pick the similarity function up front so that an unsupported iouType
    # fails with a clear message instead of an UnboundLocalError later on.
    if p.iouType == 'segm' or p.iouType == 'bbox':
        computeIoU = self.computeIoU
    elif p.iouType == 'keypoints':
        computeIoU = self.computeOks
    else:
        raise Exception('unknown iouType for iou computation')

    p.vidIds = list(np.unique(p.vidIds))
    if p.useCats:
        p.catIds = list(np.unique(p.catIds))
    p.maxDets = sorted(p.maxDets)
    self.params = p

    self._prepare()
    # loop through videos, area range, max detection number
    catIds = p.catIds if p.useCats else [-1]

    self.ious = {(vidId, catId): computeIoU(vidId, catId)
                 for vidId in p.vidIds
                 for catId in catIds}

    evaluateVid = self.evaluateVid
    # only the largest detection budget is evaluated here; accumulate()
    # truncates to the smaller budgets
    maxDet = p.maxDets[-1]

    self.evalImgs = [evaluateVid(vidId, catId, areaRng, maxDet)
                     for catId in catIds
                     for areaRng in p.areaRng
                     for vidId in p.vidIds
                     ]
    self._paramsEval = copy.deepcopy(self.params)
    toc = time.time()
    print('DONE (t={:0.2f}s).'.format(toc-tic))
def computeIoU(self, vidId, catId):
    '''
    Compute the sequence-level IoU between detections and ground truths of one video.

    Detections are sorted by descending score and truncated to the largest
    entry of params.maxDets. For every (detection, ground truth) pair the
    per-frame intersection and union areas are summed over the frames and
    the ratio of the two sums is used as IoU.
    :return: [] when the video has neither gts nor dts for the category,
             otherwise an array of shape (num_dts, num_gts)
    '''
    params = self.params
    if params.useCats:
        gts = self._gts[vidId, catId]
        dts = self._dts[vidId, catId]
    else:
        gts = [ann for cId in params.catIds for ann in self._gts[vidId, cId]]
        dts = [ann for cId in params.catIds for ann in self._dts[vidId, cId]]
    if not gts and not dts:
        return []

    # mergesort keeps the ordering of equal scores stable
    order = np.argsort([-det['score'] for det in dts], kind='mergesort')
    dts = [dts[idx] for idx in order]
    if len(dts) > params.maxDets[-1]:
        dts = dts[0:params.maxDets[-1]]

    if params.iouType == 'segm':
        field = 'segmentations'
    elif params.iouType == 'bbox':
        field = 'bboxes'
    else:
        raise Exception('unknown iouType for iou computation')
    gt_seqs = [ann[field] for ann in gts]
    dt_seqs = [ann[field] for ann in dts]

    # Not consumed below (maskUtils.iou is not used here); evaluated anyway so
    # that behaviour is unchanged, including the KeyError on a gt without 'iscrowd'.
    iscrowd = [int(ann['iscrowd']) for ann in gts]

    def sequence_iou(det_frames, gt_frames):
        # accumulate intersection / union areas frame by frame
        inter = .0
        union = .0
        for det, ref in zip(det_frames, gt_frames):
            if det and ref:
                pair = [det, ref]
                inter += maskUtils.area(maskUtils.merge(pair, True))
                union += maskUtils.area(maskUtils.merge(pair, False))
            elif ref:
                # object present only in the ground truth for this frame
                union += maskUtils.area(ref)
            elif det:
                # object present only in the detection for this frame
                union += maskUtils.area(det)
        if not union > .0:
            print("Mask sizes in video {} and category {} may not match!".format(vidId, catId))
        return inter / union if union > .0 else .0

    ious = np.zeros([len(dt_seqs), len(gt_seqs)])
    for row, det_frames in enumerate(dt_seqs):
        for col, gt_frames in enumerate(gt_seqs):
            ious[row, col] = sequence_iou(det_frames, gt_frames)
    return ious
def computeOks(self, imgId, catId):
    '''
    Compute the object keypoint similarity between detections and ground truths.

    Detections are sorted by descending score and truncated to the largest
    entry of params.maxDets. Reads 'keypoints' (flat x, y, v triplets),
    'bbox' and 'avg_area' from each ground truth and 'keypoints' from each
    detection.
    :return: [] when either side is empty, otherwise an array of shape
             (num_dts, num_gts)
    '''
    params = self.params
    # dimension here should be Nxm
    gts = self._gts[imgId, catId]
    dts = self._dts[imgId, catId]
    order = np.argsort([-det['score'] for det in dts], kind='mergesort')
    dts = [dts[idx] for idx in order]
    if len(dts) > params.maxDets[-1]:
        dts = dts[0:params.maxDets[-1]]
    if len(gts) == 0 or len(dts) == 0:
        return []

    ious = np.zeros((len(dts), len(gts)))
    sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0
    variances = (sigmas * 2)**2
    num_kps = len(sigmas)

    # compute oks between each detection and ground truth object
    for col, gt in enumerate(gts):
        gt_kps = np.array(gt['keypoints'])
        xg, yg, vg = gt_kps[0::3], gt_kps[1::3], gt_kps[2::3]
        labelled = vg > 0
        num_labelled = np.count_nonzero(labelled)
        # bounds of the ignore region (double the gt bbox)
        box = gt['bbox']
        x0 = box[0] - box[2]
        x1 = box[0] + box[2] * 2
        y0 = box[1] - box[3]
        y1 = box[1] + box[3] * 2
        for row, dt in enumerate(dts):
            dt_kps = np.array(dt['keypoints'])
            xd, yd = dt_kps[0::3], dt_kps[1::3]
            if num_labelled > 0:
                # per-keypoint distance when the gt has labelled keypoints
                dx = xd - xg
                dy = yd - yg
            else:
                # distance to the region spanned by (x0,y0) & (x1,y1)
                zeros = np.zeros((num_kps))
                dx = np.maximum(zeros, x0 - xd) + np.maximum(zeros, xd - x1)
                dy = np.maximum(zeros, y0 - yd) + np.maximum(zeros, yd - y1)
            e = (dx**2 + dy**2) / variances / (gt['avg_area'] + np.spacing(1)) / 2
            if num_labelled > 0:
                e = e[labelled]
            ious[row, col] = np.sum(np.exp(-e)) / e.shape[0]
    return ious
def evaluateVid(self, vidId, catId, aRng, maxDet):
    '''
    Perform evaluation for a single video and category.

    Ground truths flagged 'ignore' or whose 'avg_area' lies outside
    [aRng[0], aRng[1]] are marked '_ignore' and sorted last. Detections are
    taken in descending score order (at most maxDet) and, for each IoU
    threshold in params.iouThrs, greedily matched to the ground truth with the
    highest IoU at or above the threshold, using the IoUs cached in self.ious.
    :param vidId: video id
    :param catId: category id (ignored for selection when params.useCats is off)
    :param aRng: (min_area, max_area) pair
    :param maxDet: maximum number of detections to consider
    :return: dict (single video results), or None when the video has neither
             gts nor dts for the category
    '''
    p = self.params
    if p.useCats:
        gt = self._gts[vidId,catId]
        dt = self._dts[vidId,catId]
    else:
        gt = [_ for cId in p.catIds for _ in self._gts[vidId,cId]]
        dt = [_ for cId in p.catIds for _ in self._dts[vidId,cId]]
    if len(gt) == 0 and len(dt) ==0:
        return None

    for g in gt:
        # ignored gts neither count as misses nor produce false positives
        if g['ignore'] or (g['avg_area'] < aRng[0] or g['avg_area'] > aRng[1]):
            g['_ignore'] = 1
        else:
            g['_ignore'] = 0

    # sort dt highest score first, sort gt ignore last
    gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
    gt = [gt[i] for i in gtind]
    dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
    dt = [dt[i] for i in dtind[0:maxDet]]
    iscrowd = [int(o['iscrowd']) for o in gt]
    # load computed ious; columns are reordered to follow the sorted gts
    ious = self.ious[vidId, catId][:, gtind] if len(self.ious[vidId, catId]) > 0 else self.ious[vidId, catId]

    T = len(p.iouThrs)
    G = len(gt)
    D = len(dt)
    gtm = np.zeros((T,G))
    dtm = np.zeros((T,D))
    gtIg = np.array([g['_ignore'] for g in gt])
    dtIg = np.zeros((T,D))
    if not len(ious)==0:
        for tind, t in enumerate(p.iouThrs):
            for dind, d in enumerate(dt):
                # information about best match so far (m=-1 -> unmatched)
                iou = min([t,1-1e-10])
                m = -1
                for gind, g in enumerate(gt):
                    # if this gt already matched, and not a crowd, continue
                    if gtm[tind,gind]>0 and not iscrowd[gind]:
                        continue
                    # if dt matched to reg gt, and on ignore gt, stop
                    if m>-1 and gtIg[m]==0 and gtIg[gind]==1:
                        break
                    # continue to next gt unless better match made
                    if ious[dind,gind] < iou:
                        continue
                    # if match successful and best so far, store appropriately
                    iou=ious[dind,gind]
                    m=gind
                # if match made store id of match for both dt and gt
                if m ==-1:
                    continue
                dtIg[tind,dind] = gtIg[m]
                dtm[tind,dind] = gt[m]['id']
                gtm[tind,m] = d['id']
    # set unmatched detections outside of area range to ignore
    a = np.array([d['avg_area'] < aRng[0] or d['avg_area'] > aRng[1] for d in dt]).reshape((1, len(dt)))
    dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0)))
    # store results for given video and category
    return {
            'video_id':     vidId,
            'category_id':  catId,
            'aRng':         aRng,
            'maxDet':       maxDet,
            'dtIds':        [d['id'] for d in dt],
            'gtIds':        [g['id'] for g in gt],
            'dtMatches':    dtm,
            'gtMatches':    gtm,
            'dtScores':     [d['score'] for d in dt],
            'gtIgnore':     gtIg,
            'dtIgnore':     dtIg,
        }
def accumulate(self, p = None):
    '''
    Accumulate per-video evaluation results and store the result in self.eval.

    self.evalImgs is read as a flat list ordered category first, then area
    range, then video, and must have been produced with the parameters kept
    in self._paramsEval. self.eval receives 'precision' [TxRxKxAxM],
    'recall' [TxKxAxM] and 'scores' [TxRxKxAxM]; entries stay at -1 where
    nothing could be evaluated.
    :param p: input params for evaluation (defaults to self.params)
    :return: None
    '''
    print('Accumulating evaluation results...')
    tic = time.time()
    if not self.evalImgs:
        print('Please run evaluate() first')
    # allows input customized parameters
    if p is None:
        p = self.params
    p.catIds = p.catIds if p.useCats == 1 else [-1]
    T           = len(p.iouThrs)
    R           = len(p.recThrs)
    K           = len(p.catIds) if p.useCats else 1
    A           = len(p.areaRng)
    M           = len(p.maxDets)
    precision   = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories
    recall      = -np.ones((T,K,A,M))
    scores      = -np.ones((T,R,K,A,M))

    # create dictionary for future indexing
    _pe = self._paramsEval
    catIds = _pe.catIds if _pe.useCats else [-1]
    setK = set(catIds)
    setA = set(map(tuple, _pe.areaRng))
    setM = set(_pe.maxDets)
    setI = set(_pe.vidIds)
    # get inds to evaluate
    k_list = [n for n, k in enumerate(p.catIds)  if k in setK]
    m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
    a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
    i_list = [n for n, i in enumerate(p.vidIds)  if i in setI]
    I0 = len(_pe.vidIds)
    A0 = len(_pe.areaRng)
    # retrieve E at each category, area range, and max number of detections
    for k, k0 in enumerate(k_list):
        Nk = k0*A0*I0
        for a, a0 in enumerate(a_list):
            Na = a0*I0
            for m, maxDet in enumerate(m_list):
                E = [self.evalImgs[Nk + Na + i] for i in i_list]
                E = [e for e in E if e is not None]
                if len(E) == 0:
                    continue
                dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])

                # different sorting method generates slightly different results.
                # mergesort is used to be consistent as Matlab implementation.
                inds = np.argsort(-dtScores, kind='mergesort')
                dtScoresSorted = dtScores[inds]

                dtm  = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
                dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet]  for e in E], axis=1)[:,inds]
                gtIg = np.concatenate([e['gtIgnore'] for e in E])
                npig = np.count_nonzero(gtIg==0 )
                if npig == 0:
                    continue
                tps = np.logical_and(               dtm,  np.logical_not(dtIg) )
                fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) )

                # the builtin float is used because the np.float alias no
                # longer exists in recent NumPy releases
                tp_sum = np.cumsum(tps, axis=1).astype(dtype=float)
                fp_sum = np.cumsum(fps, axis=1).astype(dtype=float)
                for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
                    tp = np.array(tp)
                    fp = np.array(fp)
                    nd = len(tp)
                    rc = tp / npig
                    pr = tp / (fp+tp+np.spacing(1))
                    q  = np.zeros((R,))
                    ss = np.zeros((R,))

                    if nd:
                        recall[t,k,a,m] = rc[-1]
                    else:
                        recall[t,k,a,m] = 0

                    # numpy is slow without cython optimization for accessing elements
                    # use python array gets significant speed improvement
                    pr = pr.tolist(); q = q.tolist()

                    # make precision monotonically non-increasing
                    for i in range(nd-1, 0, -1):
                        if pr[i] > pr[i-1]:
                            pr[i-1] = pr[i]

                    rec_inds = np.searchsorted(rc, p.recThrs, side='left')
                    for ri, pi in enumerate(rec_inds):
                        # recall thresholds never reached keep precision/score 0
                        if pi >= nd:
                            break
                        q[ri] = pr[pi]
                        ss[ri] = dtScoresSorted[pi]
                    precision[t,:,k,a,m] = np.array(q)
                    scores[t,:,k,a,m] = np.array(ss)
    self.eval = {
        'params': p,
        'counts': [T, R, K, A, M],
        'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'precision': precision,
        'recall':   recall,
        'scores': scores,
    }
    toc = time.time()
    print('DONE (t={:0.2f}s).'.format( toc-tic))
def summarize(self):
    '''
    Compute and display summary metrics for evaluation results.
    The metrics are printed and stored in self.stats.
    Note this function can *only* be applied on the default parameter setting
    '''
    def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ):
        params = self.params
        template = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
        want_precision = ap == 1
        if want_precision:
            title, tag = 'Average Precision', '(AP)'
        else:
            title, tag = 'Average Recall', '(AR)'
        if iouThr is None:
            iou_text = '{:0.2f}:{:0.2f}'.format(params.iouThrs[0], params.iouThrs[-1])
        else:
            iou_text = '{:0.2f}'.format(iouThr)

        area_idx = [i for i, label in enumerate(params.areaRngLbl) if label == areaRng]
        det_idx = [i for i, limit in enumerate(params.maxDets) if limit == maxDets]

        # precision is [TxRxKxAxM], recall is [TxKxAxM]
        values = self.eval['precision'] if want_precision else self.eval['recall']
        if iouThr is not None:
            values = values[np.where(iouThr == params.iouThrs)[0]]
        if want_precision:
            values = values[:, :, :, area_idx, det_idx]
        else:
            values = values[:, :, area_idx, det_idx]

        # -1 marks entries that could not be evaluated
        valid = values[values > -1]
        mean_s = -1 if len(valid) == 0 else np.mean(valid)
        print(template.format(title, tag, iou_text, areaRng, maxDets, mean_s))
        return mean_s

    def _summarizeDets():
        # (ap, extra keyword arguments, index into params.maxDets or None
        # to keep the default maxDets of _summarize)
        rows = (
            (1, {}, None),
            (1, {'iouThr': .5}, 2),
            (1, {'iouThr': .75}, 2),
            (1, {'areaRng': 'small'}, 2),
            (1, {'areaRng': 'medium'}, 2),
            (1, {'areaRng': 'large'}, 2),
            (0, {}, 0),
            (0, {}, 1),
            (0, {}, 2),
            (0, {'areaRng': 'small'}, 2),
            (0, {'areaRng': 'medium'}, 2),
            (0, {'areaRng': 'large'}, 2),
        )
        stats = np.zeros((12,))
        for slot, (ap, extra, det_slot) in enumerate(rows):
            kwargs = dict(extra)
            if det_slot is not None:
                kwargs['maxDets'] = self.params.maxDets[det_slot]
            stats[slot] = _summarize(ap, **kwargs)
        return stats

    def _summarizeKps():
        variants = (
            {},
            {'iouThr': .5},
            {'iouThr': .75},
            {'areaRng': 'medium'},
            {'areaRng': 'large'},
        )
        stats = np.zeros((10,))
        slot = 0
        for ap in (1, 0):
            for extra in variants:
                stats[slot] = _summarize(ap, maxDets=20, **extra)
                slot += 1
        return stats

    if not self.eval:
        raise Exception('Please run accumulate() first')
    iouType = self.params.iouType
    # any other iouType leaves `summarize` unbound, as before
    if iouType == 'segm' or iouType == 'bbox':
        summarize = _summarizeDets
    elif iouType == 'keypoints':
        summarize = _summarizeKps
    self.stats = summarize()
def __str__(self):
    '''
    Return the text printed by summarize() as a string.

    __str__ has to return a str; the previous implementation returned None,
    which made str(obj) / print(obj) raise TypeError. The summary lines are
    captured instead of being written to stdout, and self.stats is updated
    by summarize() as a side effect.
    '''
    import contextlib
    import io

    captured = io.StringIO()
    with contextlib.redirect_stdout(captured):
        self.summarize()
    return captured.getvalue().rstrip('\n')
class Params:
    '''
    Params for coco evaluation api

    Attributes set by the constructor:
      vidIds / catIds - ids to evaluate (empty lists until filled by the caller)
      iouThrs         - IoU thresholds, 0.50 to 0.95 in steps of 0.05
      recThrs         - recall thresholds, 0.00 to 1.00 in steps of 0.01
      maxDets         - detection budgets
      areaRng         - [min, max] area ranges, labelled by areaRngLbl
      useCats         - 1 to evaluate per category
      iouType         - 'segm', 'bbox' or 'keypoints'
      useSegm         - deprecated, None unless set by the caller
    '''
    def setDetParams(self):
        '''Set the defaults used for 'segm' and 'bbox' evaluation.'''
        self.vidIds = []
        self.catIds = []
        # np.arange causes trouble.  the data point on arange is slightly larger than the true value
        # np.linspace requires an integer sample count, hence the int(...)
        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
        self.maxDets = [1, 10, 100]
        self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 128 ** 2], [ 128 ** 2, 256 ** 2], [256 ** 2, 1e5 ** 2]]
        self.areaRngLbl = ['all', 'small', 'medium', 'large']
        self.useCats = 1

    def setKpParams(self):
        '''Set the defaults used for 'keypoints' evaluation.'''
        self.vidIds = []
        self.catIds = []
        # np.arange causes trouble.  the data point on arange is slightly larger than the true value
        # np.linspace requires an integer sample count, hence the int(...)
        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
        self.maxDets = [20]
        self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
        self.areaRngLbl = ['all', 'medium', 'large']
        self.useCats = 1

    def __init__(self, iouType='segm'):
        '''
        :param iouType: 'segm', 'bbox' or 'keypoints'
        :raises Exception: for any other iouType
        '''
        if iouType == 'segm' or iouType == 'bbox':
            self.setDetParams()
        elif iouType == 'keypoints':
            self.setKpParams()
        else:
            raise Exception('iouType not supported')
        self.iouType = iouType
        # useSegm is deprecated
        self.useSegm = None
================================================
FILE: mask2former_video/data_video/ytvis_eval.py
================================================
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import contextlib
import copy
import io
import itertools
import json
import logging
import numpy as np
import os
from collections import OrderedDict
import pycocotools.mask as mask_util
import torch
from .datasets.ytvis_api.ytvos import YTVOS
from .datasets.ytvis_api.ytvoseval import YTVOSeval
from tabulate import tabulate
import detectron2.utils.comm as comm
from detectron2.config import CfgNode
from detectron2.data import MetadataCatalog
from detectron2.evaluation import DatasetEvaluator
from detectron2.utils.file_io import PathManager
from detectron2.utils.logger import create_small_table
class YTVISEvaluator(DatasetEvaluator):
    """
    Evaluate video instance segmentation predictions with the YTVOS /
    YTVOSeval API, which reports COCO-style AP and AR metrics.

    Predictions are collected per video through :meth:`process` and scored
    against the annotation file referenced by the dataset metadata in
    :meth:`evaluate`.
    """

    def __init__(
        self,
        dataset_name,
        tasks=None,
        distributed=True,
        output_dir=None,
        *,
        use_fast_impl=True,
    ):
        """
        Args:
            dataset_name (str): name of the dataset to be evaluated. Its
                metadata must provide "json_file", the path to the
                annotation file that is loaded through YTVOS.
            tasks: stored on the instance (replaced by None, with a warning,
                when a CfgNode is passed). Not consulted elsewhere in this
                class.
            distributed (bool): if True, will collect results from all ranks
                and run evaluation in the main process.
                Otherwise, will only evaluate the results in the current process.
            output_dir (str): optional, an output directory to dump all
                results predicted on the dataset. The dump contains two files:

                1. "instances_predictions.pth" a file in torch serialization
                   format that contains all the raw original predictions.
                2. "results.json" a json file with the predictions after the
                   category ids have been mapped back to dataset ids.
            use_fast_impl (bool): stored on the instance. Not consulted
                elsewhere in this class.
        """
        self._logger = logging.getLogger(__name__)
        self._distributed = distributed
        self._output_dir = output_dir
        self._use_fast_impl = use_fast_impl

        if tasks is not None and isinstance(tasks, CfgNode):
            self._logger.warning(
                "COCO Evaluator instantiated using config, this is deprecated behavior."
                " Please pass in explicit arguments instead."
            )
            self._tasks = None  # Infering it from predictions should be better
        else:
            self._tasks = tasks

        self._cpu_device = torch.device("cpu")

        self._metadata = MetadataCatalog.get(dataset_name)

        json_file = PathManager.get_local_path(self._metadata.json_file)
        # YTVOS prints while loading; keep that out of the console
        with contextlib.redirect_stdout(io.StringIO()):
            self._ytvis_api = YTVOS(json_file)

        # Test set json files do not contain annotations (evaluation must be
        # performed using the evaluation server).
        self._do_evaluation = "annotations" in self._ytvis_api.dataset

        # Start with an empty buffer so process() also works before reset().
        self._predictions = []

    def reset(self):
        """Discard all predictions collected so far."""
        self._predictions = []

    def process(self, inputs, outputs):
        """
        Convert the predictions of one video and append them to the buffer.

        Args:
            inputs: the inputs to the model. A list with exactly one dict
                that provides "video_id".
            outputs: the outputs of the model. A dict providing
                "pred_scores", "pred_labels" and "pred_masks".
        """
        prediction = instances_to_coco_json_video(inputs, outputs)
        self._predictions.extend(prediction)

    def evaluate(self):
        """
        Gather the collected predictions and compute the metrics.

        Returns:
            dict: a copy of the results ({"segm": {metric: score}} when
            annotations are available). Empty on non-main processes in
            distributed mode and when no predictions were received.
        """
        if self._distributed:
            comm.synchronize()
            predictions = comm.gather(self._predictions, dst=0)
            predictions = list(itertools.chain(*predictions))

            if not comm.is_main_process():
                return {}
        else:
            predictions = self._predictions

        if len(predictions) == 0:
            self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(predictions, f)

        self._results = OrderedDict()
        self._eval_predictions(predictions)
        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)

    def _eval_predictions(self, predictions):
        """
        Evaluate predictions. Fill self._results with the metrics of the tasks.

        NOTE: the "category_id" of every prediction dict is rewritten in
        place from contiguous ids to dataset ids.
        """
        self._logger.info("Preparing results for YTVIS format ...")

        # unmap the category ids
        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
            dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
            all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
            num_classes = len(all_contiguous_ids)
            assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1

            reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
            for result in predictions:
                category_id = result["category_id"]
                assert category_id < num_classes, (
                    f"A prediction has class={category_id}, "
                    f"but the dataset only has {num_classes} classes and "
                    f"predicted class id should be in [0, {num_classes - 1}]."
                )
                result["category_id"] = reverse_id_mapping[category_id]

        if self._output_dir:
            file_path = os.path.join(self._output_dir, "results.json")
            self._logger.info("Saving results to {}".format(file_path))
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(predictions))
                f.flush()

        if not self._do_evaluation:
            self._logger.info("Annotations are not available for evaluation.")
            return

        coco_eval = (
            _evaluate_predictions_on_coco(
                self._ytvis_api,
                predictions,
            )
            if len(predictions) > 0
            else None  # cocoapi does not handle empty results very well
        )

        res = self._derive_coco_results(
            coco_eval, class_names=self._metadata.get("thing_classes")
        )
        self._results["segm"] = res

    def _derive_coco_results(self, coco_eval, class_names=None):
        """
        Derive the desired score numbers from a summarized evaluator.

        Args:
            coco_eval (None or YTVOSeval): None represents no predictions from model.
            class_names (None or list[str]): if provided, will use it to predict
                per-category AP.

        Returns:
            a dict of {metric name: score}
        """
        # order follows the first eight entries of coco_eval.stats
        metrics = ["AP", "AP50", "AP75", "APs", "APm", "APl", "AR1", "AR10"]

        if coco_eval is None:
            # Logger.warn is a deprecated alias of Logger.warning
            self._logger.warning("No predictions from the model!")
            return {metric: float("nan") for metric in metrics}

        # the standard metrics; negative stats mean "not computable"
        results = {
            metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
            for idx, metric in enumerate(metrics)
        }
        self._logger.info(
            "Evaluation results for {}: \n".format("segm") + create_small_table(results)
        )
        if not np.isfinite(sum(results.values())):
            self._logger.info("Some metrics cannot be computed and is shown as NaN.")

        if class_names is None or len(class_names) <= 1:
            return results
        # Compute per-category AP
        # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
        precisions = coco_eval.eval["precision"]
        # precision has dims (iou, recall, cls, area range, max dets)
        assert len(class_names) == precisions.shape[2]

        results_per_category = []
        for idx, name in enumerate(class_names):
            # area range index 0: all area ranges
            # max dets index -1: the largest detection budget
            precision = precisions[:, :, idx, 0, -1]
            precision = precision[precision > -1]
            ap = np.mean(precision) if precision.size else float("nan")
            results_per_category.append(("{}".format(name), float(ap * 100)))

        # tabulate it
        N_COLS = min(6, len(results_per_category) * 2)
        results_flatten = list(itertools.chain(*results_per_category))
        results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
        table = tabulate(
            results_2d,
            tablefmt="pipe",
            floatfmt=".3f",
            headers=["category", "AP"] * (N_COLS // 2),
            numalign="left",
        )
        self._logger.info("Per-category {} AP: \n".format("segm") + table)

        results.update({"AP-" + name: ap for name, ap in results_per_category})
        return results
def instances_to_coco_json_video(inputs, outputs):
    """
    Convert the predictions for one video into YTVIS-style result dicts.

    Args:
        inputs (list[dict]): must contain exactly one element; its
            "video_id" is copied into every result.
        outputs (dict): must provide "pred_scores", "pred_labels" and
            "pred_masks", which are iterated in lockstep (one entry per
            predicted instance). Each "pred_masks" entry is iterated frame
            by frame; every frame is indexed as ``mask[:, :, None]``, so it
            is assumed to be a 2-D mask -- TODO confirm against the model
            output.

    Returns:
        list[dict]: one dict per instance with the keys "video_id", "score",
        "category_id" and "segmentations" (one RLE per frame, with "counts"
        decoded to str so that the result can be dumped as json).
    """
    assert len(inputs) == 1, "More than one inputs are loaded for inference!"

    video_id = inputs[0]["video_id"]

    scores = outputs["pred_scores"]
    labels = outputs["pred_labels"]
    masks = outputs["pred_masks"]

    ytvis_results = []
    for score, label, frame_masks in zip(scores, labels, masks):
        segms = [
            mask_util.encode(np.array(frame_mask[:, :, None], order="F", dtype="uint8"))[0]
            for frame_mask in frame_masks
        ]
        for rle in segms:
            # bytes are not json serializable
            rle["counts"] = rle["counts"].decode("utf-8")

        ytvis_results.append(
            {
                "video_id": video_id,
                "score": score,
                "category_id": label,
                "segmentations": segms,
            }
        )

    return ytvis_results
def _evaluate_predictions_on_coco(
    coco_gt,
    coco_results,
    img_ids=None,
):
    """
    Evaluate the results with YTVOSeval and return the summarized evaluator.

    Args:
        coco_gt: ground-truth API object; must provide ``loadRes``.
        coco_results (list[dict]): non-empty list of result dicts. A deep
            copy is evaluated, so the caller's list is left untouched.
        img_ids: optional list of video ids to restrict the evaluation to.
            The parameter keeps its historical name; the ids are written to
            ``params.vidIds``, which is the field the evaluator reads.

    Returns:
        YTVOSeval: the evaluator after evaluate(), accumulate() and summarize().
    """
    assert len(coco_results) > 0

    coco_results = copy.deepcopy(coco_results)
    # When evaluating mask AP, if the results contain bbox, cocoapi will
    # use the box area as the area of the instance, instead of the mask area.
    # This leads to a different definition of small/medium/large.
    # We remove the bbox field to let mask AP use mask area.
    for c in coco_results:
        c.pop("bbox", None)

    coco_dt = coco_gt.loadRes(coco_results)
    coco_eval = YTVOSeval(coco_gt, coco_dt)
    # For COCO, the default max_dets_per_image is [1, 10, 100].
    max_dets_per_image = [1, 10, 100]  # Default from COCOEval
    coco_eval.params.maxDets = max_dets_per_image

    if img_ids is not None:
        # The evaluator selects videos through params.vidIds; an `imgIds`
        # attribute is never read, so setting it had no effect.
        coco_eval.params.vidIds = img_ids

    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

    return coco_eval
================================================
FILE: mask2former_video/modeling/__init__.py
================================================
from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder
================================================
FILE: mask2former_video/modeling/criterion.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py
import logging
import torch
import torch.nn.functional as F
from torch import nn
from detectron2.utils.comm import get_world_size
from detectron2.projects.point_rend.point_features import (
get_uncertain_point_coords_with_randomness,
point_sample,
)
from mask2former.utils.misc import is_dist_avail_and_initialized
import random
import cv2
import os
def unfold_wo_center(x, kernel_size, dilation):
    """
    Gather, for every position of a 4-D tensor, the values of its dilated
    kernel_size x kernel_size neighbourhood with the centre tap removed.

    Returns a tensor of shape
    (x.size(0), x.size(1), kernel_size ** 2 - 1, x.size(2), x.size(3)).
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    dim0, dim1, dim2, dim3 = x.size(0), x.size(1), x.size(2), x.size(3)

    # using SAME padding, so the spatial size is preserved
    same_padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    patches = F.unfold(x, kernel_size=kernel_size, padding=same_padding, dilation=dilation)
    patches = patches.reshape(dim0, dim1, -1, dim2, dim3)

    # remove the center pixels
    num_taps = kernel_size ** 2
    center = num_taps // 2
    leading = patches.narrow(2, 0, center)
    trailing = patches.narrow(2, center + 1, num_taps - center - 1)
    return torch.cat((leading, trailing), dim=2)
def unfold_w_center(x, kernel_size, dilation):
    """
    Gather, for every position of a 4-D tensor, the values of its dilated
    kernel_size x kernel_size neighbourhood, centre tap included.

    Returns a tensor of shape
    (x.size(0), x.size(1), kernel_size ** 2, x.size(2), x.size(3)).
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    dim0, dim1, dim2, dim3 = x.size(0), x.size(1), x.size(2), x.size(3)

    # using SAME padding, so the spatial size is preserved
    same_padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    patches = F.unfold(x, kernel_size=kernel_size, padding=same_padding, dilation=dilation)
    return patches.reshape(dim0, dim1, -1, dim2, dim3)
def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation):
    """
    Negative log-probability that a position and each of its neighbours
    receive the same foreground/background prediction.

    mask_logits must be 4-D. The neighbourhood is the dilated
    pairwise_size x pairwise_size window without its centre; the result for
    index 0 along dim 1 is returned.
    """
    assert mask_logits.dim() == 4

    log_fg = F.logsigmoid(mask_logits)
    log_bg = F.logsigmoid(-mask_logits)

    window = dict(kernel_size=pairwise_size, dilation=pairwise_dilation)
    log_fg_neighbours = unfold_wo_center(log_fg, **window)
    log_bg_neighbours = unfold_wo_center(log_bg, **window)

    # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j)
    # evaluated in log space (log-sum-exp with the maximum factored out)
    # to avoid numerical instability
    both_fg = log_fg[:, :, None] + log_fg_neighbours
    both_bg = log_bg[:, :, None] + log_bg_neighbours

    peak = torch.max(both_fg, both_bg)
    log_agreement = torch.log(
        torch.exp(both_fg - peak) + torch.exp(both_bg - peak)
    ) + peak

    # loss = -log(prob)
    return -log_agreement[:, 0]
def compute_pairwise_term_neighbor(mask_logits, mask_logits_neighbor, pairwise_size, pairwise_dilation):
    """
    Negative log-probability that a position of mask_logits_neighbor and each
    position in the corresponding window of mask_logits receive the same
    foreground/background prediction.

    mask_logits must be 4-D. The window is the dilated
    pairwise_size x pairwise_size neighbourhood including its centre; the
    result for index 0 along dim 1 is returned.
    """
    assert mask_logits.dim() == 4

    log_fg_other = F.logsigmoid(mask_logits_neighbor)
    log_bg_other = F.logsigmoid(-mask_logits_neighbor)

    log_fg = F.logsigmoid(mask_logits)
    log_bg = F.logsigmoid(-mask_logits)

    window = dict(kernel_size=pairwise_size, dilation=pairwise_dilation)
    log_fg_window = unfold_w_center(log_fg, **window)
    log_bg_window = unfold_w_center(log_bg, **window)

    # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j)
    # evaluated in log space (log-sum-exp with the maximum factored out)
    # to avoid numerical instability
    both_fg = log_fg_other[:, :, None] + log_fg_window
    both_bg = log_bg_other[:, :, None] + log_bg_window

    peak = torch.max(both_fg, both_bg)
    log_agreement = torch.log(
        torch.exp(both_fg - peak) + torch.exp(both_bg - peak)
    ) + peak

    # loss = -log(prob)
    return -log_agreement[:, 0]
def dice_coefficient(x, target):
    """
    Per-instance dice loss: 1 - 2 * sum(x * target) / (sum(x^2) + sum(target^2) + eps).

    Both inputs are flattened to (x.size(0), -1); a tensor with one loss per
    row is returned.
    """
    eps = 1e-5
    n_inst = x.size(0)
    flat_x = x.reshape(n_inst, -1)
    flat_target = target.reshape(n_inst, -1)

    overlap = torch.sum(flat_x * flat_target, dim=1)
    # eps keeps the ratio finite when both inputs are all zero
    denominator = flat_x.pow(2.0).sum(dim=1) + flat_target.pow(2.0).sum(dim=1) + eps
    return 1. - (2 * overlap / denominator)
def compute_project_term(mask_scores, gt_bitmasks):
    """
    Projection loss: dice loss between the max-projections of the prediction
    and of the target along dim 2 and along dim 3, summed and averaged over
    the instances.
    """
    def _projected_dice(axis):
        # compare the two inputs after collapsing `axis` with a max
        return dice_coefficient(
            mask_scores.max(dim=axis, keepdim=True).values,
            gt_bitmasks.max(dim=axis, keepdim=True).values,
        )

    loss_dim2 = _projected_dice(2)
    loss_dim3 = _projected_dice(3)
    return (loss_dim3 + loss_dim2).mean()
def dice_loss(
        inputs: torch.Tensor,
        targets: torch.Tensor,
        num_masks: float,
    ):
    """
    Compute the DICE loss, similar to generalized IOU for masks
    Args:
        inputs: A float tensor of logits; the sigmoid is applied here and
                the result is flattened from dim 1 on.
        targets: A float tensor that broadcasts against the flattened inputs.
                 Stores the binary classification label for each element
                 (0 for the negative class and 1 for the positive class).
        num_masks: normalizer the summed loss is divided by.
    """
    probs = inputs.sigmoid().flatten(1)
    overlap = (probs * targets).sum(-1)
    total = probs.sum(-1) + targets.sum(-1)
    # the +1 terms smooth the ratio for empty masks
    per_mask = 1 - (2 * overlap + 1) / (total + 1)
    return per_mask.sum() / num_masks


dice_loss_jit = torch.jit.script(
    dice_loss
)  # type: torch.jit.ScriptModule
def sigmoid_ce_loss(
        inputs: torch.Tensor,
        targets: torch.Tensor,
        num_masks: float,
    ):
    """
    Binary cross-entropy on logits, averaged over dim 1 and normalized by num_masks.
    Args:
        inputs: A float tensor of logits.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
        num_masks: normalizer the summed loss is divided by.
    Returns:
        Loss tensor
    """
    elementwise = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    per_mask = elementwise.mean(1)
    return per_mask.sum() / num_masks


sigmoid_ce_loss_jit = torch.jit.script(
    sigmoid_ce_loss
)  # type: torch.jit.ScriptModule
def visualize_masks(masks, output_dir='masks'):
    """
    Debug helper: write every mask of a tensor with shape (N, H, W) to
    ``output_dir`` as ``mask_<i>.jpg`` and print simple statistics per mask.

    Each mask is multiplied by 255 and cast to uint8 before it is written.
    The directory is created when it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    # unpacking also enforces a 3-D input
    count, _height, _width = masks.shape
    frames = masks.cpu().numpy()
    for index in range(count):
        image = (frames[index] * 255).astype('uint8')
        print('mask sum', image.sum(), image.max(), image.min())
        target_path = os.path.join(output_dir, f'mask_{index}.jpg')
        cv2.imwrite(target_path, image)
def calculate_uncertainty(logits):
    """
    Estimate uncertainty as the negated L1 distance between 0.0 and the logit
    prediction, so that logits close to zero score highest.
    Args:
        logits (Tensor): A tensor of shape (R, 1, ...) holding logits; the
            size of dim 1 must be 1.
    Returns:
        scores (Tensor): A tensor with the shape of `logits` that contains
            uncertainty scores with the most uncertain locations having the
            highest uncertainty score.
    """
    assert logits.size(1) == 1
    # work on a copy, as the original implementation does
    return torch.neg(logits.clone().abs())
class VideoSetCriterion(nn.Module):
"""This class computes the loss for DETR.
The process happens in two steps:
1) we compute hungarian assignment between ground truth boxes and the outputs of the model
2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
"""
def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses,
             num_points, oversample_ratio, importance_sample_ratio):
    """Create the criterion.
    Parameters:
        num_classes: number of object categories, omitting the special no-object category
        matcher: module able to compute a matching between targets and proposals
        weight_dict: dict containing as key the names of the losses and as values their relative weight.
        eos_coef: relative classification weight applied to the no-object category
        losses: list of all the losses to be applied. See get_loss for list of available losses.
        num_points, oversample_ratio, importance_sample_ratio: settings of the
            point sampling used by the mask loss.
    """
    super().__init__()
    self.num_classes = num_classes
    self.matcher = matcher
    self.weight_dict = weight_dict
    self.eos_coef = eos_coef
    self.losses = losses

    # per-class weights for the classification loss; the extra last entry
    # belongs to the no-object class
    class_weights = torch.ones(self.num_classes + 1)
    class_weights[-1] = self.eos_coef
    self.register_buffer("empty_weight", class_weights)

    # pointwise mask loss parameters
    self.num_points = num_points
    self.oversample_ratio = oversample_ratio
    self.importance_sample_ratio = importance_sample_ratio

    # NOTE(review): presumably a warm-up schedule for the additional loss
    # terms; the code that consumes these is outside this block -- confirm.
    self._warmup_iters = 2000
    self.register_buffer("_iter", torch.zeros([1]))
def loss_labels(self, outputs, targets, indices, num_masks):
"""Classification loss (NLL)
targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
"""
assert "pred_logits" in outputs
src_logits = outputs["pred_logits"].float()
idx = self._get_src_permutation_idx(indices)
target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
target_classes = torch.full(
src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
)
target_classes[idx] = target_classes_o
loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
losses = {"loss_ce": loss_ce}
return losses
def loss_masks(self, outputs, targets, indices, num_masks):
"""Compute the losses related to the masks: the focal loss and the dice loss.
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
"""
assert "pred_masks" in outputs
src_idx = self._get_src_permutation_idx(indices)
src_masks = outputs["pred_masks"]
src_masks = src_masks[src_idx]
# Modified to handle video
target_masks = torch.cat([t['masks'][i] for t, (_, i) in zip(targets, indices)]).to(src_masks)
# No need to upsample predictions as we are using normalized coordinates :)
# NT x 1 x H x W
src_masks = src_masks.flatten(0, 1)[:, None]
target_masks = target_masks.flatten(0, 1)[:, None]
# print('src_masks shape:', src_masks.shape)
# print('target_masks shape:', target_masks.shape)
with torch.no_grad():
# sample point_coords
point_coords = get_uncertain_point_coords_with_randomness(
src_masks,
lambda logits: calculate_uncertainty(logits),
self.num_points,
self.oversample_ratio,
self.importance_sample_ratio,
)
# get gt labels
point_labels = point_sample(
target_masks,
point_coords,
align_corners=False,
).squeeze(1)
point_logits = point_sample(
src_masks,
point_coords,
align_corners=False,
).squeeze(1)
losses = {
"loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks),
"loss_mask_proj": src_masks.sum() * 0.,
"loss_dice": dice_loss_jit(point_logits, point_labels, num_masks),
"loss_bound": src_masks.sum() * 0.,
"loss_bound_neighbor": src_masks.sum() * 0.,
}
del src_masks
del target_masks
return losses
def topk_mask(self, images_lab_sim):
images_lab_sim_mask = torch.zeros_like(images_lab_sim)
topk, indices = torch.topk(images_lab_sim, 5, dim =1)
images_lab_sim_mask = images_lab_sim_mask.scatter(1, indices, topk)
return images_lab_sim_mask
def loss_masks_proj(self, outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2):
"""Compute the losses related to the masks: the focal loss and the dice loss.
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
"""
assert "pred_masks" in outputs
self._iter += 1
# print('images_lab_sim is None:', (images_lab_sim is None))
if images_lab_sim is None:
return self.loss_masks(outputs, targets, indices, num_masks)
src_idx = self._get_src_permutation_idx(indices)
src_masks = outputs["pred_masks"]
src_masks = src_masks[src_idx]
# Modified to handle video
target_masks = torch.cat([t['masks'][i] for t, (_, i) in zip(targets, indices)]).to(src_masks)
images_lab_sim = torch.cat(images_lab_sim, dim =0)
images_lab_sim_nei = torch.cat(images_lab_sim_nei, dim=0)
images_lab_sim_nei1 = torch.cat(images_lab_sim_nei1, dim=0)
images_lab_sim_nei2 = torch.cat(images_lab_sim_nei2, dim=0)
images_lab_sim = images_lab_sim.view(-1, target_masks.shape[1], images_lab_sim.shape[-3], images_lab_sim.shape[-2], images_lab_sim.shape[-1])
images_lab_sim_nei = images_lab_sim_nei.unsqueeze(1)
images_lab_sim_nei1 = images_lab_sim_nei1.unsqueeze(1)
images_lab_sim_nei2 = images_lab_sim_nei2.unsqueeze(1)
if len(src_idx[0].tolist()) > 0:
images_lab_sim = torch.cat([images_lab_sim[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1)
images_lab_sim_nei = self.topk_mask(torch.cat([images_lab_sim_nei[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1))
images_lab_sim_nei1 = self.topk_mask(torch.cat([images_lab_sim_nei1[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1))
images_lab_sim_nei2 = self.topk_mask(torch.cat([images_lab_sim_nei2[ind][None] for ind in src_idx[0].tolist()]).flatten(0, 1))
k_size = 3
if src_masks.shape[0] > 0:
pairwise_losses_neighbor = compute_pairwise_term_neighbor(
src_masks[:,:1], src_masks[:,1:2], k_size, 3
)
pairwise_losses_neighbor1 = compute_pairwise_term_neighbor(
src_masks[:,:1], src_masks[:,2:3], k_size, 3
)
pairwise_losses_neighbor2 = compute_pairwise_term_neighbor(
src_masks[:,1:2], src_masks[:,2:3], k_size, 3
)
src_masks = src_masks.flatten(0, 1)[:, None]
target_masks = target_masks.flatten(0, 1)[:, None]
target_masks = F.interpolate(target_masks, (src_masks.shape[-2], src_masks.shape[-1]), mode='bilinear')
if src_masks.shape[0] > 0:
loss_prj_term = compute_project_term(src_masks.sigmoid(), target_masks)
pairwise_losses = compute_pairwise_term(
src_masks, 3, 2
)
weights = (images_lab_sim >= 0.3).float() * target_masks.float()
target_masks_sum = target_masks.reshape(pairwise_losses_neighbor.shape[0], 3, target_masks.shape[-2], target_masks.shape[-1]).sum(dim=1, keepdim=True)
target_masks_sum = (target_masks_sum >= 1.0).float()
weights_neighbor = (images_lab_sim_nei >= 0.05).float() * target_masks_sum # ori is 0.5, 0.01, 0.001, 0.005, 0.0001, 0.02, 0.05, 0.075, 0.1 , dy 0.5
weights_neighbor1 = (images_lab_sim_nei1 >= 0.05).float() * target_masks_sum # ori is 0.5, 0.01, 0.001, 0.005, 0.0001, 0.02, 0.05, 0.075, 0.1, dy 0.5
weights_neighbor2 = (images_lab_sim_nei2 >= 0.05).float() * target_masks_sum # ori is 0.5, 0.01, 0.001, 0.005, 0.0001, 0.02, 0.05, 0.075, 0.1, dy 0.5
warmup_factor = min(self._iter.item() / float(self._warmup_iters), 1.0) #1.0
loss_pairwise = (pairwise_losses * weights).sum() / weights.sum().clamp(min=1.0)
loss_pairwise_neighbor = (pairwise_losses_neighbor * weights_neighbor).sum() / weights_neighbor.sum().clamp(min=1.0) * warmup_factor
loss_pairwise_neighbor1 = (pairwise_losses_neighbor1 * weights_neighbor1).sum() / weights_neighbor1.sum().clamp(min=1.0) * warmup_factor
loss_pairwise_neighbor2 = (pairwise_losses_neighbor2 * weights_neighbor2).sum() / weights_neighbor2.sum().clamp(min=1.0) * warmup_factor
else:
loss_prj_term = src_masks.sum() * 0.
loss_pairwise = src_masks.sum() * 0.
loss_pairwise_neighbor = src_masks.sum() * 0.
loss_pairwise_neighbor1 = src_masks.sum() * 0.
loss_pairwise_neighbor2 = src_masks.sum() * 0.
losses = {
"loss_mask": src_masks.sum() * 0.,
"loss_mask_proj": loss_prj_term,
"loss_dice": src_masks.sum() * 0.,
"loss_bound": loss_pairwise,
"loss_bound_neighbor": (loss_pairwise_neighbor + loss_pairwise_neighbor1 + loss_pairwise_neighbor2) * 0.1,
}
del src_masks
del target_masks
return losses
def _get_src_permutation_idx(self, indices):
# permute predictions following indices
batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
src_idx = torch.cat([src for (src, _) in indices])
return batch_idx, src_idx
def _get_tgt_permutation_idx(self, indices):
# permute targets following indices
batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
tgt_idx = torch.cat([tgt for (_, tgt) in indices])
return batch_idx, tgt_idx
def get_loss(self, loss, outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2):
loss_map = {
'labels': self.loss_labels,
'masks': self.loss_masks_proj,
}
assert loss in loss_map, f"do you really want to compute {loss} loss?"
if loss == 'masks':
return loss_map[loss](outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2)
else:
return loss_map[loss](outputs, targets, indices, num_masks)
def forward(self, outputs, targets, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2):
"""This performs the loss computation.
Parameters:
outputs: dict of tensors, see the output specification of the model for the format
targets: list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc
"""
outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
# Retrieve the matching between the outputs of the last layer and the targets
indices = self.matcher(outputs_without_aux, targets)
# Compute the average number of target boxes accross all nodes, for normalization purposes
num_masks = sum(len(t["labels"]) for t in targets)
num_masks = torch.as_tensor(
[num_masks], dtype=torch.float, device=next(iter(outputs.values())).device
)
if is_dist_avail_and_initialized():
torch.distributed.all_reduce(num_masks)
num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()
# Compute all the requested losses
losses = {}
for loss in self.losses:
losses.update(self.get_loss(loss, outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2))
# In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
if "aux_outputs" in outputs:
for i, aux_outputs in enumerate(outputs["aux_outputs"]):
indices = self.matcher(aux_outputs, targets)
for loss in self.losses:
l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
return losses
def __repr__(self):
head = "Criterion " + self.__class__.__name__
body = [
"matcher: {}".format(self.matcher.__repr__(_repr_indent=8)),
"losses: {}".format(self.losses),
"weight_dict: {}".format(self.weight_dict),
"num_classes: {}".format(self.num_classes),
"eos_coef: {}".format(self.eos_coef),
"num_points: {}".format(self.num_points),
"oversample_ratio: {}".format(self.oversample_ratio),
"importance_sample_ratio: {}".format(self.importance_sample_ratio),
]
_repr_indent = 4
lines = [head] + [" " * _repr_indent + line for line in body]
return "\n".join(lines)
================================================
FILE: mask2former_video/modeling/matcher.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
"""
Modules to compute the matching cost and solve the corresponding LSAP.
"""
import torch
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from torch import nn
from torch.cuda.amp import autocast
from detectron2.projects.point_rend.point_features import point_sample
import cv2
import os
# def visualize_masks(masks, output_dir='masks_new'):
# """
# Visualize binary mask tensor with shape (N, H, W) and save them as PNG images in the output directory.
# """
# os.makedirs(output_dir, exist_ok=True)
# masks = masks.flatten(0, 1)
# print('masks shape:', masks.shape)
# n, h, w = masks.shape
# for i in range(n):
# mask = masks[i].cpu().numpy()
# mask = (mask * 255).astype('uint8')
# # mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
# filename = os.path.join(output_dir, f'mask_{i}.png')
# cv2.imwrite(filename, mask)
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
    """
    Fill the axis-aligned bounding box of every non-empty mask with ones.

    The first two dimensions are merged and each resulting 2-D slice is
    processed on its own; slices without any non-zero entry are left untouched.

    NOTE: the boxes are written into the flattened tensor in place. When
    ``flatten`` returns a view (e.g. for a contiguous input) the tensor passed
    by the caller is therefore modified as well.

    Args:
        masks (Tensor[N, T, H, W]): masks to convert; non-zero entries are
            treated as foreground.

    Returns:
        Tensor[N, T, H, W]: the masks with each bounding box filled by 1.0.
        An empty input is returned unchanged.
    """
    if masks.numel() == 0:
        return masks

    num_instances = masks.shape[0]
    frames = masks.flatten(0, 1)

    for frame_id, frame in enumerate(frames):
        rows, cols = torch.where(frame != 0)
        if len(cols) * len(rows) != 0:
            top, bottom = torch.min(rows), torch.max(rows) + 1
            left, right = torch.min(cols), torch.max(cols) + 1
            frames[frame_id, top:bottom, left:right] = 1.0

    return frames.view(num_instances, -1, frames.shape[-2], frames.shape[-1])
def masks_to_boxes_new(masks: torch.Tensor) -> torch.Tensor:
    """
    Vectorised conversion of masks into filled bounding-box masks.

    For every 2-D slice the tightest axis-aligned box around its foreground is
    computed and a new mask is returned that is 1.0 inside that box and 0.0
    elsewhere. Slices without foreground yield an all-zero mask. The input is
    not modified.

    Args:
        masks (Tensor[N, T, H, W]): masks to convert. Box extents are derived
            from ``masks * coordinate``, so the values are assumed to be
            binary (0/1) -- TODO confirm for callers passing other values.

    Returns:
        Tensor[N, T, H, W]: float box masks. An empty input is returned
        unchanged.
    """
    if masks.numel() == 0:
        return masks

    n, _, h, w = masks.shape
    masks = masks.flatten(0, 1)
    foreground = masks.bool()

    rows = torch.arange(0, h, dtype=torch.float).to(masks.device)
    cols = torch.arange(0, w, dtype=torch.float).to(masks.device)

    # Broadcasting the coordinate vectors replaces an explicit meshgrid.
    x_mask = masks * cols.view(1, 1, w)
    x_max = x_mask.flatten(1).max(-1)[0] + 1
    # Background is pushed to a huge value so it never wins the minimum.
    x_min = x_mask.masked_fill(~foreground, 1e8).flatten(1).min(-1)[0]

    y_mask = masks * rows.view(1, h, 1)
    y_max = y_mask.flatten(1).max(-1)[0] + 1
    y_min = y_mask.masked_fill(~foreground, 1e8).flatten(1).min(-1)[0]

    # Half-open boxes (x_min, y_min, x_max, y_max), one row per slice.
    boxes = torch.stack([x_min, y_min, x_max, y_max], 1)

    row_ids = torch.arange(h).unsqueeze(0).to(boxes)
    col_ids = torch.arange(w).unsqueeze(0).to(boxes)

    # True for rows/columns lying outside the box of the respective slice.
    hMask = torch.logical_or(row_ids < boxes[:, 1, None], row_ids >= boxes[:, 3, None])
    wMask = torch.logical_or(col_ids < boxes[:, 0, None], col_ids >= boxes[:, 2, None])

    # A pixel is outside as soon as its row or its column is outside.
    outside = torch.logical_or(hMask.unsqueeze(2), wMask.unsqueeze(1)).float()

    return 1.0 - outside.view(n, -1, masks.shape[-2], masks.shape[-1])
def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
    """
    Pairwise DICE cost between predictions (given as logits) and targets.

    Args:
        inputs: A float tensor holding one prediction per row (logits); it is
                passed through a sigmoid and flattened from dim 1 on.
        targets: A float tensor with one binary label vector per row
                 (0 for the negative class and 1 for the positive class).
    Returns:
        Tensor of shape (num_inputs, num_targets) with the cost of every
        prediction/target pair.
    """
    probs = inputs.sigmoid().flatten(1)
    overlap = 2 * torch.einsum("nc,mc->nm", probs, targets)
    total = probs.sum(-1).unsqueeze(1) + targets.sum(-1).unsqueeze(0)
    # The +1 terms smooth the ratio for (nearly) empty masks.
    return 1 - (overlap + 1) / (total + 1)
def batch_dice_loss_nosig(inputs: torch.Tensor, targets: torch.Tensor):
    """
    Pairwise DICE cost for predictions that are used as given.

    Unlike ``batch_dice_loss`` no sigmoid is applied to ``inputs``.

    Args:
        inputs: A float tensor holding one prediction per row; it is flattened
                from dim 1 on.
        targets: A float tensor with one binary label vector per row
                 (0 for the negative class and 1 for the positive class).
    Returns:
        Tensor of shape (num_inputs, num_targets) with the cost of every
        prediction/target pair.
    """
    flat = inputs.flatten(1)
    overlap = 2 * torch.einsum("nc,mc->nm", flat, targets)
    total = flat.sum(-1).unsqueeze(1) + targets.sum(-1).unsqueeze(0)
    # The +1 terms smooth the ratio for (nearly) empty masks.
    return 1 - (overlap + 1) / (total + 1)
# TorchScript-compiled versions of the two pairwise dice costs.
batch_dice_loss_jit = torch.jit.script(batch_dice_loss)  # type: torch.jit.ScriptModule
batch_dice_loss_jit_nosig = torch.jit.script(batch_dice_loss_nosig)  # type: torch.jit.ScriptModule
def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
    """
    Pairwise binary cross-entropy cost between logits and targets.

    Args:
        inputs: A float tensor of shape (num_inputs, num_points) holding logits.
        targets: A float tensor of shape (num_targets, num_points) with binary
                 labels (0 for the negative class and 1 for the positive class).
    Returns:
        Tensor of shape (num_inputs, num_targets): the cross-entropy of every
        prediction/target pair, averaged over the points.
    """
    num_points = inputs.shape[1]

    # Per-point cost of each logit if the label were 1, respectively 0.
    cost_if_pos = F.binary_cross_entropy_with_logits(
        inputs, torch.ones_like(inputs), reduction="none"
    )
    cost_if_neg = F.binary_cross_entropy_with_logits(
        inputs, torch.zeros_like(inputs), reduction="none"
    )

    # The targets pick, per point, which of the two costs applies.
    pos_term = torch.einsum("nc,mc->nm", cost_if_pos, targets)
    neg_term = torch.einsum("nc,mc->nm", cost_if_neg, (1 - targets))

    return (pos_term + neg_term) / num_points
def batch_sigmoid_ce_loss_nosig(inputs: torch.Tensor, targets: torch.Tensor):
    """
    Pairwise binary cross-entropy cost for inputs given as probabilities.

    Unlike ``batch_sigmoid_ce_loss`` the inputs are fed to
    ``F.binary_cross_entropy`` directly, i.e. without a sigmoid.

    Args:
        inputs: A float tensor of shape (num_inputs, num_points) with
                probabilities.
        targets: A float tensor of shape (num_targets, num_points) with binary
                 labels (0 for the negative class and 1 for the positive class).
    Returns:
        Tensor of shape (num_inputs, num_targets): the cross-entropy of every
        prediction/target pair, averaged over the points.
    """
    num_points = inputs.shape[1]

    # Per-point cost of each probability if the label were 1, respectively 0.
    cost_if_pos = F.binary_cross_entropy(
        inputs, torch.ones_like(inputs), reduction="none"
    )
    cost_if_neg = F.binary_cross_entropy(
        inputs, torch.zeros_like(inputs), reduction="none"
    )

    # The targets pick, per point, which of the two costs applies.
    pos_term = torch.einsum("nc,mc->nm", cost_if_pos, targets)
    neg_term = torch.einsum("nc,mc->nm", cost_if_neg, (1 - targets))

    return (pos_term + neg_term) / num_points
# TorchScript-compiled versions of the two pairwise cross-entropy costs.
batch_sigmoid_ce_loss_jit = torch.jit.script(batch_sigmoid_ce_loss)  # type: torch.jit.ScriptModule
batch_sigmoid_ce_loss_jit_nosig = torch.jit.script(batch_sigmoid_ce_loss_nosig)  # type: torch.jit.ScriptModule
class VideoHungarianMatcher(nn.Module):
    """Assigns predictions to targets by solving a linear sum assignment.

    The targets do not contain a no-object entry, so usually there are more
    predictions than targets. The best predictions are matched 1-to-1 and the
    remaining ones stay unmatched (and are thus treated as non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0):
        """Creates the matcher

        Params:
            cost_class: relative weight of the classification error in the matching cost
            cost_mask: relative weight of the mask cross-entropy in the matching cost
            cost_dice: relative weight of the dice loss of the mask in the matching cost
            num_points: number of random points at which masks are compared
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_mask = cost_mask
        self.cost_dice = cost_dice
        assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0"
        self.num_points = num_points

    @torch.no_grad()
    def memory_efficient_forward(self, outputs, targets):
        """Match one batch element at a time on randomly sampled points.

        When the predicted masks have exactly 3 entries along dim 1, both
        predictions and targets are first converted to filled bounding boxes
        and only the class and dice costs are used. Otherwise class, mask
        cross-entropy and dice costs are combined.
        """
        bs, num_queries = outputs["pred_logits"].shape[:2]

        matches = []
        for b in range(bs):
            out_prob = outputs["pred_logits"][b].softmax(-1)  # [num_queries, num_classes]
            tgt_ids = targets[b]["labels"]

            # Classification cost approximated by -proba[target class]; the
            # constant 1 of (1 - proba) does not affect the matching.
            cost_class = -out_prob[:, tgt_ids]

            out_mask = outputs["pred_masks"][b]  # [num_queries, T, H_pred, W_pred]
            is_ytvis = (out_mask.shape[1] == 3)

            if is_ytvis:
                # Binarise the prediction, then replace it by its box mask.
                out_mask = masks_to_boxes_new((out_mask.sigmoid() > 0.5).float()).float()

            # Targets were padded already when they were prepared.
            tgt_mask = targets[b]["masks"].to(out_mask)  # [num_gts, T, H_pred, W_pred]
            if is_ytvis:
                # masks_to_boxes fills the boxes in place, so the tensor held
                # in ``targets`` may be changed as well (later users see it).
                tgt_mask = masks_to_boxes(tgt_mask).float()

            # One shared set of points for all masks keeps matching cheap.
            point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device)
            tgt_mask = point_sample(
                tgt_mask,
                point_coords.repeat(tgt_mask.shape[0], 1, 1),
                align_corners=False,
            ).flatten(1)
            out_mask = point_sample(
                out_mask,
                point_coords.repeat(out_mask.shape[0], 1, 1),
                align_corners=False,
            ).flatten(1)

            with autocast(enabled=False):
                out_mask = out_mask.float()
                tgt_mask = tgt_mask.float()
                if is_ytvis:
                    # Box masks are already in [0, 1]: no sigmoid.
                    cost_dice_nosig = batch_dice_loss_jit_nosig(out_mask, tgt_mask)
                else:
                    cost_dice = batch_dice_loss_jit(out_mask, tgt_mask)
                    cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask)

            # Final cost matrix.
            if is_ytvis:
                C = (
                    self.cost_class * cost_class
                    + self.cost_dice * cost_dice_nosig
                )
            else:
                C = (
                    self.cost_mask * cost_mask
                    + self.cost_class * cost_class
                    + self.cost_dice * cost_dice
                )
            C = C.reshape(num_queries, -1).cpu()

            matches.append(linear_sum_assignment(C))

        return [
            (torch.as_tensor(pred_ids, dtype=torch.int64), torch.as_tensor(gt_ids, dtype=torch.int64))
            for pred_ids, gt_ids in matches
        ]

    @torch.no_grad()
    def forward(self, outputs, targets):
        """Performs the matching

        Params:
            outputs: dict with at least these entries:
                 "pred_logits": Tensor [batch_size, num_queries, num_classes] with the classification logits
                 "pred_masks": Tensor with the predicted masks, one [num_queries, T, H_pred, W_pred] block per batch element
            targets: list (len(targets) = batch_size) of dicts containing:
                 "labels": Tensor [num_target_boxes] with the class labels
                 "masks": Tensor with the target masks

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        return self.memory_efficient_forward(outputs, targets)

    def __repr__(self, _repr_indent=4):
        """Multi-line summary of the cost weights."""
        fields = [
            ("cost_class: {}", self.cost_class),
            ("cost_mask: {}", self.cost_mask),
            ("cost_dice: {}", self.cost_dice),
        ]
        lines = ["Matcher " + self.__class__.__name__]
        lines.extend(
            " " * _repr_indent + template.format(value) for template, value in fields
        )
        return "\n".join(lines)
================================================
FILE: mask2former_video/modeling/transformer_decoder/__init__.py
================================================
from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder
================================================
FILE: mask2former_video/modeling/transformer_decoder/position_encoding.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
"""
Various positional encodings for the transformer.
"""
import math
import torch
from torch import nn
class PositionEmbeddingSine3D(nn.Module):
    """
    Sinusoidal position embedding over time, height and width.

    Follows the fixed sine/cosine encoding of "Attention is all you need",
    applied to 5-D inputs laid out as (b, t, c, h, w).
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        """
        Args:
            num_pos_feats: channels used for each of the y and x encodings;
                the temporal encoding uses twice as many.
            temperature: base of the geometric frequency progression.
            normalize: if True, positions are rescaled to ``[0, scale]``.
            scale: only allowed together with ``normalize``; defaults to 2*pi.

        Raises:
            ValueError: if ``scale`` is passed while ``normalize`` is False.
        """
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, x, mask=None):
        """Return the position encoding for ``x``.

        Args:
            x: 5-D tensor (b, t, c, h, w); only its sizes and device are used.
            mask: optional bool tensor (b, t, h, w); True entries are skipped
                when positions are counted. Defaults to all False.

        Returns:
            Float tensor of shape (b, t, 2 * num_pos_feats, h, w).
        """
        assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead"

        if mask is None:
            mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool)
        valid = ~mask

        # Running count of valid positions along t, h and w (1-based).
        z_embed = valid.cumsum(1, dtype=torch.float32)
        y_embed = valid.cumsum(2, dtype=torch.float32)
        x_embed = valid.cumsum(3, dtype=torch.float32)

        if self.normalize:
            eps = 1e-6
            z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale
            y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale

        def divisors(count):
            # Consecutive channel pairs share one frequency.
            steps = torch.arange(count, dtype=torch.float32, device=x.device)
            return self.temperature ** (2 * (steps // 2) / count)

        def interleave(angles):
            # sin on even channels, cos on odd channels, interleaved again.
            waves = torch.stack((angles[..., 0::2].sin(), angles[..., 1::2].cos()), dim=5)
            return waves.flatten(4)

        dim_t = divisors(self.num_pos_feats)
        dim_t_z = divisors(self.num_pos_feats * 2)

        pos_x = interleave(x_embed[..., None] / dim_t)
        pos_y = interleave(y_embed[..., None] / dim_t)
        pos_z = interleave(z_embed[..., None] / dim_t_z)

        # Spatial encodings are concatenated, the temporal one is added on top.
        pos = torch.cat((pos_y, pos_x), dim=4) + pos_z
        return pos.permute(0, 1, 4, 2, 3)  # b, t, c, h, w
================================================
FILE: mask2former_video/modeling/transformer_decoder/video_mask2former_transformer_decoder.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
import logging
import fvcore.nn.weight_init as weight_init
from typing import Optional
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import Conv2d
from mask2former.modeling.transformer_decoder.maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY
from .position_encoding import PositionEmbeddingSine3D
class SelfAttentionLayer(nn.Module):
    """Multi-head self-attention with a residual connection and LayerNorm.

    ``normalize_before`` selects between pre-norm and post-norm.
    """

    def __init__(self, d_model, nhead, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        # Stored for interface parity; not used by the forward passes below.
        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add ``pos`` to ``tensor``; ``tensor`` is returned as is when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt,
                     tgt_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        """Post-norm: attention, residual add, then LayerNorm."""
        q = k = self.with_pos_embed(tgt, query_pos)
        attended, _ = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
                                     key_padding_mask=tgt_key_padding_mask)
        return self.norm(tgt + self.dropout(attended))

    def forward_pre(self, tgt,
                    tgt_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        """Pre-norm: LayerNorm, attention, then residual add."""
        normed = self.norm(tgt)
        q = k = self.with_pos_embed(normed, query_pos)
        attended, _ = self.self_attn(q, k, value=normed, attn_mask=tgt_mask,
                                     key_padding_mask=tgt_key_padding_mask)
        return tgt + self.dropout(attended)

    def forward(self, tgt,
                tgt_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Run the pre-norm or post-norm variant according to ``normalize_before``."""
        run = self.forward_pre if self.normalize_before else self.forward_post
        return run(tgt, tgt_mask, tgt_key_padding_mask, query_pos)
class CrossAttentionLayer(nn.Module):
    """Multi-head cross-attention with a residual connection and LayerNorm.

    Queries come from ``tgt``, keys and values from ``memory``;
    ``normalize_before`` selects between pre-norm and post-norm.
    """

    def __init__(self, d_model, nhead, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        # Stored for interface parity; not used by the forward passes below.
        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add ``pos`` to ``tensor``; ``tensor`` is returned as is when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt, memory,
                     memory_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        """Post-norm: attention, residual add, then LayerNorm."""
        attended, _ = self.multihead_attn(
            query=self.with_pos_embed(tgt, query_pos),
            key=self.with_pos_embed(memory, pos),
            value=memory, attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )
        return self.norm(tgt + self.dropout(attended))

    def forward_pre(self, tgt, memory,
                    memory_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        """Pre-norm: LayerNorm on the queries, attention, then residual add."""
        normed = self.norm(tgt)
        attended, _ = self.multihead_attn(
            query=self.with_pos_embed(normed, query_pos),
            key=self.with_pos_embed(memory, pos),
            value=memory, attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )
        return tgt + self.dropout(attended)

    def forward(self, tgt, memory,
                memory_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Run the pre-norm or post-norm variant according to ``normalize_before``."""
        run = self.forward_pre if self.normalize_before else self.forward_post
        return run(tgt, memory, memory_mask,
                   memory_key_padding_mask, pos, query_pos)
class FFNLayer(nn.Module):
    """Two-layer feed-forward block with a residual connection and LayerNorm.

    ``normalize_before`` selects between pre-norm and post-norm.
    """

    def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        # Feed-forward network: d_model -> dim_feedforward -> d_model.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm = nn.LayerNorm(d_model)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add ``pos`` to ``tensor``; ``tensor`` is returned as is when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def _feed_forward(self, features):
        """linear1 -> activation -> dropout -> linear2."""
        hidden = self.activation(self.linear1(features))
        return self.linear2(self.dropout(hidden))

    def forward_post(self, tgt):
        """Post-norm: feed-forward, residual add, then LayerNorm."""
        return self.norm(tgt + self.dropout(self._feed_forward(tgt)))

    def forward_pre(self, tgt):
        """Pre-norm: LayerNorm, feed-forward, then residual add."""
        return tgt + self.dropout(self._feed_forward(self.norm(tgt)))

    def forward(self, tgt):
        """Run the pre-norm or post-norm variant according to ``normalize_before``."""
        run = self.forward_pre if self.normalize_before else self.forward_post
        return run(tgt)
def _get_activation_fn(activation):
"""Return an activation function given a string"""
if activation == "relu":
return F.relu
if activation == "gelu":
return F.gelu
if activation == "glu":
return F.glu
raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
class MLP(nn.Module):
    """Plain multi-layer perceptron (FFN): linear layers with ReLU in between."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        """
        Args:
            input_dim: size of the input features.
            hidden_dim: size of every hidden layer.
            output_dim: size of the output features.
            num_layers: total number of linear layers.
        """
        super().__init__()
        self.num_layers = num_layers
        # Layer widths from input to output, e.g. [in, hid, hid, out].
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])
        )

    def forward(self, x):
        """Apply all layers; every layer except the last is followed by ReLU."""
        last = self.num_layers - 1
        for position, layer in enumerate(self.layers):
            x = layer(x)
            if position < last:
                x = F.relu(x)
        return x
@TRANSFORMER_DECODER_REGISTRY.register()
class VideoMultiScaleMaskedTransformerDecoder(nn.Module):
_version = 2
def _load_from_state_dict(
self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
version = local_metadata.get("version", None)
if version is None or version < 2:
# Do not warn if train from scratch
scratch = True
logger = logging.getLogger(__name__)
for k in list(state_dict.keys()):
newk = k
if "static_query" in k:
newk = k.replace("static_query", "query_feat")
if newk != k:
state_dict[newk] = state_dict[k]
del state_dict[k]
scratch = False
if not scratch:
logger.warning(
f"Weight format of {self.__class__.__name__} have changed! "
"Please upgrade your models. Applying automatic conversion now ..."
)
@configurable
def __init__(
    self,
    in_channels,
    mask_classification=True,
    *,
    num_classes: int,
    hidden_dim: int,
    num_queries: int,
    nheads: int,
    dim_feedforward: int,
    dec_layers: int,
    pre_norm: bool,
    mask_dim: int,
    enforce_input_project: bool,
    # video related
    num_frames,
):
    """
    Build the masked transformer decoder used for video clips.

    NOTE: this interface is experimental.
    Args:
        in_channels: channels of the input features
        mask_classification: whether to add mask classifier or not
            (must be truthy; asserted below)
        num_classes: number of classes
        hidden_dim: Transformer feature dimension
        num_queries: number of queries
        nheads: number of heads
        dim_feedforward: feature dimension in feedforward network
        dec_layers: number of Transformer decoder layers
        pre_norm: whether to use pre-LayerNorm or not
        mask_dim: mask feature dimension
        enforce_input_project: add input project 1x1 conv even if input
            channels and hidden dim is identical
        num_frames: stored as ``self.num_frames``; ``forward`` uses it to
            recover the batch size from the frame-flattened batch
    """
    super().__init__()
    assert mask_classification, "Only support mask classification model"
    self.mask_classification = mask_classification
    self.num_frames = num_frames
    # positional encoding
    N_steps = hidden_dim // 2
    self.pe_layer = PositionEmbeddingSine3D(N_steps, normalize=True)
    # define Transformer decoder here
    self.num_heads = nheads
    self.num_layers = dec_layers
    self.transformer_self_attention_layers = nn.ModuleList()
    self.transformer_cross_attention_layers = nn.ModuleList()
    self.transformer_ffn_layers = nn.ModuleList()
    # One (self-attention, cross-attention, FFN) triple per decoder layer; dropout is fixed to 0.
    for _ in range(self.num_layers):
        self.transformer_self_attention_layers.append(
            SelfAttentionLayer(
                d_model=hidden_dim,
                nhead=nheads,
                dropout=0.0,
                normalize_before=pre_norm,
            )
        )
        self.transformer_cross_attention_layers.append(
            CrossAttentionLayer(
                d_model=hidden_dim,
                nhead=nheads,
                dropout=0.0,
                normalize_before=pre_norm,
            )
        )
        self.transformer_ffn_layers.append(
            FFNLayer(
                d_model=hidden_dim,
                dim_feedforward=dim_feedforward,
                dropout=0.0,
                normalize_before=pre_norm,
            )
        )
    # Shared norm applied to the queries before every prediction head call.
    self.decoder_norm = nn.LayerNorm(hidden_dim)
    self.num_queries = num_queries
    # learnable query features
    self.query_feat = nn.Embedding(num_queries, hidden_dim)
    # learnable query p.e.
    self.query_embed = nn.Embedding(num_queries, hidden_dim)
    # level embedding (we always use 3 scales)
    self.num_feature_levels = 3
    self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)
    # Per-level input projection: a 1x1 conv when channels differ (or when forced),
    # otherwise an empty nn.Sequential that passes the input through.
    self.input_proj = nn.ModuleList()
    for _ in range(self.num_feature_levels):
        if in_channels != hidden_dim or enforce_input_project:
            self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1))
            weight_init.c2_xavier_fill(self.input_proj[-1])
        else:
            self.input_proj.append(nn.Sequential())
    # output FFNs
    if self.mask_classification:
        # one extra logit on top of num_classes
        self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
    self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
@classmethod
def from_config(cls, cfg, in_channels, mask_classification):
    """Translate a config node into the keyword arguments of ``__init__``."""
    head_cfg = cfg.MODEL.SEM_SEG_HEAD
    former_cfg = cfg.MODEL.MASK_FORMER

    # NOTE: the learnable query features are supervised as well, so one
    # prediction is produced before any decoder layer runs. To keep the
    # number of auxiliary losses equal to DEC_LAYERS, one fewer decoder
    # layer is actually built than the config value states.
    assert cfg.MODEL.MASK_FORMER.DEC_LAYERS >= 1

    return {
        "in_channels": in_channels,
        "mask_classification": mask_classification,
        "num_classes": head_cfg.NUM_CLASSES,
        "hidden_dim": former_cfg.HIDDEN_DIM,
        "num_queries": former_cfg.NUM_OBJECT_QUERIES,
        # Transformer parameters:
        "nheads": former_cfg.NHEADS,
        "dim_feedforward": former_cfg.DIM_FEEDFORWARD,
        "dec_layers": former_cfg.DEC_LAYERS - 1,
        "pre_norm": former_cfg.PRE_NORM,
        "enforce_input_project": former_cfg.ENFORCE_INPUT_PROJ,
        "mask_dim": head_cfg.MASK_DIM,
        "num_frames": cfg.INPUT.SAMPLING_FRAME_NUM,
    }
def forward(self, x, mask_features, mask = None):
    """Run the masked-attention decoder over a frame-flattened clip batch.

    Args:
        x: sequence of ``self.num_feature_levels`` multi-scale feature maps
            (length is asserted); each is viewed as (bs, t, C, H, W) below.
        mask_features: 4-D tensor unpacked as (bt, C, H, W), where ``bt`` is
            batch size times frames per clip.
        mask: ignored; it is deleted before use.

    Returns:
        dict with ``pred_logits`` and ``pred_masks`` from the last prediction
        and ``aux_outputs`` holding all earlier predictions.
    """
    bt, c_m, h_m, w_m = mask_features.shape
    # Recover the batch size from the flattened (batch * frames) dimension.
    # NOTE(review): the clip length is inferred from hard-coded totals: 6 or 3
    # frames use self.num_frames, anything else assumes 4 frames per clip
    # (presumably the COCO pseudo-video path) — confirm against the data loader.
    # At inference the batch size is always 1.
    if bt == 6 or bt == 3: # 3 is for swinl which cannot afford batch size 2
        bs = bt // self.num_frames if self.training else 1
    else:
        bs = bt // 4 if self.training else 1 # change here
    t = bt // bs
    mask_features = mask_features.view(bs, t, c_m, h_m, w_m)
    # x is a list of multi-scale feature
    assert len(x) == self.num_feature_levels
    src = []
    pos = []
    size_list = []
    # disable mask, it does not affect performance
    del mask
    for i in range(self.num_feature_levels):
        size_list.append(x[i].shape[-2:])
        pos.append(self.pe_layer(x[i].view(bs, t, -1, size_list[-1][0], size_list[-1][1]), None).flatten(3))
        src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None])
        # NTxCxHW => NxTxCxHW => (TxHW)xNxC
        _, c, hw = src[-1].shape
        pos[-1] = pos[-1].view(bs, t, c, hw).permute(1, 3, 0, 2).flatten(0, 1)
        src[-1] = src[-1].view(bs, t, c, hw).permute(1, 3, 0, 2).flatten(0, 1)
    # QxNxC
    query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
    output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)
    predictions_class = []
    predictions_mask = []
    # prediction heads on learnable query features
    outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0])
    predictions_class.append(outputs_class)
    predictions_mask.append(outputs_mask)
    for i in range(self.num_layers):
        # cycle through the feature levels, one level per decoder layer
        level_index = i % self.num_feature_levels
        # A query whose mask blocks every position would attend to nothing;
        # clear such rows so the query attends everywhere instead.
        attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False
        # attention: cross-attention first
        output = self.transformer_cross_attention_layers[i](
            output, src[level_index],
            memory_mask=attn_mask,
            memory_key_padding_mask=None, # here we do not apply masking on padded region
            pos=pos[level_index], query_pos=query_embed
        )
        output = self.transformer_self_attention_layers[i](
            output, tgt_mask=None,
            tgt_key_padding_mask=None,
            query_pos=query_embed
        )
        # FFN
        output = self.transformer_ffn_layers[i](
            output
        )
        # The attention mask is produced at the resolution of the level the next layer attends to.
        outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels])
        predictions_class.append(outputs_class)
        predictions_mask.append(outputs_mask)
    # one prediction from the raw queries plus one per decoder layer
    assert len(predictions_class) == self.num_layers + 1
    out = {
        'pred_logits': predictions_class[-1],
        'pred_masks': predictions_mask[-1],
        'aux_outputs': self._set_aux_loss(
            predictions_class if self.mask_classification else None, predictions_mask
        )
    }
    return out
def forward_prediction_heads(self, output, mask_features, attn_mask_target_size):
    """Compute class logits, mask logits and the next attention mask.

    Returns:
        ``(outputs_class, outputs_mask, attn_mask)`` where ``outputs_mask`` is
        [B, Q, T, H, W] and ``attn_mask`` is a detached bool tensor of shape
        [B * num_heads, Q, T * h * w] at ``attn_mask_target_size`` resolution.
    """
    # normalize the queries, then put the batch dimension first
    queries = self.decoder_norm(output).transpose(0, 1)
    outputs_class = self.class_embed(queries)
    mask_embed = self.mask_embed(queries)
    outputs_mask = torch.einsum("bqc,btchw->bqthw", mask_embed, mask_features)

    b, q, t = outputs_mask.shape[:3]
    tgt_h, tgt_w = attn_mask_target_size[0], attn_mask_target_size[1]

    # NOTE: the prediction is higher-resolution than the attended feature
    # level, so it is resized per frame before thresholding.
    resized = F.interpolate(
        outputs_mask.flatten(0, 1),
        size=attn_mask_target_size,
        mode="bilinear",
        align_corners=False,
    )
    resized = resized.view(b, q, t, tgt_h, tgt_w)

    # [B, Q, T, h, w] -> [B, Q, T*h*w] -> [B, heads, Q, T*h*w] -> [B*heads, Q, T*h*w]
    probs = resized.sigmoid().flatten(2)
    per_head = probs.unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1)

    # Must be bool: ``True`` marks positions that may NOT be attended to,
    # i.e. locations the current mask prediction considers background.
    attn_mask = (per_head < 0.5).bool()
    attn_mask = attn_mask.detach()
    return outputs_class, outputs_mask, attn_mask
@torch.jit.unused
def _set_aux_loss(self, outputs_class, outputs_seg_masks):
    """Package every prediction except the last one as auxiliary outputs.

    Kept as a separate, script-excluded method because torchscript does not
    support dictionaries with non-homogeneous values (a Tensor next to a list).
    """
    aux_masks = outputs_seg_masks[:-1]
    if not self.mask_classification:
        return [{"pred_masks": m} for m in aux_masks]
    aux_logits = outputs_class[:-1]
    return [
        {"pred_logits": logits, "pred_masks": m}
        for logits, m in zip(aux_logits, aux_masks)
    ]
================================================
FILE: mask2former_video/utils/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mask2former_video/utils/__init__.py.new
================================================
================================================
FILE: mask2former_video/utils/memory.py
================================================
import logging
from contextlib import contextmanager
from functools import wraps
import torch
from torch.cuda.amp import autocast
__all__ = ["retry_if_cuda_oom"]
@contextmanager
def _ignore_torch_cuda_oom():
    """
    Context manager that swallows PyTorch CUDA out-of-memory errors.

    Any other ``RuntimeError`` is re-raised unchanged.
    """
    try:
        yield
    except RuntimeError as err:
        # NOTE: detection relies on the message text, which may change?
        if "CUDA out of memory. " not in str(err):
            raise
def retry_if_cuda_oom(func):
    """
    Wrap ``func`` so that it is retried when PyTorch reports a CUDA OOM error.

    The wrapper makes up to three attempts:
    1. call ``func`` as-is;
    2. call ``torch.cuda.empty_cache()`` and try again;
    3. move CUDA arguments to CPU (cast to ``float32``) and call ``func``
       once more with autocast disabled. ``func`` is expected to dispatch
       to a CPU implementation in that case.

    The result of the last attempt may therefore live on the CPU; converting
    it back to CUDA is the caller's responsibility.
    Args:
        func: a stateless callable that takes tensor-like objects as arguments
    Returns:
        a callable which retries `func` if OOM is encountered.
    Examples:
    ::
        output = retry_if_cuda_oom(some_torch_function)(input1, input2)
        # output may be on CPU even if inputs are on GPU
    Note:
        1. Only top-level arguments exposing `.device` and `.to` are moved
           to CPU. Nested structures of tensors are not supported.
        2. Since the function might be called more than once, it has to be
           stateless.
    """

    def maybe_to_cpu(x):
        # Objects without a ``.device.type`` are passed through untouched.
        try:
            on_gpu = x.device.type == "cuda"
        except AttributeError:
            return x
        if on_gpu and hasattr(x, "to"):
            return x.to(device="cpu").to(torch.float32)
        return x

    @wraps(func)
    def wrapped(*args, **kwargs):
        # First a plain attempt, then one more after clearing the cache.
        for attempt in range(2):
            if attempt == 1:
                torch.cuda.empty_cache()
            with _ignore_torch_cuda_oom():
                return func(*args, **kwargs)

        # Try on CPU. This slows down the code significantly, therefore print a notice.
        logger = logging.getLogger(__name__)
        logger.info("Attempting to copy inputs to CPU due to CUDA OOM")
        cpu_args = tuple(maybe_to_cpu(a) for a in args)
        cpu_kwargs = {name: maybe_to_cpu(value) for name, value in kwargs.items()}
        with autocast(enabled=False):
            return func(*cpu_args, **cpu_kwargs)

    return wrapped
================================================
FILE: mask2former_video/video_maskformer_model.py
================================================
import logging
import math
from typing import Tuple
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
from detectron2.modeling.backbone import Backbone
from detectron2.modeling.postprocessing import sem_seg_postprocess
from detectron2.structures import Boxes, ImageList, Instances, BitMasks
from .modeling.criterion import VideoSetCriterion
from .modeling.matcher import VideoHungarianMatcher
from .utils.memory import retry_if_cuda_oom
from skimage import color
import cv2
import numpy as np
def unfold_wo_center(x, kernel_size, dilation):
    """Gather each pixel's dilated ``kernel_size`` x ``kernel_size`` neighbourhood, centre excluded.

    Args:
        x: 4-D tensor (N, C, H, W).
        kernel_size: odd window size.
        dilation: spacing between window elements.

    Returns:
        Tensor of shape (N, C, kernel_size**2 - 1, H, W); out-of-image
        neighbours are zero (``F.unfold`` padding).
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    # "SAME" padding: half of the dilated window extent
    same_pad = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    batch, channels, height, width = x.size(0), x.size(1), x.size(2), x.size(3)

    patches = F.unfold(x, kernel_size=kernel_size, padding=same_pad, dilation=dilation)
    patches = patches.reshape(batch, channels, -1, height, width)

    # drop the window's own centre pixel
    center = kernel_size ** 2 // 2
    before, after = patches[:, :, :center], patches[:, :, center + 1:]
    return torch.cat((before, after), dim=2)
def unfold_w_center(x, kernel_size, dilation):
    """Gather each pixel's dilated ``kernel_size`` x ``kernel_size`` neighbourhood, centre included.

    Returns:
        Tensor of shape (N, C, kernel_size**2, H, W) for a 4-D input
        (N, C, H, W); out-of-image neighbours are zero.
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    # "SAME" padding: half of the dilated window extent
    same_pad = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    batch, channels, height, width = x.size(0), x.size(1), x.size(2), x.size(3)

    patches = F.unfold(x, kernel_size=kernel_size, padding=same_pad, dilation=dilation)
    return patches.reshape(batch, channels, -1, height, width)
def get_images_color_similarity(images, kernel_size, dilation):
    """Colour similarity between every pixel and its neighbours in the same image.

    Similarity is ``exp(-0.5 * ||c_pixel - c_neighbour||)`` with the norm taken
    over the channel dimension. Only a single image (batch size 1) is accepted.
    """
    assert images.dim() == 4
    assert images.size(0) == 1

    neighbours = unfold_wo_center(images, kernel_size=kernel_size, dilation=dilation)
    # broadcast the centre pixel against its neighbourhood axis
    color_dist = torch.norm(images[:, :, None] - neighbours, dim=1)
    return torch.exp(-color_dist * 0.5)
def get_neighbor_images_color_similarity(images, images_neighbor, kernel_size, dilation):
    """Colour similarity between pixels of ``images_neighbor`` and windows of ``images``.

    Each pixel of ``images_neighbor`` is compared with the (centre-included)
    window around the same location in ``images``, using
    ``exp(-0.5 * ||difference||)`` over the channel dimension.
    """
    assert images.dim() == 4
    assert images.size(0) == 1

    windows = unfold_w_center(images, kernel_size=kernel_size, dilation=dilation)
    color_dist = torch.norm(images_neighbor[:, :, None] - windows, dim=1)
    return torch.exp(-color_dist * 0.5)
def get_neighbor_images_patch_color_similarity(images, images_neighbor, kernel_size, dilation):
    """Patch-level colour similarity between two frames.

    Every pixel is first replaced by its flattened ``kernel_size`` x
    ``kernel_size`` patch (dilation 1); the patch descriptors of the two
    frames are then compared with a fixed 3x3 window at dilation 3.

    NOTE: the ``dilation`` argument is accepted but not used.
    """
    assert images.dim() == 4
    assert images.size(0) == 1

    # (1, C, k*k, H, W) -> (1, C*k*k, H, W) for both frames
    patches, patches_neighbor = (
        unfold_w_center(frame, kernel_size=kernel_size, dilation=1).flatten(1, 2)
        for frame in (images, images_neighbor)
    )
    return get_neighbor_images_color_similarity(patches, patches_neighbor, 3, 3)
logger = logging.getLogger(__name__)
@META_ARCH_REGISTRY.register()
class VideoMaskFormer(nn.Module):
"""
Main class for mask classification semantic segmentation architectures.
"""
@configurable
def __init__(
    self,
    *,
    backbone: Backbone,
    sem_seg_head: nn.Module,
    criterion: nn.Module,
    num_queries: int,
    object_mask_threshold: float,
    overlap_threshold: float,
    metadata,
    size_divisibility: int,
    sem_seg_postprocess_before_inference: bool,
    pixel_mean: Tuple[float],
    pixel_std: Tuple[float],
    # video
    num_frames,
):
    """
    Args:
        backbone: a backbone module, must follow detectron2's backbone interface
        sem_seg_head: a module that predicts semantic segmentation from backbone features
        criterion: a module that defines the loss
        num_queries: int, number of queries
        object_mask_threshold: float, threshold to filter query based on classification score
            for panoptic segmentation inference
        overlap_threshold: overlap threshold used in general inference for panoptic segmentation
        metadata: dataset meta, get `thing` and `stuff` category names for panoptic
            segmentation inference
        size_divisibility: Some backbones require the input height and width to be divisible by a
            specific integer. We can use this to override such requirement.
            A negative value falls back to ``backbone.size_divisibility``.
        sem_seg_postprocess_before_inference: whether to resize the prediction back
            to original input size before semantic segmentation inference or after.
            For high-resolution dataset like Mapillary, resizing predictions before
            inference will cause OOM error.
        pixel_mean, pixel_std: list or tuple with #channels element, representing
            the per-channel mean and std to be used to normalize the input image
        num_frames: stored as ``self.num_frames``; used by ``prepare_targets`` as the
            frame dimension of the ground-truth mask tensor
    """
    super().__init__()
    self.backbone = backbone
    self.sem_seg_head = sem_seg_head
    self.criterion = criterion
    self.num_queries = num_queries
    self.overlap_threshold = overlap_threshold
    self.object_mask_threshold = object_mask_threshold
    self.metadata = metadata
    if size_divisibility < 0:
        # use backbone size_divisibility if not set
        size_divisibility = self.backbone.size_divisibility
    self.size_divisibility = size_divisibility
    self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
    # Registered with persistent=False (third positional argument) and shaped (C, 1, 1)
    # so they broadcast over (C, H, W) frames.
    self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
    self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
    self.num_frames = num_frames
    #self.structure_fc = nn.Conv2d(27, 256, 1)
@classmethod
def from_config(cls, cfg):
    """Build backbone, head, matcher and criterion from ``cfg`` and return ``__init__`` kwargs."""
    backbone = build_backbone(cfg)
    sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())

    former_cfg = cfg.MODEL.MASK_FORMER

    # loss weights
    class_weight = former_cfg.CLASS_WEIGHT
    dice_weight = former_cfg.DICE_WEIGHT
    mask_weight = former_cfg.MASK_WEIGHT

    # building criterion
    matcher = VideoHungarianMatcher(
        cost_class=class_weight,
        cost_mask=mask_weight,
        cost_dice=dice_weight,
        num_points=former_cfg.TRAIN_NUM_POINTS,
    )

    weight_dict = {
        "loss_ce": class_weight,
        "loss_mask": mask_weight,
        "loss_mask_proj": mask_weight,
        "loss_dice": dice_weight,
        "loss_bound": mask_weight,
        "loss_bound_neighbor": mask_weight,
        "loss_out_box": mask_weight,
    }
    if former_cfg.DEEP_SUPERVISION:
        # Replicate every base weight once per auxiliary output, suffixed with the layer index.
        base_weights = list(weight_dict.items())
        for layer_idx in range(former_cfg.DEC_LAYERS - 1):
            for loss_name, weight in base_weights:
                weight_dict[loss_name + f"_{layer_idx}"] = weight

    criterion = VideoSetCriterion(
        sem_seg_head.num_classes,
        matcher=matcher,
        weight_dict=weight_dict,
        eos_coef=former_cfg.NO_OBJECT_WEIGHT,
        losses=["labels", "masks"],
        num_points=former_cfg.TRAIN_NUM_POINTS,
        oversample_ratio=former_cfg.OVERSAMPLE_RATIO,
        importance_sample_ratio=former_cfg.IMPORTANCE_SAMPLE_RATIO,
    )

    return {
        "backbone": backbone,
        "sem_seg_head": sem_seg_head,
        "criterion": criterion,
        "num_queries": former_cfg.NUM_OBJECT_QUERIES,
        "object_mask_threshold": former_cfg.TEST.OBJECT_MASK_THRESHOLD,
        "overlap_threshold": former_cfg.TEST.OVERLAP_THRESHOLD,
        "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
        "size_divisibility": former_cfg.SIZE_DIVISIBILITY,
        "sem_seg_postprocess_before_inference": True,
        "pixel_mean": cfg.MODEL.PIXEL_MEAN,
        "pixel_std": cfg.MODEL.PIXEL_STD,
        # video
        "num_frames": cfg.INPUT.SAMPLING_FRAME_NUM,
    }
@property
def device(self):
    """Device of the ``pixel_mean`` buffer, used as the model's device."""
    reference = self.pixel_mean
    return reference.device
def forward(self, batched_inputs):
    """
    Compute training losses or, at inference, predictions for the first video.

    Args:
        batched_inputs: a list of per-video dicts. Each dict contains:
            * "image": iterable of frame tensors (moved to ``self.device`` here).
            * "instances": per-frame ground truth, read by ``prepare_targets``
              during training.
            * optionally "height" / "width" (int): output resolution used at
              inference; defaults to the un-padded input size.
    Returns:
        Training: dict of losses, each multiplied by its entry in
        ``self.criterion.weight_dict``; losses without an entry are dropped.
        Inference: the dict returned by ``inference_video``. Only the first
        element of the batch is processed.
    """
    # Flatten all frames of all videos into one list.
    images = []
    for video in batched_inputs:
        for frame in video["image"]:
            images.append(frame.to(self.device))
    # NOTE(review): the batch is treated as a COCO pseudo-video batch purely from the
    # total frame count (8 or 4) — confirm this matches the data loader configuration.
    is_coco = (len(images) == 8) or (len(images) == 4)# change here, 4 is for swinl with bs 1 which cannot afford batch size 2
    if self.training and not is_coco:
        # Colour-similarity maps on 4x-downsampled frames, passed to the criterion below.
        k_size = 3
        rs_images = ImageList.from_tensors(images, self.size_divisibility)
        downsampled_images = F.avg_pool2d(rs_images.tensor.float(), kernel_size=4, stride=4, padding=0) #for img in images]
        # NOTE(review): channels are reversed ([2, 1, 0]) before rgb2lab — confirm this
        # matches the input channel order.
        images_lab = [torch.as_tensor(color.rgb2lab(ds_image[[2, 1, 0]].byte().permute(1, 2, 0).cpu().numpy()), device=ds_image.device, dtype=torch.float32).permute(2, 0, 1) for ds_image in downsampled_images]
        # within-frame similarity (3x3 window, dilation 2)
        images_lab_sim = [get_images_color_similarity(img_lab.unsqueeze(0), k_size, 2) for img_lab in images_lab] # ori is 0.3, 0.5, 0.7
        # Cross-frame patch similarity for frame pairs (0,1), (0,2) and (1,2) of each clip.
        # The stride of 3 and the +1/+2 offsets hard-code 3 frames per clip.
        images_lab_sim_nei = [get_neighbor_images_patch_color_similarity(images_lab[ii].unsqueeze(0), images_lab[ii+1].unsqueeze(0), 3, 3) for ii in range(0, len(images_lab), 3)] # change k form 3 to 5, ori is 3, ori dilation is 3
        images_lab_sim_nei1 = [get_neighbor_images_patch_color_similarity(images_lab[ii].unsqueeze(0), images_lab[ii+2].unsqueeze(0), 3, 3) for ii in range(0, len(images_lab), 3)]
        images_lab_sim_nei2 = [get_neighbor_images_patch_color_similarity(images_lab[ii+1].unsqueeze(0), images_lab[ii+2].unsqueeze(0), 3, 3) for ii in range(0, len(images_lab), 3)]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    images = ImageList.from_tensors(images, self.size_divisibility)
    features = self.backbone(images.tensor)
    outputs = self.sem_seg_head(features)
    if self.training:
        # mask classification target
        targets = self.prepare_targets(batched_inputs, images, is_coco)
        if not is_coco:
            # bipartite matching-based loss
            losses = self.criterion(outputs, targets, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2)
        else:
            losses = self.criterion(outputs, targets, None, None, None, None)
        for k in list(losses.keys()):
            if k in self.criterion.weight_dict:
                losses[k] *= self.criterion.weight_dict[k]
            else:
                # remove this loss if not specified in `weight_dict`
                losses.pop(k)
        return losses
    else:
        mask_cls_results = outputs["pred_logits"]
        mask_pred_results = outputs["pred_masks"]
        # only the first video of the batch is evaluated
        mask_cls_result = mask_cls_results[0]
        # upsample masks
        mask_pred_result = retry_if_cuda_oom(F.interpolate)(
            mask_pred_results[0],
            size=(images.tensor.shape[-2], images.tensor.shape[-1]),
            mode="bilinear",
            align_corners=False,
        )
        del outputs
        input_per_image = batched_inputs[0]
        image_size = images.image_sizes[0] # image size without padding after data augmentation
        height = input_per_image.get("height", image_size[0]) # raw image size before data augmentation
        width = input_per_image.get("width", image_size[1])
        return retry_if_cuda_oom(self.inference_video)(mask_cls_result, mask_pred_result, image_size, height, width)
def prepare_targets(self, targets, images, is_coco):
    """Convert per-frame ground truth into one target dict per video.

    Args:
        targets: list of per-video dicts whose "instances" entry is a
            sequence of per-frame instance objects.
        images: padded image list; only ``images.tensor.shape[-2:]`` is read.
        is_coco: when true, the mask tensor is allocated with 4 frames
            instead of ``self.num_frames``.

    Returns:
        list of dicts with keys "labels", "ids" (instances x frames, -1 where
        the instance is absent) and "masks" (float, padded to the batch size),
        restricted to instances visible in at least one frame.
    """
    h_pad, w_pad = images.tensor.shape[-2:]
    gt_instances = []
    for targets_per_video in targets:
        _num_instance = len(targets_per_video["instances"][0])
        if is_coco:
            mask_shape = [_num_instance, 4, h_pad, w_pad] #change here
        else:
            mask_shape = [_num_instance, self.num_frames, h_pad, w_pad]
        gt_masks_per_video = torch.zeros(mask_shape, dtype=torch.bool, device=self.device)
        gt_classes_per_video = targets_per_video["instances"][0].gt_classes.to(self.device)
        gt_ids_per_video = []
        for f_i, targets_per_frame in enumerate(targets_per_video["instances"]):
            targets_per_frame = targets_per_frame.to(self.device)
            h, w = targets_per_frame.image_size
            # Fill in classes still marked -1 from the current frame.
            _update_cls = gt_classes_per_video == -1
            gt_classes_per_video[_update_cls] = targets_per_frame.gt_classes[_update_cls]
            gt_ids_per_video.append(targets_per_frame.gt_ids[:, None])
            # Copy the frame's masks into the top-left corner of the padded canvas.
            if isinstance(targets_per_frame.gt_masks, BitMasks):
                gt_masks_per_video[:, f_i, :h, :w] = targets_per_frame.gt_masks.tensor
            else: #polygon
                gt_masks_per_video[:, f_i, :h, :w] = targets_per_frame.gt_masks
        gt_ids_per_video = torch.cat(gt_ids_per_video, dim=1)
        # An instance whose mask is empty in a frame is marked absent (-1) for that frame.
        gt_ids_per_video[gt_masks_per_video.sum(dim=(2,3)) == 0] = -1
        valid_bool_frame = (gt_ids_per_video != -1)
        # keep instances that are present in at least one frame
        valid_bool_clip = valid_bool_frame.any(dim=-1)
        # valid_idx = (gt_ids_per_video != -1).any(dim=-1)
        gt_classes_per_video = gt_classes_per_video[valid_bool_clip].long() #targets_per_frame.gt_classes[valid_idx] # N,
        gt_ids_per_video = gt_ids_per_video[valid_bool_clip].long() # N, num_frames
        valid_bool_frame = valid_bool_frame[valid_bool_clip]
        if len(gt_ids_per_video) > 0:
            # Shift the valid ids down by the smallest valid id (never by a negative amount).
            min_id = max(gt_ids_per_video[valid_bool_frame].min(), 0)
            gt_ids_per_video[valid_bool_frame] -= min_id
        gt_instances.append({"labels": gt_classes_per_video, "ids": gt_ids_per_video})
        gt_masks_per_video = gt_masks_per_video[valid_bool_clip].float() # N, num_frames, H, W
        gt_instances[-1].update({"masks": gt_masks_per_video})
    return gt_instances
def inference_video(self, pred_cls, pred_masks, img_size, output_height, output_width):
    """Turn per-query predictions into up to ten scored instance masks.

    Args:
        pred_cls: class logits, one row per query; the last column is dropped
            after the softmax.
        pred_masks: mask logits indexed by query along dim 0, with the
            spatial size of the padded input in the last two dims.
        img_size: (height, width) of the un-padded input; used to crop the padding.
        output_height, output_width: resolution the masks are resized to.

    Returns:
        dict with "image_size", "pred_scores" (list of float), "pred_labels"
        (list of int) and "pred_masks" (list of CPU bool tensors). The lists
        are empty when ``pred_cls`` is empty.
    """
    if len(pred_cls) > 0:
        num_classes = self.sem_seg_head.num_classes
        scores = F.softmax(pred_cls, dim=-1)[:, :-1]
        labels = torch.arange(num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1)
        flat_scores = scores.flatten(0, 1)
        # Keep the top-10 (query, class) pairs. k is clamped because topk raises
        # when asked for more elements than exist (few queries or classes).
        num_keep = min(10, flat_scores.numel())
        scores_per_image, topk_indices = flat_scores.topk(num_keep, sorted=False)
        labels_per_image = labels[topk_indices]
        # flat (query, class) index -> query index
        topk_indices = topk_indices // num_classes
        pred_masks = pred_masks[topk_indices]
        # crop away the padding, then resize to the requested output resolution
        pred_masks = pred_masks[:, :, : img_size[0], : img_size[1]]
        pred_masks = F.interpolate(
            pred_masks, size=(output_height, output_width), mode="bilinear", align_corners=False
        )
        # logits > 0 is equivalent to sigmoid > 0.5
        masks = pred_masks > 0.
        out_scores = scores_per_image.tolist()
        out_labels = labels_per_image.tolist()
        out_masks = list(masks.cpu())
    else:
        out_scores = []
        out_labels = []
        out_masks = []
    video_output = {
        "image_size": (output_height, output_width),
        "pred_scores": out_scores,
        "pred_labels": out_labels,
        "pred_masks": out_masks,
    }
    return video_output
================================================
FILE: mfvis_nococo/__init__.py
================================================
from . import modeling
# config
from .config import add_maskformer2_video_config
# models
from .video_maskformer_model import VideoMaskFormer
# video
from .data_video import (
YTVISDatasetMapper,
YTVISEvaluator,
build_detection_train_loader,
build_detection_test_loader,
get_detection_dataset_dicts,
)
================================================
FILE: mfvis_nococo/configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml
================================================
MODEL:
BACKBONE:
FREEZE_AT: 0
NAME: "build_resnet_backbone"
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MASK_ON: True
RESNETS:
DEPTH: 50
STEM_TYPE: "basic" # not used
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: False
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
# NORM: "SyncBN"
RES5_MULTI_GRID: [1, 1, 1] # not used
DATASETS:
TRAIN: ("ytvis_2019_train",)
TEST: ("ytvis_2019_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.0001
STEPS: (4000,)
MAX_ITER: 6000
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
BACKBONE_MULTIPLIER: 0.1
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
INPUT:
MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
RANDOM_FLIP: "flip_by_clip"
AUGMENTATIONS: []
MIN_SIZE_TRAIN: (360, 480)
MIN_SIZE_TEST: 360
CROP:
ENABLED: False
TYPE: "absolute_range"
SIZE: (600, 720)
FORMAT: "RGB"
TEST:
EVAL_PERIOD: 0
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
VERSION: 2
================================================
FILE: mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep_coco.yaml
================================================
_BASE_: video_maskformer2_R50_bs16_8ep.yaml
OUTPUT_DIR: 'box_patch_newknn_t5s5_spretrained1_r101_correct'
MODEL:
WEIGHTS: "./pretrained_model/model_final_eba159.pkl"
RESNETS:
DEPTH: 101
STEM_TYPE: "basic" # not used
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: False
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
# NORM: "SyncBN"
RES5_MULTI_GRID: [1, 1, 1] # not used
================================================
FILE: mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml
================================================
_BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml
OUTPUT_DIR: 'box_patch_newknn_t5s5_spretrained3_correct1'
SEED: 29118357
MODEL:
WEIGHTS: "./model_final_proj.pth"
META_ARCHITECTURE: "VideoMaskFormer"
SEM_SEG_HEAD:
NAME: "MaskFormerHead"
IGNORE_VALUE: 255
NUM_CLASSES: 40
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
# pixel decoder
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
IN_FEATURES: ["res2", "res3", "res4", "res5"]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
MASK_FORMER:
TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder"
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 2.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 100
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 20000 #20000 #12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TEST:
SEMANTIC_ON: False
INSTANCE_ON: True
PANOPTIC_ON: False
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.8
================================================
FILE: mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep_coco.yaml
================================================
_BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml
OUTPUT_DIR: 'box_patch_newknn_t5s5_spretrained3_coco_correct1'
SEED: 29118357
MODEL:
WEIGHTS: "./pretrained_model/model_final_3c8ec9.pkl"
META_ARCHITECTURE: "VideoMaskFormer"
SEM_SEG_HEAD:
NAME: "MaskFormerHead"
IGNORE_VALUE: 255
NUM_CLASSES: 40
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
# pixel decoder
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
IN_FEATURES: ["res2", "res3", "res4", "res5"]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
MASK_FORMER:
TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder"
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 2.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 100
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 20000 #20000 #12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TEST:
SEMANTIC_ON: False
INSTANCE_ON: True
PANOPTIC_ON: False
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.8
================================================
FILE: mfvis_nococo/mask2former/__init__.py
================================================
from . import data # register all new datasets
from . import modeling
# config
from .config import add_maskformer2_config
# dataset loading
from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
from .data.dataset_mappers.mask_former_instance_dataset_mapper import (
MaskFormerInstanceDatasetMapper,
)
from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
MaskFormerPanopticDatasetMapper,
)
from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
MaskFormerSemanticDatasetMapper,
)
# models
from .maskformer_model import MaskFormer
from .test_time_augmentation import SemanticSegmentorWithTTA
# evaluation
from .evaluation.instance_evaluation import InstanceSegEvaluator
================================================
FILE: mfvis_nococo/mask2former/config.py
================================================
# -*- coding: utf-8 -*-
from detectron2.config import CfgNode as CN
def add_maskformer2_config(cfg):
    """
    Install the default MaskFormer / Mask2Former options on ``cfg`` in place.

    The sub-nodes ``MODEL.MASK_FORMER``, ``MODEL.MASK_FORMER.TEST`` and
    ``MODEL.SWIN`` are created here. ``INPUT``, ``SOLVER`` and
    ``MODEL.SEM_SEG_HEAD`` are only extended with extra keys, so they must
    already exist on ``cfg``. Nothing is returned.
    """
    # ------------------------------------------------------------------
    # Options inherited from the original MaskFormer
    # ------------------------------------------------------------------
    # Data pipeline.
    data = cfg.INPUT
    data.DATASET_MAPPER_NAME = "mask_former_semantic"  # which dataset mapper to use
    data.COLOR_AUG_SSD = False  # colour augmentation switch
    # Random cropping is retried until no single category of the semantic
    # segmentation GT covers more than this fraction of the crop.
    data.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
    # Padding of image and segmentation GT done inside the dataset mapper.
    data.SIZE_DIVISIBILITY = -1

    # Solver.
    solver = cfg.SOLVER
    solver.WEIGHT_DECAY_EMBED = 0.0  # weight decay applied to embeddings
    solver.OPTIMIZER = "ADAMW"
    solver.BACKBONE_MULTIPLIER = 0.1

    # MaskFormer model node.
    cfg.MODEL.MASK_FORMER = CN()
    mask_former = cfg.MODEL.MASK_FORMER

    # Loss.
    mask_former.DEEP_SUPERVISION = True
    mask_former.NO_OBJECT_WEIGHT = 0.1
    mask_former.CLASS_WEIGHT = 1.0
    mask_former.DICE_WEIGHT = 1.0
    mask_former.MASK_WEIGHT = 20.0

    # Transformer.
    mask_former.NHEADS = 8
    mask_former.DROPOUT = 0.1
    mask_former.DIM_FEEDFORWARD = 2048
    mask_former.ENC_LAYERS = 0
    mask_former.DEC_LAYERS = 6
    mask_former.PRE_NORM = False

    mask_former.HIDDEN_DIM = 256
    mask_former.NUM_OBJECT_QUERIES = 100

    mask_former.TRANSFORMER_IN_FEATURE = "res5"
    mask_former.ENFORCE_INPUT_PROJ = False

    # Inference.
    mask_former.TEST = CN()
    inference = mask_former.TEST
    inference.SEMANTIC_ON = True
    inference.INSTANCE_ON = False
    inference.PANOPTIC_ON = False
    inference.OBJECT_MASK_THRESHOLD = 0.0
    inference.OVERLAP_THRESHOLD = 0.0
    inference.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False

    # Some backbones (e.g. ResNet) report `backbone.size_divisibility` as 0;
    # this value can be used to override it.
    mask_former.SIZE_DIVISIBILITY = 32

    # Pixel decoder.
    seg_head = cfg.MODEL.SEM_SEG_HEAD
    seg_head.MASK_DIM = 256
    seg_head.TRANSFORMER_ENC_LAYERS = 0  # transformer layers inside the pixel decoder
    seg_head.PIXEL_DECODER_NAME = "BasePixelDecoder"

    # Swin transformer backbone.
    cfg.MODEL.SWIN = CN()
    swin = cfg.MODEL.SWIN
    swin.PRETRAIN_IMG_SIZE = 224
    swin.PATCH_SIZE = 4
    swin.EMBED_DIM = 96
    swin.DEPTHS = [2, 2, 6, 2]
    swin.NUM_HEADS = [3, 6, 12, 24]
    swin.WINDOW_SIZE = 7
    swin.MLP_RATIO = 4.0
    swin.QKV_BIAS = True
    swin.QK_SCALE = None
    swin.DROP_RATE = 0.0
    swin.ATTN_DROP_RATE = 0.0
    swin.DROP_PATH_RATE = 0.3
    swin.APE = False
    swin.PATCH_NORM = True
    swin.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
    swin.USE_CHECKPOINT = False

    # ------------------------------------------------------------------
    # Extra options introduced by Mask2Former
    # ------------------------------------------------------------------
    # Transformer decoder module.
    mask_former.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder"

    # LSJ augmentation.
    data.IMAGE_SIZE = 1024
    data.MIN_SCALE = 0.1
    data.MAX_SCALE = 2.0

    # MSDeformAttn encoder.
    seg_head.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
    seg_head.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
    seg_head.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8

    # Point-based mask loss.
    # Number of points sampled during training for a mask point head.
    mask_former.TRAIN_NUM_POINTS = 112 * 112
    # Oversampling parameter for PointRend point sampling during training
    # (parameter `k` in the original paper).
    mask_former.OVERSAMPLE_RATIO = 3.0
    # Importance sampling parameter for PointRend point sampling during
    # training (parameter `beta` in the original paper).
    mask_former.IMPORTANCE_SAMPLE_RATIO = 0.75
================================================
FILE: mfvis_nococo/mask2former/data/__init__.py
================================================
from . import datasets
================================================
FILE: mfvis_nococo/mask2former/data/dataset_mappers/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mfvis_nococo/mask2former/data/dataset_mappers/__init__.py.new
================================================
================================================
FILE: mfvis_nococo/mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging
import numpy as np
import torch
from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Instances
from pycocotools import mask as coco_mask
__all__ = ["COCOInstanceNewBaselineDatasetMapper"]
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
    """
    Replace every mask by the filled rectangle of its bounding box.

    Despite its name, this function does not return box coordinates: for each
    mask, every pixel inside the axis-aligned bounding box of its non-zero
    pixels is set to 1, so the result is a "box-shaped" mask of the same shape.

    The input tensor is modified in place and the same tensor is returned.
    Masks without any non-zero pixel are left untouched.

    Args:
        masks (Tensor[N, H, W]): masks to transform where N is the number of masks
            and (H, W) are the spatial dimensions.

    Returns:
        Tensor[N, H, W]: ``masks`` itself, with each bounding box filled.
    """
    if masks.numel() == 0:
        return masks

    for index, mask in enumerate(masks):
        rows, cols = torch.where(mask != 0)
        if rows.numel() == 0:
            # Nothing to bound for an empty mask.
            continue
        top, bottom = int(rows.min()), int(rows.max())
        left, right = int(cols.min()), int(cols.max())
        # Slice ends are exclusive, so add 1 to keep the last row and column of
        # the box (they were previously dropped).
        masks[index, top:bottom + 1, left:right + 1] = 1

    return masks
def convert_coco_poly_to_mask(segmentations, height, width):
    """
    Rasterise per-instance COCO polygons and post-process them with
    ``masks_to_boxes``.

    Each entry of ``segmentations`` is decoded through pycocotools
    (``frPyObjects`` followed by ``decode``); the decoded channels are merged
    with a logical "any" along the last axis, giving one 2-D mask per instance.

    Args:
        segmentations: iterable with one polygon collection per instance.
        height, width: size passed to pycocotools and used for the empty result.

    Returns:
        The stacked masks after ``masks_to_boxes``, or an empty
        ``(0, height, width)`` uint8 tensor when ``segmentations`` is empty.
    """
    per_instance = []
    for polygons in segmentations:
        decoded = coco_mask.decode(coco_mask.frPyObjects(polygons, height, width))
        if len(decoded.shape) < 3:
            # A single decoded mask comes back 2-D; give it a channel axis.
            decoded = decoded[..., None]
        merged = torch.as_tensor(decoded, dtype=torch.uint8).any(dim=2)
        per_instance.append(merged)

    if not per_instance:
        return torch.zeros((0, height, width), dtype=torch.uint8)

    return masks_to_boxes(torch.stack(per_instance, dim=0))
def build_transform_gen(cfg, is_train):
    """
    Assemble the training augmentation list described by ``cfg.INPUT``.

    The list holds an optional random flip (skipped when
    ``cfg.INPUT.RANDOM_FLIP`` is "none"), followed by a random rescale and a
    fixed-size square crop of side ``cfg.INPUT.IMAGE_SIZE``.

    Returns:
        list[Augmentation]
    """
    assert is_train, "Only support training augmentation"

    input_cfg = cfg.INPUT
    side = input_cfg.IMAGE_SIZE
    scale_lo = input_cfg.MIN_SCALE
    scale_hi = input_cfg.MAX_SCALE

    pipeline = []

    flip_mode = input_cfg.RANDOM_FLIP
    if flip_mode != "none":
        flip = T.RandomFlip(
            horizontal=flip_mode == "horizontal",
            vertical=flip_mode == "vertical",
        )
        pipeline.append(flip)

    rescale = T.ResizeScale(
        min_scale=scale_lo, max_scale=scale_hi, target_height=side, target_width=side
    )
    crop = T.FixedSizeCrop(crop_size=(side, side))
    pipeline += [rescale, crop]

    return pipeline
# This is specifically designed for the COCO dataset.
class COCOInstanceNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer.

    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors

    In training mode the ``gt_masks`` stored on the returned instances are the
    output of ``convert_coco_poly_to_mask`` applied to the transformed polygons,
    not the polygons themselves.
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
        )

        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        """
        Translate ``cfg`` into the keyword arguments of ``__init__``.

        ``build_transform_gen`` asserts ``is_train``, so this only succeeds for
        training.
        """
        # Build augmentation
        tfm_gens = build_transform_gen(cfg, is_train)

        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg.INPUT.FORMAT,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept. The keys
                "image" and "padding_mask" are always set; "instances" is set
                in training mode when the input has "annotations".
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        # TODO: get padding mask
        # by feeding a "segmentation mask" to the same transforms
        padding_mask = np.ones(image.shape[:2])

        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        # the crop transformation has default padding value 0 for segmentation
        padding_mask = transforms.apply_segmentation(padding_mask)
        # Invert: True now marks pixels that were NOT covered by the original image.
        padding_mask = ~ padding_mask.astype(bool)

        image_shape = image.shape[:2]  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))

        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            dataset_dict.pop("annotations", None)
            return dataset_dict

        if "annotations" in dataset_dict:
            # USER: Modify this if you want to keep them for some reason.
            for anno in dataset_dict["annotations"]:
                # Let's always keep mask
                # if not self.mask_on:
                #     anno.pop("segmentation", None)
                anno.pop("keypoints", None)

            # USER: Implement additional transformations if you have other types of data
            # Crowd annotations (iscrowd != 0) are dropped here.
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in dataset_dict.pop("annotations")
                if obj.get("iscrowd", 0) == 0
            ]
            # NOTE: does not support BitMask due to augmentation
            # Current BitMask cannot handle empty objects
            instances = utils.annotations_to_instances(annos, image_shape)
            # After transforms such as cropping are applied, the bounding box may no longer
            # tightly bound the object. As an example, imagine a triangle object
            # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
            # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
            # the intersection of original bounding box and the cropping box.
            # NOTE(review): gt_masks is read here unconditionally, while the code
            # below guards it with hasattr -- presumably every sample reaching this
            # point has at least one non-crowd annotation with a mask; confirm.
            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
            # Need to filter empty instances first (due to augmentation)
            instances = utils.filter_empty_instances(instances)
            # Generate masks from polygon
            h, w = instances.image_size
            # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
            if hasattr(instances, 'gt_masks'):
                gt_masks = instances.gt_masks
                # Polygons are rasterised and post-processed by convert_coco_poly_to_mask.
                gt_masks_box = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
                instances.gt_masks = gt_masks_box
            dataset_dict["instances"] = instances

        return dataset_dict
================================================
FILE: mfvis_nococo/mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging
import numpy as np
import torch
from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Boxes, Instances
__all__ = ["COCOPanopticNewBaselineDatasetMapper"]
def build_transform_gen(cfg, is_train):
    """
    Build the list of training augmentations configured under ``cfg.INPUT``.

    Order of the result: random flip (omitted when ``RANDOM_FLIP`` is "none"),
    random rescale between ``MIN_SCALE`` and ``MAX_SCALE``, then a fixed-size
    crop to an ``IMAGE_SIZE`` x ``IMAGE_SIZE`` square.

    Returns:
        list[Augmentation]
    """
    assert is_train, "Only support training augmentation"

    settings = cfg.INPUT
    target = settings.IMAGE_SIZE
    lowest = settings.MIN_SCALE
    highest = settings.MAX_SCALE

    steps = []

    flip_kind = settings.RANDOM_FLIP
    if flip_kind != "none":
        steps.append(
            T.RandomFlip(
                horizontal=(flip_kind == "horizontal"),
                vertical=(flip_kind == "vertical"),
            )
        )

    steps.append(
        T.ResizeScale(
            min_scale=lowest, max_scale=highest, target_height=target, target_width=target
        )
    )
    steps.append(T.FixedSizeCrop(crop_size=(target, target)))

    return steps
# This is specifically designed for the COCO dataset.
class COCOPanopticNewBaselineDatasetMapper:
    """
    Dataset mapper converting a Detectron2 dataset dict into the format used
    by MaskFormer for COCO panoptic data.

    Per sample it:

    1. reads the image named by "file_name"
    2. applies the configured transform generators to it
    3. applies the same transforms to the panoptic ground truth, if present
    4. packs the image and one binary mask per non-crowd segment into tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logger = logging.getLogger(__name__)
        logger.info(
            "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
                str(self.tfm_gens)
            )
        )

        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        """Derive the ``__init__`` keyword arguments from ``cfg``."""
        return {
            "is_train": is_train,
            "tfm_gens": build_transform_gen(cfg, is_train),
            "image_format": cfg.INPUT.FORMAT,
        }

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        record = copy.deepcopy(dataset_dict)  # the caller's dict must stay untouched
        image = utils.read_image(record["file_name"], format=self.img_format)
        utils.check_image_size(record, image)

        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        image_shape = image.shape[:2]  # h, w

        # Tensors travel through the dataloader's shared memory far more
        # cheaply than generic Python objects, so store the image as one.
        chw = np.ascontiguousarray(image.transpose(2, 0, 1))
        record["image"] = torch.as_tensor(chw)

        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            record.pop("annotations", None)
            return record

        if "pan_seg_file_name" not in record:
            return record

        pan_seg_gt = utils.read_image(record.pop("pan_seg_file_name"), "RGB")
        segments_info = record["segments_info"]

        # The panoptic map has to follow the image through the same transforms.
        pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)

        from panopticapi.utils import rgb2id

        pan_seg_gt = rgb2id(pan_seg_gt)

        instances = Instances(image_shape)
        kept_classes = []
        kept_masks = []
        for segment in segments_info:
            category = segment["category_id"]
            if segment["iscrowd"]:
                continue
            kept_classes.append(category)
            kept_masks.append(pan_seg_gt == segment["id"])

        instances.gt_classes = torch.tensor(np.array(kept_classes), dtype=torch.int64)
        if kept_masks:
            bitmasks = BitMasks(
                torch.stack(
                    [torch.from_numpy(np.ascontiguousarray(m.copy())) for m in kept_masks]
                )
            )
            instances.gt_masks = bitmasks.tensor
            instances.gt_boxes = bitmasks.get_bounding_boxes()
        else:
            # Some image does not have annotation (all ignored)
            instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
            instances.gt_boxes = Boxes(torch.zeros((0, 4)))

        record["instances"] = instances
        return record
================================================
FILE: mfvis_nococo/mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py
================================================
import copy
import logging
import numpy as np
import pycocotools.mask as mask_util
import torch
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.projects.point_rend import ColorAugSSDTransform
from detectron2.structures import BitMasks, Instances, polygons_to_bitmask
__all__ = ["MaskFormerInstanceDatasetMapper"]
class MaskFormerInstanceDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer for instance segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            size_divisibility: when positive, image and masks are padded on the
                right/bottom by ``size_divisibility - size`` pixels per axis
                (see ``__call__``); non-positive values disable padding.
        """
        self.is_train = is_train
        self.tfm_gens = augmentations
        self.img_format = image_format
        self.size_divisibility = size_divisibility

        logger = logging.getLogger(__name__)
        mode = "training" if is_train else "inference"
        logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")

    @classmethod
    def from_config(cls, cfg, is_train=True):
        """Translate ``cfg`` into the keyword arguments of ``__init__``."""
        # Build augmentation
        augs = [
            T.ResizeShortestEdge(
                cfg.INPUT.MIN_SIZE_TRAIN,
                cfg.INPUT.MAX_SIZE_TRAIN,
                cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
            )
        ]
        if cfg.INPUT.CROP.ENABLED:
            augs.append(
                T.RandomCrop(
                    cfg.INPUT.CROP.TYPE,
                    cfg.INPUT.CROP.SIZE,
                )
            )
        if cfg.INPUT.COLOR_AUG_SSD:
            augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
        augs.append(T.RandomFlip())

        ret = {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
                Must contain "annotations".

        Returns:
            dict: a format that builtin models in detectron2 accept, with
                "image" and "instances" (``gt_classes`` and ``gt_masks``) set.

        Raises:
            ValueError: if an annotation's segmentation is neither a polygon
                list, a COCO RLE dict nor a 2-D numpy array.
        """
        # The message used to name MaskFormerPanopticDatasetMapper, which sent
        # users to the wrong mapper when the assertion fired.
        assert self.is_train, "MaskFormerInstanceDatasetMapper should only be used for training!"

        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        aug_input = T.AugInput(image)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image

        # transform instance masks
        assert "annotations" in dataset_dict
        for anno in dataset_dict["annotations"]:
            anno.pop("keypoints", None)

        # Crowd annotations are dropped; the rest follow the image transforms.
        annos = [
            utils.transform_instance_annotations(obj, transforms, image.shape[:2])
            for obj in dataset_dict.pop("annotations")
            if obj.get("iscrowd", 0) == 0
        ]

        if len(annos):
            assert "segmentation" in annos[0]
        segms = [obj["segmentation"] for obj in annos]
        masks = []
        for segm in segms:
            if isinstance(segm, list):
                # polygon
                masks.append(polygons_to_bitmask(segm, *image.shape[:2]))
            elif isinstance(segm, dict):
                # COCO RLE
                masks.append(mask_util.decode(segm))
            elif isinstance(segm, np.ndarray):
                assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                    segm.ndim
                )
                # mask array
                masks.append(segm)
            else:
                raise ValueError(
                    "Cannot convert segmentation of type '{}' to BitMasks! "
                    "Supported types are: polygons as list[list[float] or ndarray],"
                    " COCO-style RLE as a dict, or a binary segmentation mask"
                    " in a 2D numpy array of shape HxW.".format(type(segm))
                )

        # Pad image and segmentation label here!
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks]

        classes = [int(obj["category_id"]) for obj in annos]
        classes = torch.tensor(classes, dtype=torch.int64)

        if self.size_divisibility > 0:
            image_size = (image.shape[-2], image.shape[-1])
            # F.pad order is (left, right, top, bottom): only right and bottom
            # are padded, by the gap between size_divisibility and the current size.
            padding_size = [
                0,
                self.size_divisibility - image_size[1],
                0,
                self.size_divisibility - image_size[0],
            ]
            # pad image
            image = F.pad(image, padding_size, value=128).contiguous()
            # pad mask
            masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks]

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = image

        # Prepare per-instance binary masks
        instances = Instances(image_shape)
        instances.gt_classes = classes
        if len(masks) == 0:
            # Some image does not have annotation (all ignored)
            instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1]))
        else:
            masks = BitMasks(torch.stack(masks))
            instances.gt_masks = masks.tensor

        dataset_dict["instances"] = instances

        return dataset_dict
================================================
FILE: mfvis_nococo/mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py
================================================
import copy
import logging
import numpy as np
import torch
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.structures import BitMasks, Instances
from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
__all__ = ["MaskFormerPanopticDatasetMapper"]
class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer for panoptic segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        ignore_label,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            ignore_label: the label that is ignored to evaluation
            size_divisibility: when positive, image and labels are padded on the
                right/bottom by ``size_divisibility - size`` pixels per axis
                (see ``__call__``); non-positive values disable padding.
        """
        super().__init__(
            is_train,
            augmentations=augmentations,
            image_format=image_format,
            ignore_label=ignore_label,
            size_divisibility=size_divisibility,
        )

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
                Must contain "pan_seg_file_name" and "segments_info";
                "sem_seg_file_name" is optional.

        Returns:
            dict: a format that builtin models in detectron2 accept, with
                "image", "instances" and (when available) "sem_seg" set.

        Raises:
            ValueError: if "pan_seg_file_name" is missing or the dict carries
                "annotations".
        """
        assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"

        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        # semantic segmentation
        if "sem_seg_file_name" in dataset_dict:
            # PyTorch transformation not implemented for uint16, so converting it to double first
            sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
        else:
            sem_seg_gt = None

        # panoptic segmentation
        if "pan_seg_file_name" in dataset_dict:
            pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
            segments_info = dataset_dict["segments_info"]
        else:
            pan_seg_gt = None
            segments_info = None

        if pan_seg_gt is None:
            raise ValueError(
                "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format(
                    dataset_dict["file_name"]
                )
            )

        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image
        if sem_seg_gt is not None:
            sem_seg_gt = aug_input.sem_seg

        # apply the same transformation to panoptic segmentation
        pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)

        from panopticapi.utils import rgb2id

        pan_seg_gt = rgb2id(pan_seg_gt)

        # Pad image and segmentation label here!
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        if sem_seg_gt is not None:
            sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
        pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long"))

        if self.size_divisibility > 0:
            image_size = (image.shape[-2], image.shape[-1])
            # F.pad order is (left, right, top, bottom): only right and bottom
            # are padded, by the gap between size_divisibility and the current size.
            padding_size = [
                0,
                self.size_divisibility - image_size[1],
                0,
                self.size_divisibility - image_size[0],
            ]
            image = F.pad(image, padding_size, value=128).contiguous()
            if sem_seg_gt is not None:
                sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
            pan_seg_gt = F.pad(
                pan_seg_gt, padding_size, value=0
            ).contiguous()  # 0 is the VOID panoptic label

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = image
        if sem_seg_gt is not None:
            dataset_dict["sem_seg"] = sem_seg_gt.long()

        if "annotations" in dataset_dict:
            # Message fixed: it previously read "Pemantic segmentation dataset".
            raise ValueError("Panoptic segmentation dataset should not have 'annotations'.")

        # Prepare per-segment binary masks (crowd segments are skipped)
        pan_seg_gt = pan_seg_gt.numpy()
        instances = Instances(image_shape)
        classes = []
        masks = []
        for segment_info in segments_info:
            class_id = segment_info["category_id"]
            if not segment_info["iscrowd"]:
                classes.append(class_id)
                masks.append(pan_seg_gt == segment_info["id"])

        classes = np.array(classes)
        instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
        if len(masks) == 0:
            # Some image does not have annotation (all ignored)
            instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
        else:
            masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
            )
            instances.gt_masks = masks.tensor

        dataset_dict["instances"] = instances

        return dataset_dict
================================================
FILE: mfvis_nococo/mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py
================================================
import copy
import logging
import numpy as np
import torch
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.projects.point_rend import ColorAugSSDTransform
from detectron2.structures import BitMasks, Instances
__all__ = ["MaskFormerSemanticDatasetMapper"]
class MaskFormerSemanticDatasetMapper:
    """
    Dataset mapper converting a Detectron2 dataset dict into the training
    format used by MaskFormer for semantic segmentation.

    Per sample it:

    1. reads the image named by "file_name" and the label map named by
       "sem_seg_file_name"
    2. runs the configured augmentations on both
    3. optionally pads both
    4. returns tensors plus one binary mask per class present in the label map
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        ignore_label,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            ignore_label: the label that is ignored to evaluation
            size_divisibility: padding control; values <= 0 disable padding
        """
        self.is_train = is_train
        self.tfm_gens = augmentations
        self.img_format = image_format
        self.ignore_label = ignore_label
        self.size_divisibility = size_divisibility

        if is_train:
            mode = "training"
        else:
            mode = "inference"
        logging.getLogger(__name__).info(
            f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}"
        )

    @classmethod
    def from_config(cls, cfg, is_train=True):
        """Derive the ``__init__`` keyword arguments from ``cfg``."""
        input_cfg = cfg.INPUT

        augs = [
            T.ResizeShortestEdge(
                input_cfg.MIN_SIZE_TRAIN,
                input_cfg.MAX_SIZE_TRAIN,
                input_cfg.MIN_SIZE_TRAIN_SAMPLING,
            )
        ]
        crop_cfg = input_cfg.CROP
        if crop_cfg.ENABLED:
            augs.append(
                T.RandomCrop_CategoryAreaConstraint(
                    crop_cfg.TYPE,
                    crop_cfg.SIZE,
                    crop_cfg.SINGLE_CATEGORY_MAX_AREA,
                    cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
                )
            )
        if input_cfg.COLOR_AUG_SSD:
            augs.append(ColorAugSSDTransform(img_format=input_cfg.FORMAT))
        augs.append(T.RandomFlip())

        # The ignore label comes from the metadata of the first training set.
        train_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])

        return {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": input_cfg.FORMAT,
            "ignore_label": train_meta.ignore_label,
            "size_divisibility": input_cfg.SIZE_DIVISIBILITY,
        }

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!"

        sample = copy.deepcopy(dataset_dict)  # the caller's dict must stay untouched
        image = utils.read_image(sample["file_name"], format=self.img_format)
        utils.check_image_size(sample, image)

        if "sem_seg_file_name" not in sample:
            raise ValueError(
                "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
                    sample["file_name"]
                )
            )
        # PyTorch transformation not implemented for uint16, so converting it to double first
        label_map = utils.read_image(sample.pop("sem_seg_file_name")).astype("double")

        aug_input = T.AugInput(image, sem_seg=label_map)
        aug_input, _ = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image
        label_map = aug_input.sem_seg
        has_labels = label_map is not None

        # Convert to tensors before padding.
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        if has_labels:
            label_map = torch.as_tensor(label_map.astype("long"))

        target = self.size_divisibility
        if target > 0:
            height, width = image.shape[-2], image.shape[-1]
            # (left, right, top, bottom): pad right and bottom only.
            pad = [0, target - width, 0, target - height]
            image = F.pad(image, pad, value=128).contiguous()
            if has_labels:
                label_map = F.pad(label_map, pad, value=self.ignore_label).contiguous()

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Tensors travel through the dataloader's shared memory far more
        # cheaply than generic Python objects, so store tensors in the dict.
        sample["image"] = image
        if has_labels:
            sample["sem_seg"] = label_map.long()

        if "annotations" in sample:
            raise ValueError("Semantic segmentation dataset should not have 'annotations'.")

        if not has_labels:
            return sample

        # One binary mask per class present, ignoring the ignore label.
        label_np = label_map.numpy()
        instances = Instances(image_shape)
        present = np.unique(label_np)
        present = present[present != self.ignore_label]
        instances.gt_classes = torch.tensor(present, dtype=torch.int64)

        class_masks = [label_np == class_id for class_id in present]
        if class_masks:
            stacked = torch.stack(
                [torch.from_numpy(np.ascontiguousarray(m.copy())) for m in class_masks]
            )
            instances.gt_masks = BitMasks(stacked).tensor
        else:
            # Some image does not have annotation (all ignored)
            instances.gt_masks = torch.zeros((0, label_np.shape[-2], label_np.shape[-1]))

        sample["instances"] = instances
        return sample
================================================
FILE: mfvis_nococo/mask2former/data/datasets/__init__.py
================================================
from . import (
register_ade20k_full,
register_ade20k_panoptic,
register_coco_stuff_10k,
register_mapillary_vistas,
register_coco_panoptic_annos_semseg,
register_ade20k_instance,
register_mapillary_vistas_panoptic,
)
================================================
FILE: mfvis_nococo/mask2former/data/datasets/register_ade20k_full.py
================================================
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import load_sem_seg
ADE20K_SEM_SEG_FULL_CATEGORIES = [
{"name": "wall", "id": 2978, "trainId": 0},
{"name": "building, edifice", "id": 312, "trainId": 1},
{"name": "sky", "id": 2420, "trainId": 2},
{"name": "tree", "id": 2855, "trainId": 3},
{"name": "road, route", "id": 2131, "trainId": 4},
{"name": "floor, flooring", "id": 976, "trainId": 5},
{"name": "ceiling", "id": 447, "trainId": 6},
{"name": "bed", "id": 165, "trainId": 7},
{"name": "sidewalk, pavement", "id": 2377, "trainId": 8},
{"name": "earth, ground", "id": 838, "trainId": 9},
{"name": "cabinet", "id": 350, "trainId": 10},
{"name": "person, individual, someone, somebody, mortal, soul", "id": 1831, "trainId": 11},
{"name": "grass", "id": 1125, "trainId": 12},
{"name": "windowpane, window", "id": 3055, "trainId": 13},
{"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14},
{"name": "mountain, mount", "id": 1610, "trainId": 15},
{"name": "plant, flora, plant life", "id": 1910, "trainId": 16},
{"name": "table", "id": 2684, "trainId": 17},
{"name": "chair", "id": 471, "trainId": 18},
{"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19},
{"name": "door", "id": 774, "trainId": 20},
{"name": "sofa, couch, lounge", "id": 2473, "trainId": 21},
{"name": "sea", "id": 2264, "trainId": 22},
{"name": "painting, picture", "id": 1735, "trainId": 23},
{"name": "water", "id": 2994, "trainId": 24},
{"name": "mirror", "id": 1564, "trainId": 25},
{"name": "house", "id": 1276, "trainId": 26},
{"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27},
{"name": "shelf", "id": 2329, "trainId": 28},
{"name": "armchair", "id": 57, "trainId": 29},
{"name": "fence, fencing", "id": 907, "trainId": 30},
{"name": "field", "id": 913, "trainId": 31},
{"name": "lamp", "id": 1395, "trainId": 32},
{"name": "rock, stone", "id": 2138, "trainId": 33},
{"name": "seat", "id": 2272, "trainId": 34},
{"name": "river", "id": 2128, "trainId": 35},
{"name": "desk", "id": 724, "trainId": 36},
{"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37},
{"name": "railing, rail", "id": 2053, "trainId": 38},
{"name": "signboard, sign", "id": 2380, "trainId": 39},
{"name": "cushion", "id": 689, "trainId": 40},
{"name": "path", "id": 1788, "trainId": 41},
{"name": "work surface", "id": 3087, "trainId": 42},
{"name": "stairs, steps", "id": 2530, "trainId": 43},
{"name": "column, pillar", "id": 581, "trainId": 44},
{"name": "sink", "id": 2388, "trainId": 45},
{"name": "wardrobe, closet, press", "id": 2985, "trainId": 46},
{"name": "snow", "id": 2454, "trainId": 47},
{"name": "refrigerator, icebox", "id": 2096, "trainId": 48},
{"name": "base, pedestal, stand", "id": 137, "trainId": 49},
{"name": "bridge, span", "id": 294, "trainId": 50},
{"name": "blind, screen", "id": 212, "trainId": 51},
{"name": "runway", "id": 2185, "trainId": 52},
{"name": "cliff, drop, drop-off", "id": 524, "trainId": 53},
{"name": "sand", "id": 2212, "trainId": 54},
{"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55},
{"name": "pillow", "id": 1869, "trainId": 56},
{"name": "screen door, screen", "id": 2251, "trainId": 57},
{"name": "toilet, can, commode, crapper, pot, potty, stool, throne", "id": 2793, "trainId": 58},
{"name": "skyscraper", "id": 2423, "trainId": 59},
{"name": "grandstand, covered stand", "id": 1121, "trainId": 60},
{"name": "box", "id": 266, "trainId": 61},
{"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62},
{"name": "palm, palm tree", "id": 1744, "trainId": 63},
{"name": "double door", "id": 783, "trainId": 64},
{"name": "coffee table, cocktail table", "id": 571, "trainId": 65},
{"name": "counter", "id": 627, "trainId": 66},
{"name": "countertop", "id": 629, "trainId": 67},
{"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68},
{"name": "kitchen island", "id": 1374, "trainId": 69},
{"name": "boat", "id": 223, "trainId": 70},
{"name": "waterfall, falls", "id": 3016, "trainId": 71},
{
"name": "stove, kitchen stove, range, kitchen range, cooking stove",
"id": 2598,
"trainId": 72,
},
{"name": "flower", "id": 978, "trainId": 73},
{"name": "bookcase", "id": 239, "trainId": 74},
{"name": "controls", "id": 608, "trainId": 75},
{"name": "book", "id": 236, "trainId": 76},
{"name": "stairway, staircase", "id": 2531, "trainId": 77},
{"name": "streetlight, street lamp", "id": 2616, "trainId": 78},
{
"name": "computer, computing machine, computing device, data processor, electronic computer, information processing system",
"id": 591,
"trainId": 79,
},
{
"name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle",
"id": 327,
"trainId": 80,
},
{"name": "swivel chair", "id": 2679, "trainId": 81},
{"name": "light, light source", "id": 1451, "trainId": 82},
{"name": "bench", "id": 181, "trainId": 83},
{"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84},
{"name": "towel", "id": 2821, "trainId": 85},
{"name": "fountain", "id": 1023, "trainId": 86},
{"name": "embankment", "id": 855, "trainId": 87},
{
"name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box",
"id": 2733,
"trainId": 88,
},
{"name": "van", "id": 2928, "trainId": 89},
{"name": "hill", "id": 1240, "trainId": 90},
{"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91},
{"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92},
{"name": "truck, motortruck", "id": 2880, "trainId": 93},
{"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94},
{"name": "pole", "id": 1936, "trainId": 95},
{"name": "tower", "id": 2828, "trainId": 96},
{"name": "court", "id": 631, "trainId": 97},
{"name": "ball", "id": 103, "trainId": 98},
{
"name": "aircraft carrier, carrier, flattop, attack aircraft carrier",
"id": 3144,
"trainId": 99,
},
{"name": "buffet, counter, sideboard", "id": 308, "trainId": 100},
{"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101},
{"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102},
{"name": "minibike, motorbike", "id": 1563, "trainId": 103},
{"name": "animal, animate being, beast, brute, creature, fauna", "id": 29, "trainId": 104},
{"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105},
{"name": "step, stair", "id": 2569, "trainId": 106},
{"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107},
{"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108},
{"name": "doorframe, doorcase", "id": 778, "trainId": 109},
{"name": "sconce", "id": 2243, "trainId": 110},
{"name": "pond", "id": 1941, "trainId": 111},
{"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112},
{"name": "bannister, banister, balustrade, balusters, handrail", "id": 120, "trainId": 113},
{"name": "bag", "id": 95, "trainId": 114},
{"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115},
{"name": "gazebo", "id": 1087, "trainId": 116},
{"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117},
{"name": "land, ground, soil", "id": 1401, "trainId": 118},
{"name": "board, plank", "id": 220, "trainId": 119},
{"name": "arcade machine", "id": 47, "trainId": 120},
{"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121},
{"name": "bar", "id": 123, "trainId": 122},
{"name": "stall, stand, sales booth", "id": 2537, "trainId": 123},
{"name": "playground", "id": 1927, "trainId": 124},
{"name": "ship", "id": 2337, "trainId": 125},
{"name": "ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126},
{
"name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
"id": 64,
"trainId": 127,
},
{"name": "bottle", "id": 249, "trainId": 128},
{"name": "cradle", "id": 642, "trainId": 129},
{"name": "pot, flowerpot", "id": 1981, "trainId": 130},
{
"name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
"id": 609,
"trainId": 131,
},
{"name": "train, railroad train", "id": 2840, "trainId": 132},
{"name": "stool", "id": 2586, "trainId": 133},
{"name": "lake", "id": 1393, "trainId": 134},
{"name": "tank, storage tank", "id": 2704, "trainId": 135},
{"name": "ice, water ice", "id": 1304, "trainId": 136},
{"name": "basket, handbasket", "id": 146, "trainId": 137},
{"name": "manhole", "id": 1494, "trainId": 138},
{"name": "tent, collapsible shelter", "id": 2739, "trainId": 139},
{"name": "canopy", "id": 389, "trainId": 140},
{"name": "microwave, microwave oven", "id": 1551, "trainId": 141},
{"name": "barrel, cask", "id": 131, "trainId": 142},
{"name": "dirt track", "id": 738, "trainId": 143},
{"name": "beam", "id": 161, "trainId": 144},
{"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145},
{"name": "plate", "id": 1919, "trainId": 146},
{"name": "screen, crt screen", "id": 3109, "trainId": 147},
{"name": "ruins", "id": 2179, "trainId": 148},
{"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149},
{"name": "blanket, cover", "id": 206, "trainId": 150},
{"name": "plaything, toy", "id": 1930, "trainId": 151},
{"name": "food, solid food", "id": 1002, "trainId": 152},
{"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153},
{"name": "oven", "id": 1708, "trainId": 154},
{"name": "stage", "id": 2526, "trainId": 155},
{"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156},
{"name": "umbrella", "id": 2901, "trainId": 157},
{"name": "sculpture", "id": 2262, "trainId": 158},
{"name": "aqueduct", "id": 44, "trainId": 159},
{"name": "container", "id": 597, "trainId": 160},
{"name": "scaffolding, staging", "id": 2235, "trainId": 161},
{"name": "hood, exhaust hood", "id": 1260, "trainId": 162},
{"name": "curb, curbing, kerb", "id": 682, "trainId": 163},
{"name": "roller coaster", "id": 2151, "trainId": 164},
{"name": "horse, equus caballus", "id": 3107, "trainId": 165},
{"name": "catwalk", "id": 432, "trainId": 166},
{"name": "glass, drinking glass", "id": 1098, "trainId": 167},
{"name": "vase", "id": 2932, "trainId": 168},
{"name": "central reservation", "id": 461, "trainId": 169},
{"name": "carousel", "id": 410, "trainId": 170},
{"name": "radiator", "id": 2046, "trainId": 171},
{"name": "closet", "id": 533, "trainId": 172},
{"name": "machine", "id": 1481, "trainId": 173},
{"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174},
{"name": "fan", "id": 894, "trainId": 175},
{"name": "inflatable bounce game", "id": 1322, "trainId": 176},
{"name": "pitch", "id": 1891, "trainId": 177},
{"name": "paper", "id": 1756, "trainId": 178},
{"name": "arcade, colonnade", "id": 49, "trainId": 179},
{"name": "hot tub", "id": 1272, "trainId": 180},
{"name": "helicopter", "id": 1229, "trainId": 181},
{"name": "tray", "id": 2850, "trainId": 182},
{"name": "partition, divider", "id": 1784, "trainId": 183},
{"name": "vineyard", "id": 2962, "trainId": 184},
{"name": "bowl", "id": 259, "trainId": 185},
{"name": "bullring", "id": 319, "trainId": 186},
{"name": "flag", "id": 954, "trainId": 187},
{"name": "pot", "id": 1974, "trainId": 188},
{"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189},
{"name": "shower", "id": 2356, "trainId": 190},
{"name": "bag, traveling bag, travelling bag, grip, suitcase", "id": 97, "trainId": 191},
{"name": "bulletin board, notice board", "id": 318, "trainId": 192},
{"name": "confessional booth", "id": 592, "trainId": 193},
{"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194},
{"name": "forest", "id": 1017, "trainId": 195},
{"name": "elevator door", "id": 851, "trainId": 196},
{"name": "laptop, laptop computer", "id": 1407, "trainId": 197},
{"name": "instrument panel", "id": 1332, "trainId": 198},
{"name": "bucket, pail", "id": 303, "trainId": 199},
{"name": "tapestry, tapis", "id": 2714, "trainId": 200},
{"name": "platform", "id": 1924, "trainId": 201},
{"name": "jacket", "id": 1346, "trainId": 202},
{"name": "gate", "id": 1081, "trainId": 203},
{"name": "monitor, monitoring device", "id": 1583, "trainId": 204},
{
"name": "telephone booth, phone booth, call box, telephone box, telephone kiosk",
"id": 2727,
"trainId": 205,
},
{"name": "spotlight, spot", "id": 2509, "trainId": 206},
{"name": "ring", "id": 2123, "trainId": 207},
{"name": "control panel", "id": 602, "trainId": 208},
{"name": "blackboard, chalkboard", "id": 202, "trainId": 209},
{"name": "air conditioner, air conditioning", "id": 10, "trainId": 210},
{"name": "chest", "id": 490, "trainId": 211},
{"name": "clock", "id": 530, "trainId": 212},
{"name": "sand dune", "id": 2213, "trainId": 213},
{"name": "pipe, pipage, piping", "id": 1884, "trainId": 214},
{"name": "vault", "id": 2934, "trainId": 215},
{"name": "table football", "id": 2687, "trainId": 216},
{"name": "cannon", "id": 387, "trainId": 217},
{"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218},
{"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219},
{"name": "statue", "id": 2547, "trainId": 220},
{
"name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
"id": 1474,
"trainId": 221,
},
{"name": "exhibitor", "id": 877, "trainId": 222},
{"name": "ladder", "id": 1391, "trainId": 223},
{"name": "carport", "id": 414, "trainId": 224},
{"name": "dam", "id": 698, "trainId": 225},
{"name": "pulpit", "id": 2019, "trainId": 226},
{"name": "skylight, fanlight", "id": 2422, "trainId": 227},
{"name": "water tower", "id": 3010, "trainId": 228},
{"name": "grill, grille, grillwork", "id": 1139, "trainId": 229},
{"name": "display board", "id": 753, "trainId": 230},
{"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231},
{"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232},
{"name": "ice rink", "id": 1301, "trainId": 233},
{"name": "fruit", "id": 1033, "trainId": 234},
{"name": "patio", "id": 1789, "trainId": 235},
{"name": "vending machine", "id": 2939, "trainId": 236},
{"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237},
{"name": "net", "id": 1652, "trainId": 238},
{
"name": "backpack, back pack, knapsack, packsack, rucksack, haversack",
"id": 90,
"trainId": 239,
},
{"name": "jar", "id": 1349, "trainId": 240},
{"name": "track", "id": 2830, "trainId": 241},
{"name": "magazine", "id": 1485, "trainId": 242},
{"name": "shutter", "id": 2370, "trainId": 243},
{"name": "roof", "id": 2155, "trainId": 244},
{"name": "banner, streamer", "id": 118, "trainId": 245},
{"name": "landfill", "id": 1402, "trainId": 246},
{"name": "post", "id": 1957, "trainId": 247},
{"name": "altarpiece, reredos", "id": 3130, "trainId": 248},
{"name": "hat, chapeau, lid", "id": 1197, "trainId": 249},
{"name": "arch, archway", "id": 52, "trainId": 250},
{"name": "table game", "id": 2688, "trainId": 251},
{"name": "bag, handbag, pocketbook, purse", "id": 96, "trainId": 252},
{"name": "document, written document, papers", "id": 762, "trainId": 253},
{"name": "dome", "id": 772, "trainId": 254},
{"name": "pier", "id": 1857, "trainId": 255},
{"name": "shanties", "id": 2315, "trainId": 256},
{"name": "forecourt", "id": 1016, "trainId": 257},
{"name": "crane", "id": 643, "trainId": 258},
{"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259},
{"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260},
{"name": "drawing", "id": 791, "trainId": 261},
{"name": "cabin", "id": 349, "trainId": 262},
{
"name": "ad, advertisement, advertizement, advertising, advertizing, advert",
"id": 6,
"trainId": 263,
},
{"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264},
{"name": "monument", "id": 1587, "trainId": 265},
{"name": "henhouse", "id": 1233, "trainId": 266},
{"name": "cockpit", "id": 559, "trainId": 267},
{"name": "heater, warmer", "id": 1223, "trainId": 268},
{"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269},
{"name": "pool", "id": 1943, "trainId": 270},
{"name": "elevator, lift", "id": 853, "trainId": 271},
{"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272},
{"name": "labyrinth", "id": 1390, "trainId": 273},
{"name": "text, textual matter", "id": 2748, "trainId": 274},
{"name": "printer", "id": 2007, "trainId": 275},
{"name": "mezzanine, first balcony", "id": 1546, "trainId": 276},
{"name": "mattress", "id": 1513, "trainId": 277},
{"name": "straw", "id": 2600, "trainId": 278},
{"name": "stalls", "id": 2538, "trainId": 279},
{"name": "patio, terrace", "id": 1790, "trainId": 280},
{"name": "billboard, hoarding", "id": 194, "trainId": 281},
{"name": "bus stop", "id": 326, "trainId": 282},
{"name": "trouser, pant", "id": 2877, "trainId": 283},
{"name": "console table, console", "id": 594, "trainId": 284},
{"name": "rack", "id": 2036, "trainId": 285},
{"name": "notebook", "id": 1662, "trainId": 286},
{"name": "shrine", "id": 2366, "trainId": 287},
{"name": "pantry", "id": 1754, "trainId": 288},
{"name": "cart", "id": 418, "trainId": 289},
{"name": "steam shovel", "id": 2553, "trainId": 290},
{"name": "porch", "id": 1951, "trainId": 291},
{"name": "postbox, mailbox, letter box", "id": 1963, "trainId": 292},
{"name": "figurine, statuette", "id": 918, "trainId": 293},
{"name": "recycling bin", "id": 2086, "trainId": 294},
{"name": "folding screen", "id": 997, "trainId": 295},
{"name": "telescope", "id": 2731, "trainId": 296},
{"name": "deck chair, beach chair", "id": 704, "trainId": 297},
{"name": "kennel", "id": 1365, "trainId": 298},
{"name": "coffee maker", "id": 569, "trainId": 299},
{"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300},
{"name": "fish", "id": 948, "trainId": 301},
{"name": "easel", "id": 839, "trainId": 302},
{"name": "artificial golf green", "id": 63, "trainId": 303},
{"name": "iceberg", "id": 1305, "trainId": 304},
{"name": "candlestick, candle holder", "id": 378, "trainId": 305},
{"name": "shower stall, shower bath", "id": 2362, "trainId": 306},
{"name": "television stand", "id": 2734, "trainId": 307},
{
"name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle",
"id": 2982,
"trainId": 308,
},
{"name": "skeleton", "id": 2398, "trainId": 309},
{"name": "grand piano, grand", "id": 1119, "trainId": 310},
{"name": "candy, confect", "id": 382, "trainId": 311},
{"name": "grille door", "id": 1141, "trainId": 312},
{"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313},
{"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314},
{"name": "shoe", "id": 2341, "trainId": 315},
{"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316},
{"name": "shanty", "id": 2316, "trainId": 317},
{"name": "structure", "id": 2626, "trainId": 318},
{"name": "rocking chair, rocker", "id": 3104, "trainId": 319},
{"name": "bird", "id": 198, "trainId": 320},
{"name": "place mat", "id": 1896, "trainId": 321},
{"name": "tomb", "id": 2800, "trainId": 322},
{"name": "big top", "id": 190, "trainId": 323},
{"name": "gas pump, gasoline pump, petrol pump, island dispenser", "id": 3131, "trainId": 324},
{"name": "lockers", "id": 1463, "trainId": 325},
{"name": "cage", "id": 357, "trainId": 326},
{"name": "finger", "id": 929, "trainId": 327},
{"name": "bleachers", "id": 209, "trainId": 328},
{"name": "ferris wheel", "id": 912, "trainId": 329},
{"name": "hairdresser chair", "id": 1164, "trainId": 330},
{"name": "mat", "id": 1509, "trainId": 331},
{"name": "stands", "id": 2539, "trainId": 332},
{"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333},
{"name": "streetcar, tram, tramcar, trolley, trolley car", "id": 2615, "trainId": 334},
{"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335},
{"name": "dummy", "id": 818, "trainId": 336},
{"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337},
{"name": "sand trap", "id": 2217, "trainId": 338},
{"name": "shop, store", "id": 2347, "trainId": 339},
{"name": "table cloth", "id": 2686, "trainId": 340},
{"name": "service station", "id": 2300, "trainId": 341},
{"name": "coffin", "id": 572, "trainId": 342},
{"name": "drawer", "id": 789, "trainId": 343},
{"name": "cages", "id": 358, "trainId": 344},
{"name": "slot machine, coin machine", "id": 2443, "trainId": 345},
{"name": "balcony", "id": 101, "trainId": 346},
{"name": "volleyball court", "id": 2969, "trainId": 347},
{"name": "table tennis", "id": 2692, "trainId": 348},
{"name": "control table", "id": 606, "trainId": 349},
{"name": "shirt", "id": 2339, "trainId": 350},
{"name": "merchandise, ware, product", "id": 1533, "trainId": 351},
{"name": "railway", "id": 2060, "trainId": 352},
{"name": "parterre", "id": 1782, "trainId": 353},
{"name": "chimney", "id": 495, "trainId": 354},
{"name": "can, tin, tin can", "id": 371, "trainId": 355},
{"name": "tanks", "id": 2707, "trainId": 356},
{"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357},
{"name": "alga, algae", "id": 3156, "trainId": 358},
{"name": "system", "id": 2683, "trainId": 359},
{"name": "map", "id": 1499, "trainId": 360},
{"name": "greenhouse", "id": 1135, "trainId": 361},
{"name": "mug", "id": 1619, "trainId": 362},
{"name": "barbecue", "id": 125, "trainId": 363},
{"name": "trailer", "id": 2838, "trainId": 364},
{"name": "toilet tissue, toilet paper, bathroom tissue", "id": 2792, "trainId": 365},
{"name": "organ", "id": 1695, "trainId": 366},
{"name": "dishrag, dishcloth", "id": 746, "trainId": 367},
{"name": "island", "id": 1343, "trainId": 368},
{"name": "keyboard", "id": 1370, "trainId": 369},
{"name": "trench", "id": 2858, "trainId": 370},
{"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371},
{"name": "steering wheel, wheel", "id": 2565, "trainId": 372},
{"name": "pitcher, ewer", "id": 1892, "trainId": 373},
{"name": "goal", "id": 1103, "trainId": 374},
{"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375},
{"name": "beds", "id": 170, "trainId": 376},
{"name": "wood", "id": 3073, "trainId": 377},
{"name": "file cabinet", "id": 922, "trainId": 378},
{"name": "newspaper, paper", "id": 1655, "trainId": 379},
{"name": "motorboat", "id": 1602, "trainId": 380},
{"name": "rope", "id": 2160, "trainId": 381},
{"name": "guitar", "id": 1151, "trainId": 382},
{"name": "rubble", "id": 2176, "trainId": 383},
{"name": "scarf", "id": 2239, "trainId": 384},
{"name": "barrels", "id": 132, "trainId": 385},
{"name": "cap", "id": 394, "trainId": 386},
{"name": "leaves", "id": 1424, "trainId": 387},
{"name": "control tower", "id": 607, "trainId": 388},
{"name": "dashboard", "id": 700, "trainId": 389},
{"name": "bandstand", "id": 116, "trainId": 390},
{"name": "lectern", "id": 1425, "trainId": 391},
{"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392},
{"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393},
{"name": "shower room", "id": 2360, "trainId": 394},
{"name": "smoke", "id": 2449, "trainId": 395},
{"name": "faucet, spigot", "id": 897, "trainId": 396},
{"name": "bulldozer", "id": 317, "trainId": 397},
{"name": "saucepan", "id": 2228, "trainId": 398},
{"name": "shops", "id": 2351, "trainId": 399},
{"name": "meter", "id": 1543, "trainId": 400},
{"name": "crevasse", "id": 656, "trainId": 401},
{"name": "gear", "id": 1088, "trainId": 402},
{"name": "candelabrum, candelabra", "id": 373, "trainId": 403},
{"name": "sofa bed", "id": 2472, "trainId": 404},
{"name": "tunnel", "id": 2892, "trainId": 405},
{"name": "pallet", "id": 1740, "trainId": 406},
{"name": "wire, conducting wire", "id": 3067, "trainId": 407},
{"name": "kettle, boiler", "id": 1367, "trainId": 408},
{"name": "bidet", "id": 188, "trainId": 409},
{
"name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher",
"id": 79,
"trainId": 410,
},
{"name": "music stand", "id": 1633, "trainId": 411},
{"name": "pipe, tube", "id": 1885, "trainId": 412},
{"name": "cup", "id": 677, "trainId": 413},
{"name": "parking meter", "id": 1779, "trainId": 414},
{"name": "ice hockey rink", "id": 1297, "trainId": 415},
{"name": "shelter", "id": 2334, "trainId": 416},
{"name": "weeds", "id": 3027, "trainId": 417},
{"name": "temple", "id": 2735, "trainId": 418},
{"name": "patty, cake", "id": 1791, "trainId": 419},
{"name": "ski slope", "id": 2405, "trainId": 420},
{"name": "panel", "id": 1748, "trainId": 421},
{"name": "wallet", "id": 2983, "trainId": 422},
{"name": "wheel", "id": 3035, "trainId": 423},
{"name": "towel rack, towel horse", "id": 2824, "trainId": 424},
{"name": "roundabout", "id": 2168, "trainId": 425},
{"name": "canister, cannister, tin", "id": 385, "trainId": 426},
{"name": "rod", "id": 2148, "trainId": 427},
{"name": "soap dispenser", "id": 2465, "trainId": 428},
{"name": "bell", "id": 175, "trainId": 429},
{"name": "canvas", "id": 390, "trainId": 430},
{"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431},
{"name": "teacup", "id": 2722, "trainId": 432},
{"name": "trellis", "id": 2857, "trainId": 433},
{"name": "workbench", "id": 3088, "trainId": 434},
{"name": "valley, vale", "id": 2926, "trainId": 435},
{"name": "toaster", "id": 2782, "trainId": 436},
{"name": "knife", "id": 1378, "trainId": 437},
{"name": "podium", "id": 1934, "trainId": 438},
{"name": "ramp", "id": 2072, "trainId": 439},
{"name": "tumble dryer", "id": 2889, "trainId": 440},
{"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441},
{"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442},
{"name": "lab bench", "id": 1383, "trainId": 443},
{"name": "equipment", "id": 867, "trainId": 444},
{"name": "rocky formation", "id": 2145, "trainId": 445},
{"name": "plastic", "id": 1915, "trainId": 446},
{"name": "calendar", "id": 361, "trainId": 447},
{"name": "caravan", "id": 402, "trainId": 448},
{"name": "check-in-desk", "id": 482, "trainId": 449},
{"name": "ticket counter", "id": 2761, "trainId": 450},
{"name": "brush", "id": 300, "trainId": 451},
{"name": "mill", "id": 1554, "trainId": 452},
{"name": "covered bridge", "id": 636, "trainId": 453},
{"name": "bowling alley", "id": 260, "trainId": 454},
{"name": "hanger", "id": 1186, "trainId": 455},
{"name": "excavator", "id": 871, "trainId": 456},
{"name": "trestle", "id": 2859, "trainId": 457},
{"name": "revolving door", "id": 2103, "trainId": 458},
{"name": "blast furnace", "id": 208, "trainId": 459},
{"name": "scale, weighing machine", "id": 2236, "trainId": 460},
{"name": "projector", "id": 2012, "trainId": 461},
{"name": "soap", "id": 2462, "trainId": 462},
{"name": "locker", "id": 1462, "trainId": 463},
{"name": "tractor", "id": 2832, "trainId": 464},
{"name": "stretcher", "id": 2617, "trainId": 465},
{"name": "frame", "id": 1024, "trainId": 466},
{"name": "grating", "id": 1129, "trainId": 467},
{"name": "alembic", "id": 18, "trainId": 468},
{"name": "candle, taper, wax light", "id": 376, "trainId": 469},
{"name": "barrier", "id": 134, "trainId": 470},
{"name": "cardboard", "id": 407, "trainId": 471},
{"name": "cave", "id": 434, "trainId": 472},
{"name": "puddle", "id": 2017, "trainId": 473},
{"name": "tarp", "id": 2717, "trainId": 474},
{"name": "price tag", "id": 2005, "trainId": 475},
{"name": "watchtower", "id": 2993, "trainId": 476},
{"name": "meters", "id": 1545, "trainId": 477},
{
"name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb",
"id": 1445,
"trainId": 478,
},
{"name": "tracks", "id": 2831, "trainId": 479},
{"name": "hair dryer", "id": 1161, "trainId": 480},
{"name": "skirt", "id": 2411, "trainId": 481},
{"name": "viaduct", "id": 2949, "trainId": 482},
{"name": "paper towel", "id": 1769, "trainId": 483},
{"name": "coat", "id": 552, "trainId": 484},
{"name": "sheet", "id": 2327, "trainId": 485},
{"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486},
{"name": "water wheel", "id": 3013, "trainId": 487},
{"name": "pottery, clayware", "id": 1986, "trainId": 488},
{"name": "magazine rack", "id": 1486, "trainId": 489},
{"name": "teapot", "id": 2723, "trainId": 490},
{"name": "microphone, mike", "id": 1549, "trainId": 491},
{"name": "support", "id": 2649, "trainId": 492},
{"name": "forklift", "id": 1020, "trainId": 493},
{"name": "canyon", "id": 392, "trainId": 494},
{"name": "cash register, register", "id": 422, "trainId": 495},
{"name": "leaf, leafage, foliage", "id": 1419, "trainId": 496},
{"name": "remote control, remote", "id": 2099, "trainId": 497},
{"name": "soap dish", "id": 2464, "trainId": 498},
{"name": "windshield, windscreen", "id": 3058, "trainId": 499},
{"name": "cat", "id": 430, "trainId": 500},
{"name": "cue, cue stick, pool cue, pool stick", "id": 675, "trainId": 501},
{"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502},
{"name": "videos", "id": 2955, "trainId": 503},
{"name": "shovel", "id": 2355, "trainId": 504},
{"name": "eaves", "id": 840, "trainId": 505},
{"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506},
{"name": "shipyard", "id": 2338, "trainId": 507},
{"name": "hen, biddy", "id": 1232, "trainId": 508},
{"name": "traffic cone", "id": 2834, "trainId": 509},
{"name": "washing machines", "id": 2991, "trainId": 510},
{"name": "truck crane", "id": 2879, "trainId": 511},
{"name": "cds", "id": 444, "trainId": 512},
{"name": "niche", "id": 1657, "trainId": 513},
{"name": "scoreboard", "id": 2246, "trainId": 514},
{"name": "briefcase", "id": 296, "trainId": 515},
{"name": "boot", "id": 245, "trainId": 516},
{"name": "sweater, jumper", "id": 2661, "trainId": 517},
{"name": "hay", "id": 1202, "trainId": 518},
{"name": "pack", "id": 1714, "trainId": 519},
{"name": "bottle rack", "id": 251, "trainId": 520},
{"name": "glacier", "id": 1095, "trainId": 521},
{"name": "pergola", "id": 1828, "trainId": 522},
{"name": "building materials", "id": 311, "trainId": 523},
{"name": "television camera", "id": 2732, "trainId": 524},
{"name": "first floor", "id": 947, "trainId": 525},
{"name": "rifle", "id": 2115, "trainId": 526},
{"name": "tennis table", "id": 2738, "trainId": 527},
{"name": "stadium", "id": 2525, "trainId": 528},
{"name": "safety belt", "id": 2194, "trainId": 529},
{"name": "cover", "id": 634, "trainId": 530},
{"name": "dish rack", "id": 740, "trainId": 531},
{"name": "synthesizer", "id": 2682, "trainId": 532},
{"name": "pumpkin", "id": 2020, "trainId": 533},
{"name": "gutter", "id": 1156, "trainId": 534},
{"name": "fruit stand", "id": 1036, "trainId": 535},
{"name": "ice floe, floe", "id": 1295, "trainId": 536},
{"name": "handle, grip, handgrip, hold", "id": 1181, "trainId": 537},
{"name": "wheelchair", "id": 3037, "trainId": 538},
{"name": "mousepad, mouse mat", "id": 1614, "trainId": 539},
{"name": "diploma", "id": 736, "trainId": 540},
{"name": "fairground ride", "id": 893, "trainId": 541},
{"name": "radio", "id": 2047, "trainId": 542},
{"name": "hotplate", "id": 1274, "trainId": 543},
{"name": "junk", "id": 1361, "trainId": 544},
{"name": "wheelbarrow", "id": 3036, "trainId": 545},
{"name": "stream", "id": 2606, "trainId": 546},
{"name": "toll plaza", "id": 2797, "trainId": 547},
{"name": "punching bag", "id": 2022, "trainId": 548},
{"name": "trough", "id": 2876, "trainId": 549},
{"name": "throne", "id": 2758, "trainId": 550},
{"name": "chair desk", "id": 472, "trainId": 551},
{"name": "weighbridge", "id": 3028, "trainId": 552},
{"name": "extractor fan", "id": 882, "trainId": 553},
{"name": "hanging clothes", "id": 1189, "trainId": 554},
{"name": "dish, dish aerial, dish antenna, saucer", "id": 743, "trainId": 555},
{"name": "alarm clock, alarm", "id": 3122, "trainId": 556},
{"name": "ski lift", "id": 2401, "trainId": 557},
{"name": "chain", "id": 468, "trainId": 558},
{"name": "garage", "id": 1061, "trainId": 559},
{"name": "mechanical shovel", "id": 1523, "trainId": 560},
{"name": "wine rack", "id": 3059, "trainId": 561},
{"name": "tramway", "id": 2843, "trainId": 562},
{"name": "treadmill", "id": 2853, "trainId": 563},
{"name": "menu", "id": 1529, "trainId": 564},
{"name": "block", "id": 214, "trainId": 565},
{"name": "well", "id": 3032, "trainId": 566},
{"name": "witness stand", "id": 3071, "trainId": 567},
{"name": "branch", "id": 277, "trainId": 568},
{"name": "duck", "id": 813, "trainId": 569},
{"name": "casserole", "id": 426, "trainId": 570},
{"name": "frying pan", "id": 1039, "trainId": 571},
{"name": "desk organizer", "id": 727, "trainId": 572},
{"name": "mast", "id": 1508, "trainId": 573},
{"name": "spectacles, specs, eyeglasses, glasses", "id": 2490, "trainId": 574},
{"name": "service elevator", "id": 2299, "trainId": 575},
{"name": "dollhouse", "id": 768, "trainId": 576},
{"name": "hammock", "id": 1172, "trainId": 577},
{"name": "clothes hanging", "id": 537, "trainId": 578},
{"name": "photocopier", "id": 1847, "trainId": 579},
{"name": "notepad", "id": 1664, "trainId": 580},
{"name": "golf cart", "id": 1110, "trainId": 581},
{"name": "footpath", "id": 1014, "trainId": 582},
{"name": "cross", "id": 662, "trainId": 583},
{"name": "baptismal font", "id": 121, "trainId": 584},
{"name": "boiler", "id": 227, "trainId": 585},
{"name": "skip", "id": 2410, "trainId": 586},
{"name": "rotisserie", "id": 2165, "trainId": 587},
{"name": "tables", "id": 2696, "trainId": 588},
{"name": "water mill", "id": 3005, "trainId": 589},
{"name": "helmet", "id": 1231, "trainId": 590},
{"name": "cover curtain", "id": 635, "trainId": 591},
{"name": "brick", "id": 292, "trainId": 592},
{"name": "table runner", "id": 2690, "trainId": 593},
{"name": "ashtray", "id": 65, "trainId": 594},
{"name": "street box", "id": 2607, "trainId": 595},
{"name": "stick", "id": 2574, "trainId": 596},
{"name": "hangers", "id": 1188, "trainId": 597},
{"name": "cells", "id": 456, "trainId": 598},
{"name": "urinal", "id": 2913, "trainId": 599},
{"name": "centerpiece", "id": 459, "trainId": 600},
{"name": "portable fridge", "id": 1955, "trainId": 601},
{"name": "dvds", "id": 827, "trainId": 602},
{"name": "golf club", "id": 1111, "trainId": 603},
{"name": "skirting board", "id": 2412, "trainId": 604},
{"name": "water cooler", "id": 2997, "trainId": 605},
{"name": "clipboard", "id": 528, "trainId": 606},
{"name": "camera, photographic camera", "id": 366, "trainId": 607},
{"name": "pigeonhole", "id": 1863, "trainId": 608},
{"name": "chips", "id": 500, "trainId": 609},
{"name": "food processor", "id": 1001, "trainId": 610},
{"name": "post box", "id": 1958, "trainId": 611},
{"name": "lid", "id": 1441, "trainId": 612},
{"name": "drum", "id": 809, "trainId": 613},
{"name": "blender", "id": 210, "trainId": 614},
{"name": "cave entrance", "id": 435, "trainId": 615},
{"name": "dental chair", "id": 718, "trainId": 616},
{"name": "obelisk", "id": 1674, "trainId": 617},
{"name": "canoe", "id": 388, "trainId": 618},
{"name": "mobile", "id": 1572, "trainId": 619},
{"name": "monitors", "id": 1584, "trainId": 620},
{"name": "pool ball", "id": 1944, "trainId": 621},
{"name": "cue rack", "id": 674, "trainId": 622},
{"name": "baggage carts", "id": 99, "trainId": 623},
{"name": "shore", "id": 2352, "trainId": 624},
{"name": "fork", "id": 1019, "trainId": 625},
{"name": "paper filer", "id": 1763, "trainId": 626},
{"name": "bicycle rack", "id": 185, "trainId": 627},
{"name": "coat rack", "id": 554, "trainId": 628},
{"name": "garland", "id": 1066, "trainId": 629},
{"name": "sports bag", "id": 2508, "trainId": 630},
{"name": "fish tank", "id": 951, "trainId": 631},
{"name": "towel dispenser", "id": 2822, "trainId": 632},
{"name": "carriage", "id": 415, "trainId": 633},
{"name": "brochure", "id": 297, "trainId": 634},
{"name": "plaque", "id": 1914, "trainId": 635},
{"name": "stringer", "id": 2619, "trainId": 636},
{"name": "iron", "id": 1338, "trainId": 637},
{"name": "spoon", "id": 2505, "trainId": 638},
{"name": "flag pole", "id": 955, "trainId": 639},
{"name": "toilet brush", "id": 2786, "trainId": 640},
{"name": "book stand", "id": 238, "trainId": 641},
{"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642},
{"name": "ticket office", "id": 2763, "trainId": 643},
{"name": "broom", "id": 299, "trainId": 644},
{"name": "dvd", "id": 822, "trainId": 645},
{"name": "ice bucket", "id": 1288, "trainId": 646},
{"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647},
{"name": "tureen", "id": 2894, "trainId": 648},
{"name": "folders", "id": 992, "trainId": 649},
{"name": "chess", "id": 489, "trainId": 650},
{"name": "root", "id": 2157, "trainId": 651},
{"name": "sewing machine", "id": 2309, "trainId": 652},
{"name": "model", "id": 1576, "trainId": 653},
{"name": "pen", "id": 1810, "trainId": 654},
{"name": "violin", "id": 2964, "trainId": 655},
{"name": "sweatshirt", "id": 2662, "trainId": 656},
{"name": "recycling materials", "id": 2087, "trainId": 657},
{"name": "mitten", "id": 1569, "trainId": 658},
{"name": "chopping board, cutting board", "id": 503, "trainId": 659},
{"name": "mask", "id": 1505, "trainId": 660},
{"name": "log", "id": 1468, "trainId": 661},
{"name": "mouse, computer mouse", "id": 1613, "trainId": 662},
{"name": "grill", "id": 1138, "trainId": 663},
{"name": "hole", "id": 1256, "trainId": 664},
{"name": "target", "id": 2715, "trainId": 665},
{"name": "trash bag", "id": 2846, "trainId": 666},
{"name": "chalk", "id": 477, "trainId": 667},
{"name": "sticks", "id": 2576, "trainId": 668},
{"name": "balloon", "id": 108, "trainId": 669},
{"name": "score", "id": 2245, "trainId": 670},
{"name": "hair spray", "id": 1162, "trainId": 671},
{"name": "roll", "id": 2149, "trainId": 672},
{"name": "runner", "id": 2183, "trainId": 673},
{"name": "engine", "id": 858, "trainId": 674},
{"name": "inflatable glove", "id": 1324, "trainId": 675},
{"name": "games", "id": 1055, "trainId": 676},
{"name": "pallets", "id": 1741, "trainId": 677},
{"name": "baskets", "id": 149, "trainId": 678},
{"name": "coop", "id": 615, "trainId": 679},
{"name": "dvd player", "id": 825, "trainId": 680},
{"name": "rocking horse", "id": 2143, "trainId": 681},
{"name": "buckets", "id": 304, "trainId": 682},
{"name": "bread rolls", "id": 283, "trainId": 683},
{"name": "shawl", "id": 2322, "trainId": 684},
{"name": "watering can", "id": 3017, "trainId": 685},
{"name": "spotlights", "id": 2510, "trainId": 686},
{"name": "post-it", "id": 1960, "trainId": 687},
{"name": "bowls", "id": 265, "trainId": 688},
{"name": "security camera", "id": 2282, "trainId": 689},
{"name": "runner cloth", "id": 2184, "trainId": 690},
{"name": "lock", "id": 1461, "trainId": 691},
{"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692},
{"name": "side", "id": 2372, "trainId": 693},
{"name": "roulette", "id": 2166, "trainId": 694},
{"name": "bone", "id": 232, "trainId": 695},
{"name": "cutlery", "id": 693, "trainId": 696},
{"name": "pool balls", "id": 1945, "trainId": 697},
{"name": "wheels", "id": 3039, "trainId": 698},
{"name": "spice rack", "id": 2494, "trainId": 699},
{"name": "plant pots", "id": 1908, "trainId": 700},
{"name": "towel ring", "id": 2827, "trainId": 701},
{"name": "bread box", "id": 280, "trainId": 702},
{"name": "video", "id": 2950, "trainId": 703},
{"name": "funfair", "id": 1044, "trainId": 704},
{"name": "breads", "id": 288, "trainId": 705},
{"name": "tripod", "id": 2863, "trainId": 706},
{"name": "ironing board", "id": 1342, "trainId": 707},
{"name": "skimmer", "id": 2409, "trainId": 708},
{"name": "hollow", "id": 1258, "trainId": 709},
{"name": "scratching post", "id": 2249, "trainId": 710},
{"name": "tricycle", "id": 2862, "trainId": 711},
{"name": "file box", "id": 920, "trainId": 712},
{"name": "mountain pass", "id": 1607, "trainId": 713},
{"name": "tombstones", "id": 2802, "trainId": 714},
{"name": "cooker", "id": 610, "trainId": 715},
{"name": "card game, cards", "id": 3129, "trainId": 716},
{"name": "golf bag", "id": 1108, "trainId": 717},
{"name": "towel paper", "id": 2823, "trainId": 718},
{"name": "chaise lounge", "id": 476, "trainId": 719},
{"name": "sun", "id": 2641, "trainId": 720},
{"name": "toilet paper holder", "id": 2788, "trainId": 721},
{"name": "rake", "id": 2070, "trainId": 722},
{"name": "key", "id": 1368, "trainId": 723},
{"name": "umbrella stand", "id": 2903, "trainId": 724},
{"name": "dartboard", "id": 699, "trainId": 725},
{"name": "transformer", "id": 2844, "trainId": 726},
{"name": "fireplace utensils", "id": 942, "trainId": 727},
{"name": "sweatshirts", "id": 2663, "trainId": 728},
{
"name": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
"id": 457,
"trainId": 729,
},
{"name": "tallboy", "id": 2701, "trainId": 730},
{"name": "stapler", "id": 2540, "trainId": 731},
{"name": "sauna", "id": 2231, "trainId": 732},
{"name": "test tube", "id": 2746, "trainId": 733},
{"name": "palette", "id": 1738, "trainId": 734},
{"name": "shopping carts", "id": 2350, "trainId": 735},
{"name": "tools", "id": 2808, "trainId": 736},
{"name": "push button, push, button", "id": 2025, "trainId": 737},
{"name": "star", "id": 2541, "trainId": 738},
{"name": "roof rack", "id": 2156, "trainId": 739},
{"name": "barbed wire", "id": 126, "trainId": 740},
{"name": "spray", "id": 2512, "trainId": 741},
{"name": "ear", "id": 831, "trainId": 742},
{"name": "sponge", "id": 2503, "trainId": 743},
{"name": "racket", "id": 2039, "trainId": 744},
{"name": "tins", "id": 2774, "trainId": 745},
{"name": "eyeglasses", "id": 886, "trainId": 746},
{"name": "file", "id": 919, "trainId": 747},
{"name": "scarfs", "id": 2240, "trainId": 748},
{"name": "sugar bowl", "id": 2636, "trainId": 749},
{"name": "flip flop", "id": 963, "trainId": 750},
{"name": "headstones", "id": 1218, "trainId": 751},
{"name": "laptop bag", "id": 1406, "trainId": 752},
{"name": "leash", "id": 1420, "trainId": 753},
{"name": "climbing frame", "id": 526, "trainId": 754},
{"name": "suit hanger", "id": 2639, "trainId": 755},
{"name": "floor spotlight", "id": 975, "trainId": 756},
{"name": "plate rack", "id": 1921, "trainId": 757},
{"name": "sewer", "id": 2305, "trainId": 758},
{"name": "hard drive", "id": 1193, "trainId": 759},
{"name": "sprinkler", "id": 2517, "trainId": 760},
{"name": "tools box", "id": 2809, "trainId": 761},
{"name": "necklace", "id": 1647, "trainId": 762},
{"name": "bulbs", "id": 314, "trainId": 763},
{"name": "steel industry", "id": 2560, "trainId": 764},
{"name": "club", "id": 545, "trainId": 765},
{"name": "jack", "id": 1345, "trainId": 766},
{"name": "door bars", "id": 775, "trainId": 767},
{
"name": "control panel, instrument panel, control board, board, panel",
"id": 603,
"trainId": 768,
},
{"name": "hairbrush", "id": 1163, "trainId": 769},
{"name": "napkin holder", "id": 1641, "trainId": 770},
{"name": "office", "id": 1678, "trainId": 771},
{"name": "smoke detector", "id": 2450, "trainId": 772},
{"name": "utensils", "id": 2915, "trainId": 773},
{"name": "apron", "id": 42, "trainId": 774},
{"name": "scissors", "id": 2242, "trainId": 775},
{"name": "terminal", "id": 2741, "trainId": 776},
{"name": "grinder", "id": 1143, "trainId": 777},
{"name": "entry phone", "id": 862, "trainId": 778},
{"name": "newspaper stand", "id": 1654, "trainId": 779},
{"name": "pepper shaker", "id": 1826, "trainId": 780},
{"name": "onions", "id": 1689, "trainId": 781},
{
"name": "central processing unit, cpu, c p u , central processor, processor, mainframe",
"id": 3124,
"trainId": 782,
},
{"name": "tape", "id": 2710, "trainId": 783},
{"name": "bat", "id": 152, "trainId": 784},
{"name": "coaster", "id": 549, "trainId": 785},
{"name": "calculator", "id": 360, "trainId": 786},
{"name": "potatoes", "id": 1982, "trainId": 787},
{"name": "luggage rack", "id": 1478, "trainId": 788},
{"name": "salt", "id": 2203, "trainId": 789},
{"name": "street number", "id": 2612, "trainId": 790},
{"name": "viewpoint", "id": 2956, "trainId": 791},
{"name": "sword", "id": 2681, "trainId": 792},
{"name": "cd", "id": 437, "trainId": 793},
{"name": "rowing machine", "id": 2171, "trainId": 794},
{"name": "plug", "id": 1933, "trainId": 795},
{"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796},
{"name": "pepper", "id": 1824, "trainId": 797},
{"name": "tongs", "id": 2803, "trainId": 798},
{"name": "bonfire", "id": 234, "trainId": 799},
{"name": "dog dish", "id": 764, "trainId": 800},
{"name": "belt", "id": 177, "trainId": 801},
{"name": "dumbbells", "id": 817, "trainId": 802},
{"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803},
{"name": "hook", "id": 1262, "trainId": 804},
{"name": "envelopes", "id": 864, "trainId": 805},
{"name": "shower faucet", "id": 2359, "trainId": 806},
{"name": "watch", "id": 2992, "trainId": 807},
{"name": "padlock", "id": 1725, "trainId": 808},
{"name": "swimming pool ladder", "id": 2667, "trainId": 809},
{"name": "spanners", "id": 2484, "trainId": 810},
{"name": "gravy boat", "id": 1133, "trainId": 811},
{"name": "notice board", "id": 1667, "trainId": 812},
{"name": "trash bags", "id": 2847, "trainId": 813},
{"name": "fire alarm", "id": 932, "trainId": 814},
{"name": "ladle", "id": 1392, "trainId": 815},
{"name": "stethoscope", "id": 2573, "trainId": 816},
{"name": "rocket", "id": 2140, "trainId": 817},
{"name": "funnel", "id": 1046, "trainId": 818},
{"name": "bowling pins", "id": 264, "trainId": 819},
{"name": "valve", "id": 2927, "trainId": 820},
{"name": "thermometer", "id": 2752, "trainId": 821},
{"name": "cups", "id": 679, "trainId": 822},
{"name": "spice jar", "id": 2493, "trainId": 823},
{"name": "night light", "id": 1658, "trainId": 824},
{"name": "soaps", "id": 2466, "trainId": 825},
{"name": "games table", "id": 1057, "trainId": 826},
{"name": "slotted spoon", "id": 2444, "trainId": 827},
{"name": "reel", "id": 2093, "trainId": 828},
{"name": "scourer", "id": 2248, "trainId": 829},
{"name": "sleeping robe", "id": 2432, "trainId": 830},
{"name": "desk mat", "id": 726, "trainId": 831},
{"name": "dumbbell", "id": 816, "trainId": 832},
{"name": "hammer", "id": 1171, "trainId": 833},
{"name": "tie", "id": 2766, "trainId": 834},
{"name": "typewriter", "id": 2900, "trainId": 835},
{"name": "shaker", "id": 2313, "trainId": 836},
{"name": "cheese dish", "id": 488, "trainId": 837},
{"name": "sea star", "id": 2265, "trainId": 838},
{"name": "racquet", "id": 2043, "trainId": 839},
{"name": "butane gas cylinder", "id": 332, "trainId": 840},
{"name": "paper weight", "id": 1771, "trainId": 841},
{"name": "shaving brush", "id": 2320, "trainId": 842},
{"name": "sunglasses", "id": 2646, "trainId": 843},
{"name": "gear shift", "id": 1089, "trainId": 844},
{"name": "towel rail", "id": 2826, "trainId": 845},
{"name": "adding machine, totalizer, totaliser", "id": 3148, "trainId": 846},
]
def _get_ade20k_full_meta():
    """Build the metadata dict for the ADE20K-Full semantic segmentation classes.

    Returns:
        dict: two entries derived from ``ADE20K_SEM_SEG_FULL_CATEGORIES``:
            ``"stuff_dataset_id_to_contiguous_id"`` maps every category ``"id"``
            to its position in that list, and ``"stuff_classes"`` lists the
            category names in the same order.

    Raises:
        AssertionError: if the category list does not contain exactly 847 entries.
    """
    # NOTE(review): an older comment here said ids are shifted by 1 because the
    # ignore label is moved from 0 to 255 in pre-processing; nothing in this
    # function performs such a shift -- confirm against the data preparation script.
    dataset_ids = []
    for category in ADE20K_SEM_SEG_FULL_CATEGORIES:
        dataset_ids.append(category["id"])

    num_ids = len(dataset_ids)
    assert num_ids == 847, num_ids

    # Contiguous ids are simply list positions, i.e. values in [0, 846].
    # They are what the model uses; the dataset ids are used when processing results.
    id_to_contiguous = dict(zip(dataset_ids, range(num_ids)))
    class_names = [category["name"] for category in ADE20K_SEM_SEG_FULL_CATEGORIES]

    return {
        "stuff_dataset_id_to_contiguous_id": id_to_contiguous,
        "stuff_classes": class_names,
    }
def register_all_ade20k_full(root):
    """Register the ADE20K-Full train and val semantic segmentation datasets.

    For each split this registers a loader in ``DatasetCatalog`` under
    ``ade20k_full_sem_seg_<split>`` and attaches class names, directories,
    evaluator type and ignore label to the matching ``MetadataCatalog`` entry.

    Args:
        root (str): directory that contains the ``ADE20K_2021_17_01`` folder.
    """
    dataset_root = os.path.join(root, "ADE20K_2021_17_01")
    meta = _get_ade20k_full_meta()

    splits = (("train", "training"), ("val", "validation"))
    for split, subdir in splits:
        images = os.path.join(dataset_root, "images_detectron2", subdir)
        labels = os.path.join(dataset_root, "annotations_detectron2", subdir)
        dataset_name = f"ade20k_full_sem_seg_{split}"

        # Bind the current directories as defaults so each registered loader
        # keeps its own split instead of the last loop iteration's values.
        def _load(x=images, y=labels):
            return load_sem_seg(y, x, gt_ext="tif", image_ext="jpg")

        DatasetCatalog.register(dataset_name, _load)

        split_metadata = MetadataCatalog.get(dataset_name)
        split_metadata.set(
            stuff_classes=list(meta["stuff_classes"]),  # independent copy per split
            image_root=images,
            sem_seg_root=labels,
            evaluator_type="sem_seg",
            ignore_label=65535,  # NOTE: gt is saved in 16-bit TIFF images
        )
# Datasets root: value of $DETECTRON2_DATASETS when set, otherwise "datasets".
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
# Module-level call: the ADE20K-Full datasets are registered when this module runs.
register_all_ade20k_full(_root)
================================================
FILE: mfvis_nococo/mask2former/data/datasets/register_ade20k_instance.py
================================================
import json
import logging
import numpy as np
import os
from PIL import Image
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets.coco import load_coco_json, register_coco_instances
from detectron2.utils.file_io import PathManager
ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 
'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}]
# Maps each dataset name to a pair (image directory, annotation JSON file).
# register_all_ade20k_instance joins the image directory onto the datasets root,
# and does the same for the JSON path unless it contains "://".
_PREDEFINED_SPLITS = {
# point annotations without masks
# NOTE(review): the comment above does not obviously describe the instance JSON
# files listed below -- confirm it is accurate or remove it.
"ade20k_instance_train": (
"ADEChallengeData2016/images/training",
"ADEChallengeData2016/ade20k_instance_train.json",
),
"ade20k_instance_val": (
"ADEChallengeData2016/images/validation",
"ADEChallengeData2016/ade20k_instance_val.json",
),
}
def _get_ade_instances_meta():
    """Build the metadata dict for the ADE20K instance ("thing") categories.

    Returns:
        dict: ``"thing_dataset_id_to_contiguous_id"`` maps each category id in
            ``ADE_CATEGORIES`` to its list position, and ``"thing_classes"``
            holds the category names in the same order.

    Raises:
        AssertionError: if ``ADE_CATEGORIES`` does not contain exactly 100 entries.
    """
    dataset_ids = []
    for entry in ADE_CATEGORIES:
        dataset_ids.append(entry["id"])

    count = len(dataset_ids)
    assert count == 100, count

    # The ADE category ids have gaps; remap them onto positions in [0, 99].
    contiguous_ids = {}
    for position, dataset_id in enumerate(dataset_ids):
        contiguous_ids[dataset_id] = position

    names = [entry["name"] for entry in ADE_CATEGORIES]

    return {
        "thing_dataset_id_to_contiguous_id": contiguous_ids,
        "thing_classes": names,
    }
def register_all_ade20k_instance(root):
    """Register every split in ``_PREDEFINED_SPLITS`` via ``register_coco_instances``.

    Args:
        root (str): datasets directory the relative image and annotation paths
            are joined onto (pre-defined datasets are assumed to live in
            ``./datasets``).
    """
    for dataset_name, paths in _PREDEFINED_SPLITS.items():
        image_root, json_file = paths

        # A fresh metadata dict is built for every split.
        metadata = _get_ade_instances_meta()

        # Paths containing "://" are passed through unchanged; everything else
        # is joined onto the datasets root.
        if "://" in json_file:
            annotation_path = json_file
        else:
            annotation_path = os.path.join(root, json_file)
        image_dir = os.path.join(root, image_root)

        register_coco_instances(dataset_name, metadata, annotation_path, image_dir)
# Datasets root: value of $DETECTRON2_DATASETS when set, otherwise "datasets".
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
# Module-level call: the ADE20K instance datasets are registered when this module runs.
register_all_ade20k_instance(_root)
================================================
FILE: mfvis_nococo/mask2former/data/datasets/register_ade20k_panoptic.py
================================================
import json
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.file_io import PathManager
ADE20K_150_CATEGORIES = [
{"color": [120, 120, 120], "id": 0, "isthing": 0, "name": "wall"},
{"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"},
{"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"},
{"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"},
{"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"},
{"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"},
{"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"},
{"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"},
{"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "},
{"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"},
{"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"},
{"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"},
{"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"},
{"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"},
{"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"},
{"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"},
{"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"},
{"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"},
{"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"},
{"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"},
{"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"},
{"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"},
{"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"},
{"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"},
{"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"},
{"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"},
{"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"},
{"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"},
{"color": [255, 9, 92], "id": 28, "isthing": 0, "name": "rug"},
{"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"},
{"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"},
{"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"},
{"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"},
{"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"},
{"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"},
{"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"},
{"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"},
{"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"},
{"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"},
{"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"},
{"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"},
{"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"},
{"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"},
{"color": [255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"},
{
"color": [6, 51, 255],
"id": 44,
"isthing": 1,
"name": "chest of drawers, chest, bureau, dresser",
},
{"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"},
{"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"},
{"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"},
{"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"},
{"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"},
{"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"},
{"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"},
{"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"},
{"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"},
{"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"},
{"color": [0, 0, 255], "id": 55, "isthing": 1, "name": "case, display case, showcase, vitrine"},
{
"color": [255, 71, 0],
"id": 56,
"isthing": 1,
"name": "pool table, billiard table, snooker table",
},
{"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"},
{"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"},
{"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"},
{"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"},
{"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"},
{"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"},
{"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"},
{"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"},
{
"color": [0, 255, 133],
"id": 65,
"isthing": 1,
"name": "toilet, can, commode, crapper, pot, potty, stool, throne",
},
{"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"},
{"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"},
{"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"},
{"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"},
{"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"},
{"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"},
{"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"},
{"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"},
{"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"},
{"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"},
{"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"},
{"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"},
{"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"},
{"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"},
{"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"},
{"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"},
{"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"},
{"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"},
{"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"},
{"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"},
{"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"},
{"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"},
{"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"},
{"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"},
{"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"},
{"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"},
{"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"},
{"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"},
{"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"},
{
"color": [0, 122, 255],
"id": 95,
"isthing": 1,
"name": "bannister, banister, balustrade, balusters, handrail",
},
{
"color": [0, 255, 163],
"id": 96,
"isthing": 0,
"name": "escalator, moving staircase, moving stairway",
},
{
"color": [255, 153, 0],
"id": 97,
"isthing": 1,
"name": "ottoman, pouf, pouffe, puff, hassock",
},
{"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"},
{"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"},
{
"color": [143, 255, 0],
"id": 100,
"isthing": 0,
"name": "poster, posting, placard, notice, bill, card",
},
{"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"},
{"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"},
{"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"},
{"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"},
{
"color": [133, 0, 255],
"id": 105,
"isthing": 0,
"name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
},
{"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"},
{
"color": [184, 0, 255],
"id": 107,
"isthing": 1,
"name": "washer, automatic washer, washing machine",
},
{"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"},
{"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"},
{"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"},
{"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"},
{"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"},
{"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"},
{"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"},
{"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"},
{"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"},
{"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"},
{"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"},
{"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"},
{"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"},
{"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"},
{"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"},
{"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"},
{"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"},
{"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"},
{"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"},
{"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"},
{"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"},
{"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"},
{"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"},
{"color": [20, 0, 255], "id": 131, "isthing": 0, "name": "blanket, cover"},
{"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"},
{"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"},
{"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"},
{"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"},
{"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"},
{"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"},
{"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"},
{"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"},
{"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"},
{"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"},
{"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"},
{"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"},
{"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"},
{"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"},
{"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"},
{"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"},
{"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"},
{"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"},
]
# Colors of the 150 categories, in the same order as ADE20K_150_CATEGORIES.
ADE20k_COLORS = [k["color"] for k in ADE20K_150_CATEGORIES]
# Attach the palette to the train and val semantic segmentation metadata entries.
# The [:] slice hands each entry its own copy of the list.
MetadataCatalog.get("ade20k_sem_seg_train").set(
stuff_colors=ADE20k_COLORS[:],
)
MetadataCatalog.get("ade20k_sem_seg_val").set(
stuff_colors=ADE20k_COLORS[:],
)
def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
    """
    Build the list of dataset records for one ADE20K panoptic split.

    Args:
        json_file (str): path to the panoptic annotation json file.
        image_dir (str): directory that holds the raw ".jpg" images.
        gt_dir (str): directory that holds the panoptic annotation images.
        semseg_dir (str): directory that holds the semantic annotation images.
        meta (dict): provides the "thing_dataset_id_to_contiguous_id" and
            "stuff_dataset_id_to_contiguous_id" tables used to remap the
            category id of every segment.

    Returns:
        list[dict]: one dict per annotation with the keys "file_name",
        "image_id", "pan_seg_file_name", "sem_seg_file_name" and
        "segments_info".

    Raises:
        AssertionError: if the json yields no record, or if any of the three
            files referenced by the first record does not exist.
    """

    def _remap_segment(segment):
        # Rewrites the segment dict in place: the dataset category id becomes
        # the contiguous id, and "isthing" records which table supplied it.
        dataset_id = segment["category_id"]
        is_thing = dataset_id in meta["thing_dataset_id_to_contiguous_id"]
        table_name = (
            "thing_dataset_id_to_contiguous_id"
            if is_thing
            else "stuff_dataset_id_to_contiguous_id"
        )
        segment["category_id"] = meta[table_name][dataset_id]
        segment["isthing"] = is_thing
        return segment

    with PathManager.open(json_file) as handle:
        panoptic_json = json.load(handle)

    records = []
    for annotation in panoptic_json["annotations"]:
        image_id = annotation["image_id"]
        # The image is assumed to share the annotation's base name and to
        # carry a ".jpg" extension; the label files reuse the name verbatim.
        base_name = os.path.splitext(annotation["file_name"])[0]
        record = {
            "file_name": os.path.join(image_dir, base_name + ".jpg"),
            "image_id": image_id,
            "pan_seg_file_name": os.path.join(gt_dir, annotation["file_name"]),
            "sem_seg_file_name": os.path.join(semseg_dir, annotation["file_name"]),
            "segments_info": [
                _remap_segment(segment) for segment in annotation["segments_info"]
            ],
        }
        records.append(record)

    assert len(records), f"No images found in {image_dir}!"
    # Only the first record is probed on disk, as a cheap sanity check of the
    # directory layout.
    first_record = records[0]
    for path_key in ("file_name", "pan_seg_file_name", "sem_seg_file_name"):
        assert PathManager.isfile(first_record[path_key]), first_record[path_key]
    return records
def register_ade20k_panoptic(
    name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None
):
    """
    Register one ADE20K panoptic segmentation split under `name`.

    The loader is registered lazily with DatasetCatalog, and the paths plus
    evaluation settings are stored on the matching MetadataCatalog entry.

    Args:
        name (str): the name that identifies the dataset,
            e.g. "ade20k_panoptic_train".
        metadata (dict): extra metadata for this dataset; it is passed to the
            loader and also expanded into the MetadataCatalog entry.
        image_root (str): directory which contains all the images.
        panoptic_root (str): directory which contains the panoptic annotation images.
        semantic_root (str): directory which contains the semantic annotation
            images; forwarded to the loader.
        panoptic_json (str): path to the json panoptic annotation file.
        instances_json (str): path to the json instance annotation file;
            stored as `json_file` in the metadata.
    """

    def _load_split():
        # Deferred until the catalog actually asks for the dataset.
        return load_ade20k_panoptic_json(
            panoptic_json, image_root, panoptic_root, semantic_root, metadata
        )

    DatasetCatalog.register(name, _load_split)

    split_metadata = MetadataCatalog.get(name)
    split_metadata.set(
        panoptic_root=panoptic_root,
        image_root=image_root,
        panoptic_json=panoptic_json,
        json_file=instances_json,
        evaluator_type="ade20k_panoptic_seg",
        ignore_label=255,
        label_divisor=1000,
        **metadata,
    )
# Predefined ADE20K panoptic splits, keyed by the dataset name they are
# registered under. register_all_ade20k_panoptic joins every path onto the
# dataset root and unpacks each tuple in this order:
#   (image_root, panoptic_root, panoptic_json, semantic_root, instance_json)
_PREDEFINED_SPLITS_ADE20K_PANOPTIC = {
"ade20k_panoptic_train": (
"ADEChallengeData2016/images/training",
"ADEChallengeData2016/ade20k_panoptic_train",
"ADEChallengeData2016/ade20k_panoptic_train.json",
"ADEChallengeData2016/annotations_detectron2/training",
"ADEChallengeData2016/ade20k_instance_train.json",
),
"ade20k_panoptic_val": (
"ADEChallengeData2016/images/validation",
"ADEChallengeData2016/ade20k_panoptic_val",
"ADEChallengeData2016/ade20k_panoptic_val.json",
"ADEChallengeData2016/annotations_detectron2/validation",
"ADEChallengeData2016/ade20k_instance_val.json",
),
}
def get_metadata():
    """
    Assemble the metadata dict shared by the ADE20K panoptic splits.

    Returns:
        dict: with the keys
            - "thing_classes" / "thing_colors": names and colors of the
              categories whose "isthing" flag equals 1.
            - "stuff_classes" / "stuff_colors": names and colors of *all*
              categories. The thing entries are replicated here because the
              D2 visualization code treats thing and stuff classes
              differently; the same naming is kept so that code can be reused.
            - "thing_dataset_id_to_contiguous_id": dataset id -> position in
              ADE20K_150_CATEGORIES, for thing categories only.
            - "stuff_dataset_id_to_contiguous_id": dataset id -> position in
              ADE20K_150_CATEGORIES, for every category (things included), in
              order to use the sem_seg evaluator.
    """
    meta = {
        "thing_classes": [
            cat["name"] for cat in ADE20K_150_CATEGORIES if cat["isthing"] == 1
        ],
        "thing_colors": [
            cat["color"] for cat in ADE20K_150_CATEGORIES if cat["isthing"] == 1
        ],
        "stuff_classes": [cat["name"] for cat in ADE20K_150_CATEGORIES],
        "stuff_colors": [cat["color"] for cat in ADE20K_150_CATEGORIES],
    }

    # Dataset ids need not be contiguous, so each one is mapped to its
    # position in the category table; that position is the contiguous id
    # in [0, #classes) used for training.
    meta["thing_dataset_id_to_contiguous_id"] = {
        cat["id"]: index
        for index, cat in enumerate(ADE20K_150_CATEGORIES)
        if cat["isthing"]
    }
    meta["stuff_dataset_id_to_contiguous_id"] = {
        cat["id"]: index for index, cat in enumerate(ADE20K_150_CATEGORIES)
    }
    return meta
def register_all_ade20k_panoptic(root):
    """
    Register every split listed in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.

    Args:
        root (str): dataset root directory; all predefined relative paths are
            joined onto it.
    """
    # A single metadata dict is built once and shared by all splits.
    shared_metadata = get_metadata()
    for split_name, split_paths in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items():
        image_root, panoptic_root, panoptic_json, semantic_root, instance_json = split_paths
        # Note the reordering: register_ade20k_panoptic takes semantic_root
        # before panoptic_json, whereas the table stores them the other way.
        absolute_paths = [
            os.path.join(root, relative_path)
            for relative_path in (
                image_root,
                panoptic_root,
                semantic_root,
                panoptic_json,
                instance_json,
            )
        ]
        register_ade20k_panoptic(split_name, shared_metadata, *absolute_paths)
# Runs at import time: the dataset root comes from the DETECTRON2_DATASETS
# environment variable, falling back to the relative directory "datasets".
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_ade20k_panoptic(_root)
================================================
FILE: mfvis_nococo/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py
================================================
import json
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import load_sem_seg
from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
from detectron2.utils.file_io import PathManager
# Predefined COCO panoptic splits, keyed by dataset name. Each value is
# unpacked by register_all_coco_panoptic_annos_sem_seg in this order:
#   (panoptic_root, panoptic_json, semantic_root)
# and every path is joined onto the dataset root there.
_PREDEFINED_SPLITS_COCO_PANOPTIC = {
"coco_2017_train_panoptic": (
# This is the original panoptic annotation directory
"coco/panoptic_train2017",
"coco/annotations/panoptic_train2017.json",
# This directory contains semantic annotations that are
# converted from panoptic annotations.
# It is used by PanopticFPN.
# You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
# to create these directories.
"coco/panoptic_semseg_train2017",
),
"coco_2017_val_panoptic": (
"coco/panoptic_val2017",
"coco/annotations/panoptic_val2017.json",
"coco/panoptic_semseg_val2017",
),
}
def get_metadata():
    """
    Assemble the metadata dict used for the COCO panoptic splits.

    Returns:
        dict: with the keys
            - "thing_classes" / "thing_colors": names and colors of the
              categories whose "isthing" flag equals 1.
            - "stuff_classes" / "stuff_colors": names and colors of *all*
              categories. Thing entries are replicated under "stuff_*"
              because the D2 visualization code handles thing and stuff
              classes differently; the naming is kept so it can be reused.
            - "thing_dataset_id_to_contiguous_id": dataset id -> position in
              COCO_CATEGORIES, for thing categories only.
            - "stuff_dataset_id_to_contiguous_id": dataset id -> position in
              COCO_CATEGORIES, for every category (things included), in order
              to use the sem_seg evaluator.
    """
    thing_categories = [cat for cat in COCO_CATEGORIES if cat["isthing"] == 1]

    meta = {}
    meta["thing_classes"] = [cat["name"] for cat in thing_categories]
    meta["thing_colors"] = [cat["color"] for cat in thing_categories]
    meta["stuff_classes"] = [cat["name"] for cat in COCO_CATEGORIES]
    meta["stuff_colors"] = [cat["color"] for cat in COCO_CATEGORIES]

    # Original dataset ids are not always contiguous; the position of each
    # category in COCO_CATEGORIES serves as its contiguous training id.
    thing_id_map = {}
    stuff_id_map = {}
    for position, cat in enumerate(COCO_CATEGORIES):
        dataset_id = cat["id"]
        # Every category goes into the stuff map; thing categories are
        # additionally recorded in the thing map.
        stuff_id_map[dataset_id] = position
        if cat["isthing"]:
            thing_id_map[dataset_id] = position

    meta["thing_dataset_id_to_contiguous_id"] = thing_id_map
    meta["stuff_dataset_id_to_contiguous_id"] = stuff_id_map
    return meta
def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
    """
    Build the list of dataset records for one COCO panoptic split.

    Args:
        json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
        image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
        gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
        semseg_dir (str): directory holding the semantic annotation images.
        meta (dict): provides the "thing_dataset_id_to_contiguous_id" and
            "stuff_dataset_id_to_contiguous_id" tables used to remap the
            category id of every segment.

    Returns:
        list[dict]: one dict per annotation with the keys "file_name",
        "image_id" (converted with int()), "pan_seg_file_name",
        "sem_seg_file_name" and "segments_info".

    Raises:
        AssertionError: if the json yields no record, or if any of the three
            files referenced by the first record does not exist.
    """

    def _to_contiguous(segment):
        # In-place update of the segment dict: swap the dataset category id
        # for the contiguous one and flag whether it came from the thing table.
        original_id = segment["category_id"]
        if original_id in meta["thing_dataset_id_to_contiguous_id"]:
            id_table, thing_flag = meta["thing_dataset_id_to_contiguous_id"], True
        else:
            id_table, thing_flag = meta["stuff_dataset_id_to_contiguous_id"], False
        segment["category_id"] = id_table[original_id]
        segment["isthing"] = thing_flag
        return segment

    with PathManager.open(json_file) as stream:
        annotations = json.load(stream)["annotations"]

    dataset_dicts = []
    for entry in annotations:
        image_id = int(entry["image_id"])
        # Image and label are assumed to share a base name; images use the
        # ".jpg" extension while label files keep the annotation's file name.
        jpg_name = os.path.splitext(entry["file_name"])[0] + ".jpg"
        image_path = os.path.join(image_dir, jpg_name)
        panoptic_path = os.path.join(gt_dir, entry["file_name"])
        semantic_path = os.path.join(semseg_dir, entry["file_name"])
        converted_segments = [_to_contiguous(seg) for seg in entry["segments_info"]]
        dataset_dicts.append(
            dict(
                file_name=image_path,
                image_id=image_id,
                pan_seg_file_name=panoptic_path,
                sem_seg_file_name=semantic_path,
                segments_info=converted_segments,
            )
        )

    assert len(dataset_dicts), f"No images found in {image_dir}!"
    # Spot-check the first record only, to catch a wrong directory layout early.
    head = dataset_dicts[0]
    assert PathManager.isfile(head["file_name"]), head["file_name"]
    assert PathManager.isfile(head["pan_seg_file_name"]), head["pan_seg_file_name"]
    assert PathManager.isfile(head["sem_seg_file_name"]), head["sem_seg_file_name"]
    return dataset_dicts
def register_coco_panoptic_annos_sem_seg(
    name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
):
    """
    Register the "<name>_with_sem_seg" variant of a COCO panoptic split.

    Two things happen:
      1. The metadata entry already stored under `name` has its
         "thing_classes" and "thing_colors" replaced by those in `metadata`.
      2. A new dataset named `name + "_with_sem_seg"` (e.g.
         "coco_2017_train_panoptic_with_sem_seg") is registered together with
         its paths and evaluation settings.

    Args:
        name (str): name of the existing panoptic dataset.
        metadata (dict): must contain "thing_classes" and "thing_colors"; it
            is also passed to the loader and expanded into the new metadata.
        image_root (str): directory which contains all the images.
        panoptic_root (str): directory with the panoptic annotation images.
        panoptic_json (str): path to the json panoptic annotation file.
        sem_seg_root (str): directory with the semantic annotation images.
        instances_json (str): path to the json instance annotation file;
            stored as `json_file` in the metadata.
    """
    existing_meta = MetadataCatalog.get(name)
    # The old attributes are removed before being set again.
    # NOTE(review): presumably needed because detectron2's Metadata rejects
    # overwriting an attribute with a different value -- confirm.
    for attribute in ("thing_classes", "thing_colors"):
        delattr(existing_meta, attribute)
    # thing_dataset_id_to_contiguous_id is not overwritten here.
    existing_meta.set(
        thing_classes=metadata["thing_classes"],
        thing_colors=metadata["thing_colors"],
    )

    semantic_name = name + "_with_sem_seg"

    def _load_split():
        # Deferred until the catalog actually asks for the dataset.
        return load_coco_panoptic_json(
            panoptic_json, image_root, panoptic_root, sem_seg_root, metadata
        )

    DatasetCatalog.register(semantic_name, _load_split)
    MetadataCatalog.get(semantic_name).set(
        sem_seg_root=sem_seg_root,
        panoptic_root=panoptic_root,
        image_root=image_root,
        panoptic_json=panoptic_json,
        json_file=instances_json,
        evaluator_type="coco_panoptic_seg",
        ignore_label=255,
        label_divisor=1000,
        **metadata,
    )
def register_all_coco_panoptic_annos_sem_seg(root):
    """
    Register every split listed in _PREDEFINED_SPLITS_COCO_PANOPTIC.

    Args:
        root (str): dataset root directory; the predefined panoptic and
            semantic paths are joined onto it.
    """
    panoptic_suffix = "_panoptic"
    for split_name, split_paths in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
        panoptic_root, panoptic_json, semantic_root = split_paths

        # Dropping the trailing "_panoptic" gives the name of the matching
        # instance dataset, whose metadata supplies the image directory and
        # the instance annotation file.
        instances_name = split_name[: -len(panoptic_suffix)]
        instances_meta = MetadataCatalog.get(instances_name)
        image_root = instances_meta.image_root
        instances_json = instances_meta.json_file

        # A fresh metadata dict is built for every split.
        register_coco_panoptic_annos_sem_seg(
            split_name,
            get_metadata(),
            image_root,
            os.path.join(root, panoptic_root),
            os.path.join(root, panoptic_json),
            os.path.join(root, semantic_root),
            instances_json,
        )
# Runs at import time: the dataset root comes from the DETECTRON2_DATASETS
# environment variable, falling back to the relative directory "datasets".
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_coco_panoptic_annos_sem_seg(_root)
================================================
FILE: mfvis_nococo/mask2former/data/datasets/register_coco_stuff_10k.py
================================================
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import load_sem_seg
# Category table for COCO-Stuff 10k (171 entries; _get_coco_stuff_meta asserts
# this count). The position of an entry in this list is its contiguous id.
#   - The first 80 entries (ids 1-90, with gaps) carry "color" and "isthing": 1.
#   - The remaining 91 entries (ids 92-182) carry only "id", "name" and
#     "supercategory"; they have no "color" or "isthing" key.
COCO_CATEGORIES = [
{"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
{"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
{"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
{"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
{"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
{"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
{"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
{"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
{"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
{"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
{"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
{"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
{"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
{"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
{"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
{"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
{"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
{"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
{"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
{"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
{"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
{"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
{"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
{"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
{"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
{"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
{"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
{"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
{"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
{"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
{"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
{"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
{"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
{"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
{"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
{"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
{"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
{"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
{"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
{"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
{"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
{"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
{"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
{"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
{"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
{"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
{"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
{"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
{"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
{"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
{"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
{"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
{"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
{"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
{"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
{"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
{"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
{"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
{"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
{"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
{"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
{"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
{"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
{"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
{"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
{"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
{"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
{"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
{"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
{"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
{"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
{"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
{"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
{"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
{"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
{"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
{"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
{"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
{"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
{"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
{"id": 92, "name": "banner", "supercategory": "textile"},
{"id": 93, "name": "blanket", "supercategory": "textile"},
{"id": 94, "name": "branch", "supercategory": "plant"},
{"id": 95, "name": "bridge", "supercategory": "building"},
{"id": 96, "name": "building-other", "supercategory": "building"},
{"id": 97, "name": "bush", "supercategory": "plant"},
{"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"},
{"id": 99, "name": "cage", "supercategory": "structural"},
{"id": 100, "name": "cardboard", "supercategory": "raw-material"},
{"id": 101, "name": "carpet", "supercategory": "floor"},
{"id": 102, "name": "ceiling-other", "supercategory": "ceiling"},
{"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"},
{"id": 104, "name": "cloth", "supercategory": "textile"},
{"id": 105, "name": "clothes", "supercategory": "textile"},
{"id": 106, "name": "clouds", "supercategory": "sky"},
{"id": 107, "name": "counter", "supercategory": "furniture-stuff"},
{"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"},
{"id": 109, "name": "curtain", "supercategory": "textile"},
{"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"},
{"id": 111, "name": "dirt", "supercategory": "ground"},
{"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"},
{"id": 113, "name": "fence", "supercategory": "structural"},
{"id": 114, "name": "floor-marble", "supercategory": "floor"},
{"id": 115, "name": "floor-other", "supercategory": "floor"},
{"id": 116, "name": "floor-stone", "supercategory": "floor"},
{"id": 117, "name": "floor-tile", "supercategory": "floor"},
{"id": 118, "name": "floor-wood", "supercategory": "floor"},
{"id": 119, "name": "flower", "supercategory": "plant"},
{"id": 120, "name": "fog", "supercategory": "water"},
{"id": 121, "name": "food-other", "supercategory": "food-stuff"},
{"id": 122, "name": "fruit", "supercategory": "food-stuff"},
{"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"},
{"id": 124, "name": "grass", "supercategory": "plant"},
{"id": 125, "name": "gravel", "supercategory": "ground"},
{"id": 126, "name": "ground-other", "supercategory": "ground"},
{"id": 127, "name": "hill", "supercategory": "solid"},
{"id": 128, "name": "house", "supercategory": "building"},
{"id": 129, "name": "leaves", "supercategory": "plant"},
{"id": 130, "name": "light", "supercategory": "furniture-stuff"},
{"id": 131, "name": "mat", "supercategory": "textile"},
{"id": 132, "name": "metal", "supercategory": "raw-material"},
{"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"},
{"id": 134, "name": "moss", "supercategory": "plant"},
{"id": 135, "name": "mountain", "supercategory": "solid"},
{"id": 136, "name": "mud", "supercategory": "ground"},
{"id": 137, "name": "napkin", "supercategory": "textile"},
{"id": 138, "name": "net", "supercategory": "structural"},
{"id": 139, "name": "paper", "supercategory": "raw-material"},
{"id": 140, "name": "pavement", "supercategory": "ground"},
{"id": 141, "name": "pillow", "supercategory": "textile"},
{"id": 142, "name": "plant-other", "supercategory": "plant"},
{"id": 143, "name": "plastic", "supercategory": "raw-material"},
{"id": 144, "name": "platform", "supercategory": "ground"},
{"id": 145, "name": "playingfield", "supercategory": "ground"},
{"id": 146, "name": "railing", "supercategory": "structural"},
{"id": 147, "name": "railroad", "supercategory": "ground"},
{"id": 148, "name": "river", "supercategory": "water"},
{"id": 149, "name": "road", "supercategory": "ground"},
{"id": 150, "name": "rock", "supercategory": "solid"},
{"id": 151, "name": "roof", "supercategory": "building"},
{"id": 152, "name": "rug", "supercategory": "textile"},
{"id": 153, "name": "salad", "supercategory": "food-stuff"},
{"id": 154, "name": "sand", "supercategory": "ground"},
{"id": 155, "name": "sea", "supercategory": "water"},
{"id": 156, "name": "shelf", "supercategory": "furniture-stuff"},
{"id": 157, "name": "sky-other", "supercategory": "sky"},
{"id": 158, "name": "skyscraper", "supercategory": "building"},
{"id": 159, "name": "snow", "supercategory": "ground"},
{"id": 160, "name": "solid-other", "supercategory": "solid"},
{"id": 161, "name": "stairs", "supercategory": "furniture-stuff"},
{"id": 162, "name": "stone", "supercategory": "solid"},
{"id": 163, "name": "straw", "supercategory": "plant"},
{"id": 164, "name": "structural-other", "supercategory": "structural"},
{"id": 165, "name": "table", "supercategory": "furniture-stuff"},
{"id": 166, "name": "tent", "supercategory": "building"},
{"id": 167, "name": "textile-other", "supercategory": "textile"},
{"id": 168, "name": "towel", "supercategory": "textile"},
{"id": 169, "name": "tree", "supercategory": "plant"},
{"id": 170, "name": "vegetable", "supercategory": "food-stuff"},
{"id": 171, "name": "wall-brick", "supercategory": "wall"},
{"id": 172, "name": "wall-concrete", "supercategory": "wall"},
{"id": 173, "name": "wall-other", "supercategory": "wall"},
{"id": 174, "name": "wall-panel", "supercategory": "wall"},
{"id": 175, "name": "wall-stone", "supercategory": "wall"},
{"id": 176, "name": "wall-tile", "supercategory": "wall"},
{"id": 177, "name": "wall-wood", "supercategory": "wall"},
{"id": 178, "name": "water-other", "supercategory": "water"},
{"id": 179, "name": "waterdrops", "supercategory": "water"},
{"id": 180, "name": "window-blind", "supercategory": "window"},
{"id": 181, "name": "window-other", "supercategory": "window"},
{"id": 182, "name": "wood", "supercategory": "solid"},
]
def _get_coco_stuff_meta():
    """
    Build the metadata shared by the COCO-Stuff 10k semantic splits.

    Returns:
        dict: with the keys
            - "stuff_dataset_id_to_contiguous_id": maps each dataset category
              id to its position in COCO_CATEGORIES, i.e. the contiguous id
              in [0, 170] used by models.
            - "stuff_classes": category names in COCO_CATEGORIES order.

    Raises:
        AssertionError: if COCO_CATEGORIES does not hold exactly 171 entries.
    """
    # Id 0 is reserved for ignore_label; the pre-processing changes
    # ignore_label from 0 to 255.
    dataset_ids = [category["id"] for category in COCO_CATEGORIES]
    assert len(dataset_ids) == 171, len(dataset_ids)

    contiguous_by_dataset_id = {
        dataset_id: position for position, dataset_id in enumerate(dataset_ids)
    }
    return {
        "stuff_dataset_id_to_contiguous_id": contiguous_by_dataset_id,
        "stuff_classes": [category["name"] for category in COCO_CATEGORIES],
    }
def register_all_coco_stuff_10k(root):
    """
    Register the "train" and "test" splits of COCO-Stuff 10k for semantic
    segmentation, as "coco_2017_<split>_stuff_10k_sem_seg".

    Args:
        root (str): dataset root directory; the data is looked up under
            "<root>/coco/coco_stuff_10k".
    """
    dataset_root = os.path.join(root, "coco", "coco_stuff_10k")
    shared_meta = _get_coco_stuff_meta()

    def _make_loader(images, labels):
        # Binding the directories as arguments gives each split's loader its
        # own paths instead of whatever the loop variables hold last.
        def _load():
            return load_sem_seg(labels, images, gt_ext="png", image_ext="jpg")

        return _load

    for split in ("train", "test"):
        image_dir = os.path.join(dataset_root, f"images_detectron2/{split}")
        gt_dir = os.path.join(dataset_root, f"annotations_detectron2/{split}")
        dataset_name = f"coco_2017_{split}_stuff_10k_sem_seg"

        DatasetCatalog.register(dataset_name, _make_loader(image_dir, gt_dir))
        MetadataCatalog.get(dataset_name).set(
            image_root=image_dir,
            sem_seg_root=gt_dir,
            evaluator_type="sem_seg",
            ignore_label=255,
            **shared_meta,
        )
# Runs at import time: the dataset root comes from the DETECTRON2_DATASETS
# environment variable, falling back to the relative directory "datasets".
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_coco_stuff_10k(_root)
================================================
FILE: mfvis_nococo/mask2former/data/datasets/register_mapillary_vistas.py
================================================
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import load_sem_seg
# Category table for Mapillary Vistas semantic segmentation (66 entries).
# Every entry has the keys "color", "instances", "readable", "name" and
# "evaluate". Only the final entry ("void--unlabeled") has "evaluate": False,
# so _get_mapillary_vistas_meta ends up with 65 classes.
MAPILLARY_VISTAS_SEM_SEG_CATEGORIES = [
{
"color": [165, 42, 42],
"instances": True,
"readable": "Bird",
"name": "animal--bird",
"evaluate": True,
},
{
"color": [0, 192, 0],
"instances": True,
"readable": "Ground Animal",
"name": "animal--ground-animal",
"evaluate": True,
},
{
"color": [196, 196, 196],
"instances": False,
"readable": "Curb",
"name": "construction--barrier--curb",
"evaluate": True,
},
{
"color": [190, 153, 153],
"instances": False,
"readable": "Fence",
"name": "construction--barrier--fence",
"evaluate": True,
},
{
"color": [180, 165, 180],
"instances": False,
"readable": "Guard Rail",
"name": "construction--barrier--guard-rail",
"evaluate": True,
},
{
"color": [90, 120, 150],
"instances": False,
"readable": "Barrier",
"name": "construction--barrier--other-barrier",
"evaluate": True,
},
{
"color": [102, 102, 156],
"instances": False,
"readable": "Wall",
"name": "construction--barrier--wall",
"evaluate": True,
},
{
"color": [128, 64, 255],
"instances": False,
"readable": "Bike Lane",
"name": "construction--flat--bike-lane",
"evaluate": True,
},
{
"color": [140, 140, 200],
"instances": True,
"readable": "Crosswalk - Plain",
"name": "construction--flat--crosswalk-plain",
"evaluate": True,
},
{
"color": [170, 170, 170],
"instances": False,
"readable": "Curb Cut",
"name": "construction--flat--curb-cut",
"evaluate": True,
},
{
"color": [250, 170, 160],
"instances": False,
"readable": "Parking",
"name": "construction--flat--parking",
"evaluate": True,
},
{
"color": [96, 96, 96],
"instances": False,
"readable": "Pedestrian Area",
"name": "construction--flat--pedestrian-area",
"evaluate": True,
},
{
"color": [230, 150, 140],
"instances": False,
"readable": "Rail Track",
"name": "construction--flat--rail-track",
"evaluate": True,
},
{
"color": [128, 64, 128],
"instances": False,
"readable": "Road",
"name": "construction--flat--road",
"evaluate": True,
},
{
"color": [110, 110, 110],
"instances": False,
"readable": "Service Lane",
"name": "construction--flat--service-lane",
"evaluate": True,
},
{
"color": [244, 35, 232],
"instances": False,
"readable": "Sidewalk",
"name": "construction--flat--sidewalk",
"evaluate": True,
},
{
"color": [150, 100, 100],
"instances": False,
"readable": "Bridge",
"name": "construction--structure--bridge",
"evaluate": True,
},
{
"color": [70, 70, 70],
"instances": False,
"readable": "Building",
"name": "construction--structure--building",
"evaluate": True,
},
{
"color": [150, 120, 90],
"instances": False,
"readable": "Tunnel",
"name": "construction--structure--tunnel",
"evaluate": True,
},
{
"color": [220, 20, 60],
"instances": True,
"readable": "Person",
"name": "human--person",
"evaluate": True,
},
{
"color": [255, 0, 0],
"instances": True,
"readable": "Bicyclist",
"name": "human--rider--bicyclist",
"evaluate": True,
},
{
"color": [255, 0, 100],
"instances": True,
"readable": "Motorcyclist",
"name": "human--rider--motorcyclist",
"evaluate": True,
},
{
"color": [255, 0, 200],
"instances": True,
"readable": "Other Rider",
"name": "human--rider--other-rider",
"evaluate": True,
},
{
"color": [200, 128, 128],
"instances": True,
"readable": "Lane Marking - Crosswalk",
"name": "marking--crosswalk-zebra",
"evaluate": True,
},
{
"color": [255, 255, 255],
"instances": False,
"readable": "Lane Marking - General",
"name": "marking--general",
"evaluate": True,
},
{
"color": [64, 170, 64],
"instances": False,
"readable": "Mountain",
"name": "nature--mountain",
"evaluate": True,
},
{
"color": [230, 160, 50],
"instances": False,
"readable": "Sand",
"name": "nature--sand",
"evaluate": True,
},
{
"color": [70, 130, 180],
"instances": False,
"readable": "Sky",
"name": "nature--sky",
"evaluate": True,
},
{
"color": [190, 255, 255],
"instances": False,
"readable": "Snow",
"name": "nature--snow",
"evaluate": True,
},
{
"color": [152, 251, 152],
"instances": False,
"readable": "Terrain",
"name": "nature--terrain",
"evaluate": True,
},
{
"color": [107, 142, 35],
"instances": False,
"readable": "Vegetation",
"name": "nature--vegetation",
"evaluate": True,
},
{
"color": [0, 170, 30],
"instances": False,
"readable": "Water",
"name": "nature--water",
"evaluate": True,
},
{
"color": [255, 255, 128],
"instances": True,
"readable": "Banner",
"name": "object--banner",
"evaluate": True,
},
{
"color": [250, 0, 30],
"instances": True,
"readable": "Bench",
"name": "object--bench",
"evaluate": True,
},
{
"color": [100, 140, 180],
"instances": True,
"readable": "Bike Rack",
"name": "object--bike-rack",
"evaluate": True,
},
{
"color": [220, 220, 220],
"instances": True,
"readable": "Billboard",
"name": "object--billboard",
"evaluate": True,
},
{
"color": [220, 128, 128],
"instances": True,
"readable": "Catch Basin",
"name": "object--catch-basin",
"evaluate": True,
},
{
"color": [222, 40, 40],
"instances": True,
"readable": "CCTV Camera",
"name": "object--cctv-camera",
"evaluate": True,
},
{
"color": [100, 170, 30],
"instances": True,
"readable": "Fire Hydrant",
"name": "object--fire-hydrant",
"evaluate": True,
},
{
"color": [40, 40, 40],
"instances": True,
"readable": "Junction Box",
"name": "object--junction-box",
"evaluate": True,
},
{
"color": [33, 33, 33],
"instances": True,
"readable": "Mailbox",
"name": "object--mailbox",
"evaluate": True,
},
{
"color": [100, 128, 160],
"instances": True,
"readable": "Manhole",
"name": "object--manhole",
"evaluate": True,
},
{
"color": [142, 0, 0],
"instances": True,
"readable": "Phone Booth",
"name": "object--phone-booth",
"evaluate": True,
},
{
"color": [70, 100, 150],
"instances": False,
"readable": "Pothole",
"name": "object--pothole",
"evaluate": True,
},
{
"color": [210, 170, 100],
"instances": True,
"readable": "Street Light",
"name": "object--street-light",
"evaluate": True,
},
{
"color": [153, 153, 153],
"instances": True,
"readable": "Pole",
"name": "object--support--pole",
"evaluate": True,
},
{
"color": [128, 128, 128],
"instances": True,
"readable": "Traffic Sign Frame",
"name": "object--support--traffic-sign-frame",
"evaluate": True,
},
{
"color": [0, 0, 80],
"instances": True,
"readable": "Utility Pole",
"name": "object--support--utility-pole",
"evaluate": True,
},
{
"color": [250, 170, 30],
"instances": True,
"readable": "Traffic Light",
"name": "object--traffic-light",
"evaluate": True,
},
{
"color": [192, 192, 192],
"instances": True,
"readable": "Traffic Sign (Back)",
"name": "object--traffic-sign--back",
"evaluate": True,
},
{
"color": [220, 220, 0],
"instances": True,
"readable": "Traffic Sign (Front)",
"name": "object--traffic-sign--front",
"evaluate": True,
},
{
"color": [140, 140, 20],
"instances": True,
"readable": "Trash Can",
"name": "object--trash-can",
"evaluate": True,
},
{
"color": [119, 11, 32],
"instances": True,
"readable": "Bicycle",
"name": "object--vehicle--bicycle",
"evaluate": True,
},
{
"color": [150, 0, 255],
"instances": True,
"readable": "Boat",
"name": "object--vehicle--boat",
"evaluate": True,
},
{
"color": [0, 60, 100],
"instances": True,
"readable": "Bus",
"name": "object--vehicle--bus",
"evaluate": True,
},
{
"color": [0, 0, 142],
"instances": True,
"readable": "Car",
"name": "object--vehicle--car",
"evaluate": True,
},
{
"color": [0, 0, 90],
"instances": True,
"readable": "Caravan",
"name": "object--vehicle--caravan",
"evaluate": True,
},
{
"color": [0, 0, 230],
"instances": True,
"readable": "Motorcycle",
"name": "object--vehicle--motorcycle",
"evaluate": True,
},
{
"color": [0, 80, 100],
"instances": False,
"readable": "On Rails",
"name": "object--vehicle--on-rails",
"evaluate": True,
},
{
"color": [128, 64, 64],
"instances": True,
"readable": "Other Vehicle",
"name": "object--vehicle--other-vehicle",
"evaluate": True,
},
{
"color": [0, 0, 110],
"instances": True,
"readable": "Trailer",
"name": "object--vehicle--trailer",
"evaluate": True,
},
{
"color": [0, 0, 70],
"instances": True,
"readable": "Truck",
"name": "object--vehicle--truck",
"evaluate": True,
},
{
"color": [0, 0, 192],
"instances": True,
"readable": "Wheeled Slow",
"name": "object--vehicle--wheeled-slow",
"evaluate": True,
},
{
"color": [32, 32, 32],
"instances": False,
"readable": "Car Mount",
"name": "void--car-mount",
"evaluate": True,
},
{
"color": [120, 10, 10],
"instances": False,
"readable": "Ego Vehicle",
"name": "void--ego-vehicle",
"evaluate": True,
},
{
"color": [0, 0, 0],
"instances": False,
"readable": "Unlabeled",
"name": "void--unlabeled",
"evaluate": False,
},
]
def _get_mapillary_vistas_meta():
    """Build the semantic-segmentation metadata for Mapillary Vistas.

    Only the categories whose ``"evaluate"`` flag is truthy contribute; their
    readable names and colors are collected in list order.

    Returns:
        dict: ``"stuff_classes"`` (list of readable names) and
        ``"stuff_colors"`` (list of colors), both expected to have 65 entries.
    """
    evaluated = [cat for cat in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES if cat["evaluate"]]

    class_names = [cat["readable"] for cat in evaluated]
    assert len(class_names) == 65
    palette = [cat["color"] for cat in evaluated]
    assert len(palette) == 65

    return {"stuff_classes": class_names, "stuff_colors": palette}
def register_all_mapillary_vistas(root):
    """Register the Mapillary Vistas semantic-segmentation train/val splits.

    Args:
        root (str): directory that contains the ``mapillary_vistas`` folder.
    """
    dataset_root = os.path.join(root, "mapillary_vistas")
    meta = _get_mapillary_vistas_meta()
    split_dirs = {"train": "training", "val": "validation"}
    for split, dirname in split_dirs.items():
        image_dir = os.path.join(dataset_root, dirname, "images")
        gt_dir = os.path.join(dataset_root, dirname, "labels")
        dataset_name = f"mapillary_vistas_sem_seg_{split}"
        # Bind the directories as default arguments so each loader keeps the
        # paths of its own split rather than the loop's last values.
        DatasetCatalog.register(
            dataset_name,
            lambda img_dir=image_dir, lbl_dir=gt_dir: load_sem_seg(
                lbl_dir, img_dir, gt_ext="png", image_ext="jpg"
            ),
        )
        MetadataCatalog.get(dataset_name).set(
            image_root=image_dir,
            sem_seg_root=gt_dir,
            evaluator_type="sem_seg",
            ignore_label=65,  # different from other datasets, Mapillary Vistas sets ignore_label to 65
            **meta,
        )
# Register the datasets as an import-time side effect. The dataset root is
# read from the DETECTRON2_DATASETS environment variable and falls back to
# the relative directory "datasets" when the variable is unset.
_root = os.getenv("DETECTRON2_DATASETS", "datasets")
register_all_mapillary_vistas(_root)
================================================
FILE: mfvis_nococo/mask2former/data/datasets/register_mapillary_vistas_panoptic.py
================================================
import json
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.file_io import PathManager
# Category table used by get_metadata() in this module to build the panoptic
# metadata (despite the SEM_SEG in its name). Each entry provides:
#   'color'         - [R, G, B]-style triple used as the category color
#   'id'            - dataset category id, numbered 1..65 in list order
#   'isthing'       - 1 for "thing" categories, 0 for "stuff" categories
#   'name'          - human-readable category name
#   'supercategory' - hierarchical label string (presumably the raw Mapillary
#                     Vistas label name - TODO confirm against the dataset)
MAPILLARY_VISTAS_SEM_SEG_CATEGORIES = [
{'color': [165, 42, 42],
'id': 1,
'isthing': 1,
'name': 'Bird',
'supercategory': 'animal--bird'},
{'color': [0, 192, 0],
'id': 2,
'isthing': 1,
'name': 'Ground Animal',
'supercategory': 'animal--ground-animal'},
{'color': [196, 196, 196],
'id': 3,
'isthing': 0,
'name': 'Curb',
'supercategory': 'construction--barrier--curb'},
{'color': [190, 153, 153],
'id': 4,
'isthing': 0,
'name': 'Fence',
'supercategory': 'construction--barrier--fence'},
{'color': [180, 165, 180],
'id': 5,
'isthing': 0,
'name': 'Guard Rail',
'supercategory': 'construction--barrier--guard-rail'},
{'color': [90, 120, 150],
'id': 6,
'isthing': 0,
'name': 'Barrier',
'supercategory': 'construction--barrier--other-barrier'},
{'color': [102, 102, 156],
'id': 7,
'isthing': 0,
'name': 'Wall',
'supercategory': 'construction--barrier--wall'},
{'color': [128, 64, 255],
'id': 8,
'isthing': 0,
'name': 'Bike Lane',
'supercategory': 'construction--flat--bike-lane'},
{'color': [140, 140, 200],
'id': 9,
'isthing': 1,
'name': 'Crosswalk - Plain',
'supercategory': 'construction--flat--crosswalk-plain'},
{'color': [170, 170, 170],
'id': 10,
'isthing': 0,
'name': 'Curb Cut',
'supercategory': 'construction--flat--curb-cut'},
{'color': [250, 170, 160],
'id': 11,
'isthing': 0,
'name': 'Parking',
'supercategory': 'construction--flat--parking'},
{'color': [96, 96, 96],
'id': 12,
'isthing': 0,
'name': 'Pedestrian Area',
'supercategory': 'construction--flat--pedestrian-area'},
{'color': [230, 150, 140],
'id': 13,
'isthing': 0,
'name': 'Rail Track',
'supercategory': 'construction--flat--rail-track'},
{'color': [128, 64, 128],
'id': 14,
'isthing': 0,
'name': 'Road',
'supercategory': 'construction--flat--road'},
{'color': [110, 110, 110],
'id': 15,
'isthing': 0,
'name': 'Service Lane',
'supercategory': 'construction--flat--service-lane'},
{'color': [244, 35, 232],
'id': 16,
'isthing': 0,
'name': 'Sidewalk',
'supercategory': 'construction--flat--sidewalk'},
{'color': [150, 100, 100],
'id': 17,
'isthing': 0,
'name': 'Bridge',
'supercategory': 'construction--structure--bridge'},
{'color': [70, 70, 70],
'id': 18,
'isthing': 0,
'name': 'Building',
'supercategory': 'construction--structure--building'},
{'color': [150, 120, 90],
'id': 19,
'isthing': 0,
'name': 'Tunnel',
'supercategory': 'construction--structure--tunnel'},
{'color': [220, 20, 60],
'id': 20,
'isthing': 1,
'name': 'Person',
'supercategory': 'human--person'},
{'color': [255, 0, 0],
'id': 21,
'isthing': 1,
'name': 'Bicyclist',
'supercategory': 'human--rider--bicyclist'},
{'color': [255, 0, 100],
'id': 22,
'isthing': 1,
'name': 'Motorcyclist',
'supercategory': 'human--rider--motorcyclist'},
{'color': [255, 0, 200],
'id': 23,
'isthing': 1,
'name': 'Other Rider',
'supercategory': 'human--rider--other-rider'},
{'color': [200, 128, 128],
'id': 24,
'isthing': 1,
'name': 'Lane Marking - Crosswalk',
'supercategory': 'marking--crosswalk-zebra'},
{'color': [255, 255, 255],
'id': 25,
'isthing': 0,
'name': 'Lane Marking - General',
'supercategory': 'marking--general'},
{'color': [64, 170, 64],
'id': 26,
'isthing': 0,
'name': 'Mountain',
'supercategory': 'nature--mountain'},
{'color': [230, 160, 50],
'id': 27,
'isthing': 0,
'name': 'Sand',
'supercategory': 'nature--sand'},
{'color': [70, 130, 180],
'id': 28,
'isthing': 0,
'name': 'Sky',
'supercategory': 'nature--sky'},
{'color': [190, 255, 255],
'id': 29,
'isthing': 0,
'name': 'Snow',
'supercategory': 'nature--snow'},
{'color': [152, 251, 152],
'id': 30,
'isthing': 0,
'name': 'Terrain',
'supercategory': 'nature--terrain'},
{'color': [107, 142, 35],
'id': 31,
'isthing': 0,
'name': 'Vegetation',
'supercategory': 'nature--vegetation'},
{'color': [0, 170, 30],
'id': 32,
'isthing': 0,
'name': 'Water',
'supercategory': 'nature--water'},
{'color': [255, 255, 128],
'id': 33,
'isthing': 1,
'name': 'Banner',
'supercategory': 'object--banner'},
{'color': [250, 0, 30],
'id': 34,
'isthing': 1,
'name': 'Bench',
'supercategory': 'object--bench'},
{'color': [100, 140, 180],
'id': 35,
'isthing': 1,
'name': 'Bike Rack',
'supercategory': 'object--bike-rack'},
{'color': [220, 220, 220],
'id': 36,
'isthing': 1,
'name': 'Billboard',
'supercategory': 'object--billboard'},
{'color': [220, 128, 128],
'id': 37,
'isthing': 1,
'name': 'Catch Basin',
'supercategory': 'object--catch-basin'},
{'color': [222, 40, 40],
'id': 38,
'isthing': 1,
'name': 'CCTV Camera',
'supercategory': 'object--cctv-camera'},
{'color': [100, 170, 30],
'id': 39,
'isthing': 1,
'name': 'Fire Hydrant',
'supercategory': 'object--fire-hydrant'},
{'color': [40, 40, 40],
'id': 40,
'isthing': 1,
'name': 'Junction Box',
'supercategory': 'object--junction-box'},
{'color': [33, 33, 33],
'id': 41,
'isthing': 1,
'name': 'Mailbox',
'supercategory': 'object--mailbox'},
{'color': [100, 128, 160],
'id': 42,
'isthing': 1,
'name': 'Manhole',
'supercategory': 'object--manhole'},
{'color': [142, 0, 0],
'id': 43,
'isthing': 1,
'name': 'Phone Booth',
'supercategory': 'object--phone-booth'},
{'color': [70, 100, 150],
'id': 44,
'isthing': 0,
'name': 'Pothole',
'supercategory': 'object--pothole'},
{'color': [210, 170, 100],
'id': 45,
'isthing': 1,
'name': 'Street Light',
'supercategory': 'object--street-light'},
{'color': [153, 153, 153],
'id': 46,
'isthing': 1,
'name': 'Pole',
'supercategory': 'object--support--pole'},
{'color': [128, 128, 128],
'id': 47,
'isthing': 1,
'name': 'Traffic Sign Frame',
'supercategory': 'object--support--traffic-sign-frame'},
{'color': [0, 0, 80],
'id': 48,
'isthing': 1,
'name': 'Utility Pole',
'supercategory': 'object--support--utility-pole'},
{'color': [250, 170, 30],
'id': 49,
'isthing': 1,
'name': 'Traffic Light',
'supercategory': 'object--traffic-light'},
{'color': [192, 192, 192],
'id': 50,
'isthing': 1,
'name': 'Traffic Sign (Back)',
'supercategory': 'object--traffic-sign--back'},
{'color': [220, 220, 0],
'id': 51,
'isthing': 1,
'name': 'Traffic Sign (Front)',
'supercategory': 'object--traffic-sign--front'},
{'color': [140, 140, 20],
'id': 52,
'isthing': 1,
'name': 'Trash Can',
'supercategory': 'object--trash-can'},
{'color': [119, 11, 32],
'id': 53,
'isthing': 1,
'name': 'Bicycle',
'supercategory': 'object--vehicle--bicycle'},
{'color': [150, 0, 255],
'id': 54,
'isthing': 1,
'name': 'Boat',
'supercategory': 'object--vehicle--boat'},
{'color': [0, 60, 100],
'id': 55,
'isthing': 1,
'name': 'Bus',
'supercategory': 'object--vehicle--bus'},
{'color': [0, 0, 142],
'id': 56,
'isthing': 1,
'name': 'Car',
'supercategory': 'object--vehicle--car'},
{'color': [0, 0, 90],
'id': 57,
'isthing': 1,
'name': 'Caravan',
'supercategory': 'object--vehicle--caravan'},
{'color': [0, 0, 230],
'id': 58,
'isthing': 1,
'name': 'Motorcycle',
'supercategory': 'object--vehicle--motorcycle'},
{'color': [0, 80, 100],
'id': 59,
'isthing': 0,
'name': 'On Rails',
'supercategory': 'object--vehicle--on-rails'},
{'color': [128, 64, 64],
'id': 60,
'isthing': 1,
'name': 'Other Vehicle',
'supercategory': 'object--vehicle--other-vehicle'},
{'color': [0, 0, 110],
'id': 61,
'isthing': 1,
'name': 'Trailer',
'supercategory': 'object--vehicle--trailer'},
{'color': [0, 0, 70],
'id': 62,
'isthing': 1,
'name': 'Truck',
'supercategory': 'object--vehicle--truck'},
{'color': [0, 0, 192],
'id': 63,
'isthing': 1,
'name': 'Wheeled Slow',
'supercategory': 'object--vehicle--wheeled-slow'},
{'color': [32, 32, 32],
'id': 64,
'isthing': 0,
'name': 'Car Mount',
'supercategory': 'void--car-mount'},
{'color': [120, 10, 10],
'id': 65,
'isthing': 0,
'name': 'Ego Vehicle',
'supercategory': 'void--ego-vehicle'}
]
def load_mapillary_vistas_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
    """
    Load Mapillary Vistas panoptic annotations into a list of dataset dicts.

    Args:
        json_file (str): path to the panoptic annotation json file.
        image_dir (str): directory holding the images; each image is looked up
            under the annotation's file name with a ".jpg" extension.
        gt_dir (str): directory holding the panoptic annotation images.
        semseg_dir (str): directory holding the semantic label images.
        meta (dict): must provide "thing_dataset_id_to_contiguous_id" and
            "stuff_dataset_id_to_contiguous_id".

    Returns:
        list[dict]: one dict per annotation with the keys "file_name",
            "image_id", "pan_seg_file_name", "sem_seg_file_name" and
            "segments_info".
    """

    def _convert_category_id(segment_info, meta):
        # Rewrites the segment in place: the dataset category id becomes the
        # contiguous id and an "isthing" flag is attached.
        category_id = segment_info["category_id"]
        is_thing = category_id in meta["thing_dataset_id_to_contiguous_id"]
        if is_thing:
            id_map = meta["thing_dataset_id_to_contiguous_id"]
        else:
            id_map = meta["stuff_dataset_id_to_contiguous_id"]
        segment_info["category_id"] = id_map[category_id]
        segment_info["isthing"] = is_thing
        return segment_info

    with PathManager.open(json_file) as f:
        json_info = json.load(f)

    dataset_dicts = []
    for ann in json_info["annotations"]:
        image_id = ann["image_id"]
        label_name = ann["file_name"]
        # Image and label are assumed to share a file name and differ only in
        # extension; images are expected to be ".jpg".
        image_name = os.path.splitext(label_name)[0] + ".jpg"
        record = {
            "file_name": os.path.join(image_dir, image_name),
            "image_id": image_id,
            "pan_seg_file_name": os.path.join(gt_dir, label_name),
            "sem_seg_file_name": os.path.join(semseg_dir, label_name),
            "segments_info": [_convert_category_id(seg, meta) for seg in ann["segments_info"]],
        }
        dataset_dicts.append(record)

    assert len(dataset_dicts), f"No images found in {image_dir}!"
    # Spot-check that the first record's files exist on disk.
    first_record = dataset_dicts[0]
    for path_key in ("file_name", "pan_seg_file_name", "sem_seg_file_name"):
        assert PathManager.isfile(first_record[path_key]), first_record[path_key]
    return dataset_dicts
def register_mapillary_vistas_panoptic(
    name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None
):
    """
    Register a Mapillary Vistas panoptic segmentation dataset named `name`.

    The dataset is loaded lazily through `load_mapillary_vistas_panoptic_json`
    and its metadata is stored in the `MetadataCatalog` under the same name.

    Args:
        name (str): the name that identifies the dataset,
            e.g. "mapillary_vistas_panoptic_train"
        metadata (dict): extra metadata associated with this dataset; it is
            also handed to the loader to convert category ids.
        image_root (str): directory which contains all the images
        panoptic_root (str): directory which contains the panoptic annotation images
        semantic_root (str): directory which contains the semantic label images
        panoptic_json (str): path to the json panoptic annotation file
        instances_json (str): path to the json instance annotation file,
            recorded in the metadata as `json_file`
    """

    def _load_dataset():
        return load_mapillary_vistas_panoptic_json(
            panoptic_json, image_root, panoptic_root, semantic_root, metadata
        )

    DatasetCatalog.register(name, _load_dataset)

    catalog_entry = MetadataCatalog.get(name)
    catalog_entry.set(
        panoptic_root=panoptic_root,
        image_root=image_root,
        panoptic_json=panoptic_json,
        json_file=instances_json,
        evaluator_type="mapillary_vistas_panoptic_seg",
        ignore_label=65,  # different from other datasets, Mapillary Vistas sets ignore_label to 65
        label_divisor=1000,
        **metadata,
    )
# Predefined Mapillary Vistas panoptic splits (the ADE20K in the constant's
# name notwithstanding - every path below is a Mapillary Vistas path).
# Maps dataset name -> (image_root, panoptic_root, panoptic_json, semantic_root),
# in the order unpacked by register_all_mapillary_vistas_panoptic(). All paths
# are relative to the dataset root passed to that function.
_PREDEFINED_SPLITS_ADE20K_PANOPTIC = {
"mapillary_vistas_panoptic_train": (
"mapillary_vistas/training/images",
"mapillary_vistas/training/panoptic",
"mapillary_vistas/training/panoptic/panoptic_2018.json",
"mapillary_vistas/training/labels",
),
"mapillary_vistas_panoptic_val": (
"mapillary_vistas/validation/images",
"mapillary_vistas/validation/panoptic",
"mapillary_vistas/validation/panoptic/panoptic_2018.json",
"mapillary_vistas/validation/labels",
),
}
def get_metadata():
    """Assemble the panoptic metadata dict for Mapillary Vistas.

    Returns:
        dict: class names and colors under both "thing_*" and "stuff_*" keys,
        plus the two dataset-id -> contiguous-id mappings.
    """
    categories = MAPILLARY_VISTAS_SEM_SEG_CATEGORIES

    # Names and colors are indexed by contiguous id over ALL categories and
    # are duplicated under "thing_*" and "stuff_*": the D2 visualization code
    # treats thing and stuff classes differently, and keeping both naming
    # schemes lets the existing visualization functions be reused.
    meta = {
        "thing_classes": [cat["name"] for cat in categories],
        "thing_colors": [cat["color"] for cat in categories],
        "stuff_classes": [cat["name"] for cat in categories],
        "stuff_colors": [cat["color"] for cat in categories],
    }

    # Two id spaces exist: the original dataset category id (used for
    # evaluation) and the contiguous id in [0, #classes) used for training
    # the classifier. Here the contiguous id is the position in the list.
    meta["thing_dataset_id_to_contiguous_id"] = {
        cat["id"]: contiguous_id
        for contiguous_id, cat in enumerate(categories)
        if cat["isthing"]
    }
    # The stuff mapping intentionally covers every category (things
    # included) so that the sem_seg evaluator can be used.
    meta["stuff_dataset_id_to_contiguous_id"] = {
        cat["id"]: contiguous_id for contiguous_id, cat in enumerate(categories)
    }
    return meta
def register_all_mapillary_vistas_panoptic(root):
    """Register every predefined Mapillary Vistas panoptic split.

    Args:
        root (str): directory that the relative paths in
            `_PREDEFINED_SPLITS_ADE20K_PANOPTIC` are resolved against.
    """
    metadata = get_metadata()
    for dataset_name, split_paths in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items():
        image_root, panoptic_root, panoptic_json, semantic_root = split_paths
        # Keyword arguments make the path order explicit: the tuple stores
        # the json before the semantic root, the callee takes them reversed.
        register_mapillary_vistas_panoptic(
            dataset_name,
            metadata,
            image_root=os.path.join(root, image_root),
            panoptic_root=os.path.join(root, panoptic_root),
            semantic_root=os.path.join(root, semantic_root),
            panoptic_json=os.path.join(root, panoptic_json),
        )
# Register the panoptic splits as an import-time side effect. The dataset root
# is read from the DETECTRON2_DATASETS environment variable and falls back to
# the relative directory "datasets" when the variable is unset.
_root = os.getenv("DETECTRON2_DATASETS", "datasets")
register_all_mapillary_vistas_panoptic(_root)
================================================
FILE: mfvis_nococo/mask2former/evaluation/__init__.py
================================================
================================================
FILE: mfvis_nococo/mask2former/evaluation/__init__.py.new
================================================
================================================
FILE: mfvis_nococo/mask2former/evaluation/instance_evaluation.py
================================================
import contextlib
import copy
import io
import itertools
import json
import logging
import numpy as np
import os
import pickle
from collections import OrderedDict
import pycocotools.mask as mask_util
import torch
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from tabulate import tabulate
import detectron2.utils.comm as comm
from detectron2.config import CfgNode
from detectron2.data import MetadataCatalog
from detectron2.data.datasets.coco import convert_to_coco_json
from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco
from detectron2.evaluation.fast_eval_api import COCOeval_opt
from detectron2.structures import Boxes, BoxMode, pairwise_iou
from detectron2.utils.file_io import PathManager
from detectron2.utils.logger import create_small_table
# Modified from COCOEvaluator for instance segmentation.
class InstanceSegEvaluator(COCOEvaluator):
    """
    Evaluate AR for object proposals, AP for instance detection/segmentation, AP
    for keypoint detection outputs using COCO's metrics.
    See http://cocodataset.org/#detection-eval and
    http://cocodataset.org/#keypoints-eval to understand its metrics.
    The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
    the metric cannot be computed (e.g. due to no predictions made).

    In addition to COCO, this evaluator is able to support any bounding box detection,
    instance segmentation, or keypoint detection dataset.

    Only `_eval_predictions` is overridden here: when mapping predicted
    category ids back to dataset ids, it only requires each predicted id to
    be present in the metadata's mapping.
    """

    def _eval_predictions(self, predictions, img_ids=None):
        """
        Evaluate predictions. Fill self._results with the metrics of the tasks.

        Args:
            predictions: iterable of dicts whose "instances" entry is a list
                of COCO-format result dicts.
            img_ids: forwarded unchanged to `_evaluate_predictions_on_coco`.
        """
        self._logger.info("Preparing results for COCO format ...")
        coco_results = list(
            itertools.chain.from_iterable(pred["instances"] for pred in predictions)
        )
        tasks = self._tasks or self._tasks_from_predictions(coco_results)

        # Map contiguous training ids back to the dataset's category ids.
        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
            dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
            reverse_id_mapping = {
                contiguous: dataset for dataset, contiguous in dataset_id_to_contiguous_id.items()
            }
            for result in coco_results:
                category_id = result["category_id"]
                # Membership is the only requirement; the ids are not
                # asserted to form a dense [0, num_classes) range.
                assert category_id in reverse_id_mapping, (
                    f"A prediction has class={category_id}, "
                    f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
                )
                result["category_id"] = reverse_id_mapping[category_id]

        if self._output_dir:
            file_path = os.path.join(self._output_dir, "coco_instances_results.json")
            self._logger.info("Saving results to {}".format(file_path))
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(coco_results))
                f.flush()

        if not self._do_evaluation:
            self._logger.info("Annotations are not available for evaluation.")
            return

        api_kind = "unofficial" if self._use_fast_impl else "official"
        self._logger.info("Evaluating predictions with {} COCO API...".format(api_kind))

        for task in sorted(tasks):
            assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
            if len(coco_results) > 0:
                coco_eval = _evaluate_predictions_on_coco(
                    self._coco_api,
                    coco_results,
                    task,
                    kpt_oks_sigmas=self._kpt_oks_sigmas,
                    use_fast_impl=self._use_fast_impl,
                    img_ids=img_ids,
                    max_dets_per_image=self._max_dets_per_image,
                )
            else:
                # cocoapi does not handle empty results very well
                coco_eval = None

            self._results[task] = self._derive_coco_results(
                coco_eval, task, class_names=self._metadata.get("thing_classes")
            )
================================================
FILE: mfvis_nococo/mask2former/maskformer_model.py
================================================
from typing import Tuple
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
from detectron2.modeling.backbone import Backbone
from detectron2.modeling.postprocessing import sem_seg_postprocess
from detectron2.structures import Boxes, ImageList, Instances, BitMasks
from detectron2.utils.memory import retry_if_cuda_oom
from .modeling.criterion import SetCriterion
from .modeling.matcher import HungarianMatcher
from skimage import color
import cv2
import numpy as np
def unfold_wo_center(x, kernel_size, dilation):
    """Gather each pixel's neighborhood, leaving out the pixel itself.

    Args:
        x (Tensor): 4-D input, indexed below as (N, C, H, W).
        kernel_size (int): odd side length of the square neighborhood.
        dilation (int): spacing between neighborhood samples.

    Returns:
        Tensor: shape (N, C, kernel_size**2 - 1, H, W); positions that fall
        outside the input are filled by the zero padding of ``F.unfold``.
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    batch, channels, height, width = x.size(0), x.size(1), x.size(2), x.size(3)

    # "SAME" padding: half of the dilated kernel's extent on each side.
    dilated_extent = kernel_size + (dilation - 1) * (kernel_size - 1)
    patches = F.unfold(
        x,
        kernel_size=kernel_size,
        padding=dilated_extent // 2,
        dilation=dilation,
    ).reshape(batch, channels, -1, height, width)

    # The center of the window sits in the middle of the flattened kernel
    # axis; splice it out by joining the entries before and after it.
    center = (kernel_size ** 2) // 2
    before_center = patches[:, :, :center]
    after_center = patches[:, :, center + 1:]
    return torch.cat((before_center, after_center), dim=2)
def get_images_color_similarity(images, kernel_size, dilation):
    """Compute the color similarity between each pixel and its neighbors.

    Args:
        images (Tensor): 4-D tensor whose first dimension must be 1.
        kernel_size (int): neighborhood size passed to `unfold_wo_center`.
        dilation (int): neighborhood dilation passed to `unfold_wo_center`.

    Returns:
        Tensor: ``exp(-0.5 * ||pixel - neighbor||)`` where the norm is taken
        over dim 1, with one entry per neighbor position.
    """
    assert images.dim() == 4
    assert images.size(0) == 1

    neighbors = unfold_wo_center(images, kernel_size=kernel_size, dilation=dilation)
    # Add a neighbor axis to the image so it broadcasts against every
    # neighbor position returned by the unfold.
    color_distance = torch.norm(images.unsqueeze(2) - neighbors, dim=1)
    return torch.exp(color_distance * -0.5)
@META_ARCH_REGISTRY.register()
class MaskFormer(nn.Module):
"""
Main class for mask classification semantic segmentation architectures.
"""
@configurable
def __init__(
self,
*,
backbone: Backbone,
sem_seg_head: nn.Module,
criterion: nn.Module,
num_queries: int,
object_mask_threshold: float,
overlap_threshold: float,
metadata,
size_divisibility: int,
sem_seg_postprocess_before_inference: bool,
pixel_mean: Tuple[float],
pixel_std: Tuple[float],
# inference
semantic_on: bool,
panoptic_on: bool,
instance_on: bool,
test_topk_per_image: int,
):
"""
Args:
backbone: a backbone module, must follow detectron2's backbone interface
sem_seg_head: a module that predicts semantic segmentation from backbone features
criterion: a module that defines the loss
num_queries: int, number of queries
object_mask_threshold: float, threshold to filter query based on classification score
for panoptic segmentation inference
overlap_threshold: overlap threshold used in general inference for panoptic segmentation
metadata: dataset meta, get `thing` and `stuff` category names for panoptic
segmentation inference
size_divisibility: Some backbones require the input height and width to be divisible by a
specific integer. We can use this to override such requirement.
sem_seg_postprocess_before_inference: whether to resize the prediction back
to original input size before semantic segmentation inference or after.
For high-resolution dataset like Mapillary, resizing predictions before
inference will cause OOM error.
pixel_mean, pixel_std: list or tuple with #channels element, representing
the per-channel mean and std to be used to normalize the input image
semantic_on: bool, whether to output semantic segmentation prediction
instance_on: bool, whether to output instance segmentation prediction
panoptic_on: bool, whether to output panoptic segmentation prediction
test_topk_per_image: int, instance segmentation parameter, keep topk instances per image
"""
super().__init__()
self.backbone = backbone
self.sem_seg_head = sem_seg_head
self.criterion = criterion
self.num_queries = num_queries
self.overlap_threshold = overlap_threshold
self.object_mask_threshold = object_mask_threshold
self.metadata = metadata
if size_divisibility < 0:
# use backbone size_divisibility if not set
size_divisibility = self.backbone.size_divisibility
self.size_divisibility = size_divisibility
self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
# additional args
self.semantic_on = semantic_on
self.instance_on = instance_on
self.panoptic_on = panoptic_on
self.test_topk_per_image = test_topk_per_image
if not self.semantic_on:
assert self.sem_seg_postprocess_before_inference
@classmethod
def from_config(cls, cfg):
backbone = build_backbone(cfg)
sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
# Loss parameters:
deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT
# loss weights
class_weight = cfg.MODEL.MASK_FORMER.CLASS_WEIGHT
dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT
mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT
# building criterion
matcher = HungarianMatcher(
cost_class=class_weight,
cost_mask=mask_weight,
cost_dice=dice_weight,
num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
)
weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, "loss_dice": dice_weight, "loss_bound": mask_weight}
if deep_supervision:
dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS
aux_weight_dict = {}
for i in range(dec_layers - 1):
aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
weight_dict.update(aux_weight_dict)
losses = ["labels", "masks"]
criterion = SetCriterion(
sem_seg_head.num_classes,
matcher=matcher,
weight_dict=weight_dict,
eos_coef=no_object_weight,
losses=losses,
num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
oversample_ratio=cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO,
importance_sample_ratio=cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO,
)
return {
"backbone": backbone,
"sem_seg_head": sem_seg_head,
"criterion": criterion,
"num_queries": cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES,
"object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD,
"overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD,
"metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
"size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY,
"sem_seg_postprocess_before_inference": (
cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE
or cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON
or cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON
),
"pixel_mean": cfg.MODEL.PIXEL_MEAN,
"pixel_std": cfg.MODEL.PIXEL_STD,
# inference
"semantic_on": cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON,
"instance_on": cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON,
"panoptic_on": cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON,
"test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
}
@property
def device(self):
return self.pixel_mean.device
def forward(self, batched_inputs):
"""
Args:
batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
Each item in the list contains the inputs for one image.
For now, each item in the list is a dict that contains:
* "image": Tensor, image in (C, H, W) format.
* "instances": per-region ground truth
* Other information that's included in the original dicts, such as:
"height", "width" (int): the output resolution of the model (may be different
from input resolution), used in inference.
Returns:
list[dict]:
each dict has the results for one image. The dict contains the following keys:
* "sem_seg":
A Tensor that represents the
per-pixel segmentation prediced by the head.
The prediction has shape KxHxW that represents the logits of
each class for each pixel.
* "panoptic_seg":
A tuple that represent panoptic output
panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
segments_info (list[dict]): Describe each segment in `panoptic_seg`.
Each dict contains keys "id", "category_id", "isthing".
"""
images = [x["image"].to(self.device) for x in batched_inputs]
if self.training:
rs_images = ImageList.from_tensors(images, self.size_divisibility)
image_masks = [~ x["padding_mask"].to(self.device) for x in batched_inputs]
image_masks_back = [x["padding_mask"].to(self.device) for x in batched_inputs]
image_masks_bool = [((m.sum() / (m.shape[0] * m.shape[1])) > 0.25).float()*((m_b.sum() / (m.shape[0] * m.shape[1])) > 0.25).float() for m, m_b in zip(image_masks, image_masks_back)]
downsampled_images = F.avg_pool2d(rs_images.tensor.float(), kernel_size=4, stride=4, padding=0) #for img in images]
images_lab = [torch.as_tensor(color.rgb2lab(ds_image[[2, 1, 0]].byte().permute(1, 2, 0).cpu().numpy()), device=ds_image.device, dtype=torch.float32).permute(2, 0, 1) for ds_image in downsampled_images]
images_lab_sim = [get_images_color_similarity(img_lab.unsqueeze(0), 3, 2) * float(img_m_bool) for img_lab, img_m_bool in zip(images_lab, image_masks_bool)]
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
images = ImageList.from_tensors(images, self.size_divisibility)
features = self.backbone(images.tensor)
outputs = self.sem_seg_head(features)
if self.training:
# mask classification target
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
targets = self.prepare_targets(gt_instances, images)
else:
targets = None
# bipartite matching-based loss
losses = self.criterion(outputs, targets, images_lab_sim)
for k in list(losses.keys()):
if k in self.criterion.weight_dict:
losses[k] *= self.criterion.weight_dict[k]
else:
# remove this loss if not specified in `weight_dict`
losses.pop(k)
return losses
else:
mask_cls_results = outputs["pred_logits"]
mask_pred_results = outputs["pred_masks"]
# upsample masks
mask_pred_results = F.interpolate(
mask_pred_results,
size=(images.tensor.shape[-2], images.tensor.shape[-1]),
mode="bilinear",
align_corners=False,
)
del outputs
processed_results = []
for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
processed_results.append({})
if self.sem_seg_postprocess_before_inference:
mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
mask_pred_result, image_size, height, width
)
mask_cls_result = mask_cls_result.to(mask_pred_result)
# semantic segmentation inference
if self.semantic_on:
r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result)
if not self.sem_seg_postprocess_before_inference:
r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width)
processed_results[-1]["sem_seg"] = r
# panoptic segmentation inference
if self.panoptic_on:
panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result)
processed_results[-1]["panoptic_seg"] = panoptic_r
# instance segmentation inference
if self.instance_on:
instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result)
processed_results[-1]["instances"] = instance_r
return processed_results
def prepare_targets(self, targets, images):
h_pad, w_pad = images.tensor.shape[-2:]
new_targets = []
for targets_per_image in targets:
# pad gt
gt_masks = targets_per_image.gt_masks
padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device)
padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
new_targets.append(
{
"labels": targets_per_image.gt_classes,
"masks": padded_masks,
}
)
return new_targets
def semantic_inference(self, mask_cls, mask_pred):
mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
mask_pred = mask_pred.sigmoid()
semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
return semseg
def panoptic_inference(self, mask_cls, mask_pred):
scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
mask_pred = mask_pred.sigmoid()
keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold)
cur_scores = scores[keep]
cur_classes = labels[keep]
cur_masks = mask_pred[keep]
cur_mask_cls = mask_cls[keep]
cur_mask_cls = cur_mask_cls[:, :-1]
cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
h, w = cur_masks.shape[-2:]
panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
segments_info = []
current_segment_id = 0
if cur_masks.shape[0] == 0:
# We didn't detect any mask :(
return panoptic_seg, segments_info
else:
# take argmax
cur_mask_ids = cur_prob_masks.argmax(0)
stuff_memory_list = {}
for k in range(cur_classes.shape[0]):
pred_class = cur_classes[k].item()
isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values()
mask_area = (cur_mask_ids == k).sum().item()
original_area = (cur_masks[k] >= 0.5).sum().item()
mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)
if mask_area > 0 and original_area > 0 and mask.sum().item() > 0:
if mask_area / original_area < self.overlap_threshold:
continue
# merge stuff regions
if not isthing:
if int(pred_class) in stuff_memory_list.keys():
panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
continue
else:
stuff_memory_list[int(pred_class)] = current_segment_id + 1
current_segment_id += 1
panoptic_seg[mask] = current_segment_id
segments_info.append(
{
"id": current_segment_id,
"isthing": bool(isthing),
"category_id": int(pred_class),
}
)
return panoptic_seg, segments_info
def instance_inference(self, mask_cls, mask_pred):
    """Select the top-scoring (query, class) pairs and package them as instances.

    Args:
        mask_cls: class logits, one row per query; the last column is dropped
            after the softmax.
        mask_pred: mask logits, one map per query, already resized to the
            original input resolution.

    Returns:
        Instances: with ``pred_masks`` (binary float masks), ``pred_boxes``
        (boxes derived from the masks), ``scores`` (class score times mean
        in-mask probability) and ``pred_classes``.
    """
    # mask_pred is already processed to have the same shape as original input
    image_size = mask_pred.shape[-2:]
    num_classes = self.sem_seg_head.num_classes

    # [Q, K]
    scores = F.softmax(mask_cls, dim=-1)[:, :-1]
    labels = (
        torch.arange(num_classes, device=self.device)
        .unsqueeze(0)
        .repeat(self.num_queries, 1)
        .flatten(0, 1)
    )
    # Top-k over the flattened (query, class) grid, so one query can appear
    # several times with different classes.
    scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False)
    labels_per_image = labels[topk_indices]

    # Map the flat index back to its query index to pick the matching mask.
    topk_indices = topk_indices // num_classes
    mask_pred = mask_pred[topk_indices]

    # if this is panoptic segmentation, we only keep the "thing" classes
    if self.panoptic_on:
        thing_ids = set(self.metadata.thing_dataset_id_to_contiguous_id.values())
        # One host transfer for all labels instead of a device sync per element.
        keep = torch.tensor(
            [lab in thing_ids for lab in labels_per_image.tolist()],
            dtype=torch.bool,
            device=scores_per_image.device,
        )
        scores_per_image = scores_per_image[keep]
        labels_per_image = labels_per_image[keep]
        mask_pred = mask_pred[keep]

    binary_masks = mask_pred > 0
    result = Instances(image_size)
    result.pred_masks = binary_masks.float()
    result.pred_boxes = BitMasks(binary_masks).get_bounding_boxes()

    # calculate average mask prob inside each predicted mask
    flat_masks = result.pred_masks.flatten(1)
    mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * flat_masks).sum(1) / (flat_masks.sum(1) + 1e-6)
    result.scores = scores_per_image * mask_scores_per_image
    result.pred_classes = labels_per_image
    return result
================================================
FILE: mfvis_nococo/mask2former/modeling/__init__.py
================================================
from .backbone.swin import D2SwinTransformer
from .pixel_decoder.fpn import BasePixelDecoder
from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder
from .meta_arch.mask_former_head import MaskFormerHead
from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead
================================================
FILE: mfvis_nococo/mask2former/modeling/backbone/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mfvis_nococo/mask2former/modeling/backbone/__init__.py.new
================================================
================================================
FILE: mfvis_nococo/mask2former/modeling/backbone/swin.py
================================================
# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu, Yutong Lin, Yixuan Wei
# --------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/mmseg/models/backbones/swin_transformer.py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
class Mlp(nn.Module):
    """Two-layer feed-forward network: linear, activation, dropout, linear, dropout.

    Args:
        in_features (int): Size of the input features.
        hidden_features (int, optional): Hidden width; falls back to
            ``in_features`` when omitted (or otherwise falsy).
        out_features (int, optional): Output width; falls back to
            ``in_features`` when omitted (or otherwise falsy).
        act_layer: Factory for the activation module. Default: nn.GELU
        drop (float): Dropout probability used after both linear layers.
    """

    def __init__(
        self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0
    ):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        # The same dropout module is applied after each linear layer.
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the network to ``x`` and return the result."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
def window_partition(x, window_size):
    """Cut a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    batch, height, width, channels = x.shape
    rows = height // window_size
    cols = width // window_size
    tiled = x.view(batch, rows, window_size, cols, window_size, channels)
    # Bring the two window-grid axes together, followed by the two in-window axes.
    tiled = tiled.permute(0, 1, 3, 2, 4, 5).contiguous()
    return tiled.view(-1, window_size, window_size, channels)
def window_reverse(windows, window_size, H, W):
    """Stitch windows back into a feature map.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    rows = H // window_size
    cols = W // window_size
    # Exact integer arithmetic; avoids deriving the batch size through float division.
    B = windows.shape[0] // (rows * cols)
    x = windows.view(B, rows, cols, window_size, window_size, -1)
    # Interleave the window-grid axes with the in-window axes, then merge them.
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x
class WindowAttention(nn.Module):
    """Window based multi-head self attention (W-MSA) with relative position bias.

    Works for both shifted and non-shifted windows; the shifted case is
    handled by the optional ``mask`` passed to :meth:`forward`.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        per_head_dim = dim // num_heads
        # A falsy qk_scale (None or 0) selects the default scaling.
        self.scale = qk_scale if qk_scale else per_head_dim ** -0.5

        win_h, win_w = window_size[0], window_size[1]

        # One learnable bias per head for every possible relative offset.
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * win_h - 1) * (2 * win_w - 1), num_heads)
        )  # 2*Wh-1 * 2*Ww-1, nH

        # Lookup from (token, token) pairs into the bias table.
        self.register_buffer(
            "relative_position_index", self._relative_position_index(win_h, win_w)
        )

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=0.02)
        self.softmax = nn.Softmax(dim=-1)

    @staticmethod
    def _relative_position_index(win_h, win_w):
        """Return the (Wh*Ww, Wh*Ww) table index for every pair of window tokens."""
        grid = torch.stack(torch.meshgrid([torch.arange(win_h), torch.arange(win_w)]))  # 2, Wh, Ww
        flat = torch.flatten(grid, 1)  # 2, Wh*Ww
        offsets = flat[:, :, None] - flat[:, None, :]  # 2, Wh*Ww, Wh*Ww
        offsets = offsets.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        # shift both offsets to start from 0, then fold (row, col) into one index
        offsets[:, :, 0] += win_h - 1
        offsets[:, :, 1] += win_w - 1
        offsets[:, :, 0] *= 2 * win_w - 1
        return offsets.sum(-1)  # Wh*Ww, Wh*Ww

    def forward(self, x, mask=None):
        """Forward function.

        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        heads = self.num_heads
        qkv = self.qkv(x).reshape(B_, N, 3, heads, C // heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        attn = (q * self.scale) @ k.transpose(-2, -1)

        tokens = self.window_size[0] * self.window_size[1]
        bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        bias = bias.view(tokens, tokens, -1)  # Wh*Ww,Wh*Ww,nH
        bias = bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + bias.unsqueeze(0)

        if mask is not None:
            # Broadcast the per-window mask over batch and heads, then flatten back.
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, heads, N, N)

        attn = self.attn_drop(self.softmax(attn))

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        return self.proj_drop(self.proj(x))
class SwinTransformerBlock(nn.Module):
    """Swin Transformer Block: windowed attention followed by an MLP, both residual.

    The spatial size of the input is not passed to ``forward``; it is read from
    the ``H`` and ``W`` attributes, which must be set before each call.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(
        self,
        dim,
        num_heads,
        window_size=7,
        shift_size=0,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        # Stochastic depth is only instantiated when it would actually drop something.
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )

        # Spatial size of the token grid; assigned externally before forward().
        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C), with H and W taken
                from ``self.H`` and ``self.W``.
            mask_matrix: Attention mask for cyclic shift; only used when
                ``shift_size > 0``.
        """
        B, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        ws = self.window_size
        shift = self.shift_size
        use_shift = shift > 0

        residual = x
        x = self.norm1(x).view(B, H, W, C)

        # pad feature maps (right/bottom only) to multiples of window size
        pad_r = (ws - W % ws) % ws
        pad_b = (ws - H % ws) % ws
        x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))
        Hp, Wp = x.shape[1], x.shape[2]

        # cyclic shift
        if use_shift:
            x = torch.roll(x, shifts=(-shift, -shift), dims=(1, 2))
        attn_mask = mask_matrix if use_shift else None

        # partition windows: nW*B, window_size*window_size, C
        windows = window_partition(x, ws).view(-1, ws * ws, C)

        # W-MSA/SW-MSA
        windows = self.attn(windows, mask=attn_mask)

        # merge windows back to B H' W' C
        windows = windows.view(-1, ws, ws, C)
        x = window_reverse(windows, ws, Hp, Wp)

        # reverse cyclic shift
        if use_shift:
            x = torch.roll(x, shifts=(shift, shift), dims=(1, 2))

        # crop away the padding added above
        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()
        x = x.view(B, H * W, C)

        # residual connections around attention and FFN
        x = residual + self.drop_path(x)
        return x + self.drop_path(self.mlp(self.norm2(x)))
class PatchMerging(nn.Module):
    """Patch Merging Layer: halves the spatial size and doubles the channels.

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.

        Returns:
            Tensor of size (B, ceil(H/2)*ceil(W/2), 2*C).
        """
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        grid = x.view(B, H, W, C)

        # Odd sizes are padded by one row/column at the bottom/right.
        if H % 2 == 1 or W % 2 == 1:
            grid = F.pad(grid, (0, 0, 0, W % 2, 0, H % 2))

        even_rows = grid[:, 0::2]
        odd_rows = grid[:, 1::2]
        # Channel order: (even row, even col), (odd, even), (even, odd), (odd, odd).
        corners = [
            even_rows[:, :, 0::2, :],
            odd_rows[:, :, 0::2, :],
            even_rows[:, :, 1::2, :],
            odd_rows[:, :, 1::2, :],
        ]
        merged = torch.cat(corners, -1).view(B, -1, 4 * C)  # B H/2*W/2 4*C

        return self.reduction(self.norm(merged))
class BasicLayer(nn.Module):
    """A basic Swin Transformer layer for one stage.

    Blocks alternate between non-shifted (even index) and shifted (odd index)
    window attention; an optional downsample module runs after the blocks.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        num_heads (int): Number of attention head.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate,
            either shared by all blocks or one value per block. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(
        self,
        dim,
        depth,
        num_heads,
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        norm_layer=nn.LayerNorm,
        downsample=None,
        use_checkpoint=False,
    ):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # Accept tuples as well as lists for per-block rates; a tuple used to be
        # forwarded whole to the block and fail on the `drop_path > 0.0` check.
        per_block_drop_path = isinstance(drop_path, (list, tuple))

        # build blocks
        self.blocks = nn.ModuleList(
            [
                SwinTransformerBlock(
                    dim=dim,
                    num_heads=num_heads,
                    window_size=window_size,
                    shift_size=0 if (i % 2 == 0) else window_size // 2,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop,
                    attn_drop=attn_drop,
                    drop_path=drop_path[i] if per_block_drop_path else drop_path,
                    norm_layer=norm_layer,
                )
                for i in range(depth)
            ]
        )

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.

        Returns:
            tuple: ``(x, H, W, x_down, Wh, Ww)`` -- the stage output with its
            resolution, followed by the downsampled output with its resolution.
            Without a downsample module the second triple repeats the first.
        """
        ws = self.window_size
        shift = self.shift_size

        # calculate attention mask for SW-MSA on the padded grid
        Hp = int(np.ceil(H / ws)) * ws
        Wp = int(np.ceil(W / ws)) * ws
        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
        bands = (
            slice(0, -ws),
            slice(-ws, -shift),
            slice(-shift, None),
        )
        # Label each of the 3x3 regions with its own id.
        cnt = 0
        for h in bands:
            for w in bands:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(img_mask, ws)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, ws * ws)
        # Token pairs from different regions get -100 (suppressed), same region gets 0.
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
            attn_mask == 0, float(0.0)
        )

        for blk in self.blocks:
            # Blocks read their spatial size from attributes, not arguments.
            blk.H, blk.W = H, W
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, attn_mask)
            else:
                x = blk(x, attn_mask)

        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        return x, H, W, x, H, W
class PatchEmbed(nn.Module):
    """Image to Patch Embedding via a strided convolution.

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)
        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def forward(self, x):
        """Forward function.

        Pads the input on the right/bottom to a multiple of the patch size,
        projects it, and optionally normalizes over the channel dimension.
        """
        patch_h, patch_w = self.patch_size[0], self.patch_size[1]
        _, _, H, W = x.size()

        # padding
        overhang_w = W % patch_w
        if overhang_w != 0:
            x = F.pad(x, (0, patch_w - overhang_w))
        overhang_h = H % patch_h
        if overhang_h != 0:
            x = F.pad(x, (0, 0, 0, patch_h - overhang_h))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is None:
            return x

        # The norm acts on the last dim, so go through a (B, Wh*Ww, C) layout and back.
        Wh, Ww = x.size(2), x.size(3)
        tokens = self.norm(x.flatten(2).transpose(1, 2))
        return tokens.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
class SwinTransformer(nn.Module):
    """Swin Transformer backbone.

    A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
    https://arxiv.org/pdf/2103.14030

    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (Sequence[int]): Depths of each Swin Transformer stage.
        num_heads (Sequence[int]): Number of attention head of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(
        self,
        pretrain_img_size=224,
        patch_size=4,
        in_chans=3,
        embed_dim=96,
        depths=(2, 2, 6, 2),
        num_heads=(3, 6, 12, 24),
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.2,
        norm_layer=nn.LayerNorm,
        ape=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        use_checkpoint=False,
    ):
        super().__init__()

        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None,
        )

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [
                pretrain_img_size[0] // patch_size[0],
                pretrain_img_size[1] // patch_size[1],
            ]

            self.absolute_pos_embed = nn.Parameter(
                torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
            )
            trunc_normal_(self.absolute_pos_embed, std=0.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule: rate grows linearly over all blocks
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        # build layers; every stage doubles the channel count
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2 ** i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
                norm_layer=norm_layer,
                # the last stage keeps its resolution
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                use_checkpoint=use_checkpoint,
            )
            self.layers.append(layer)

        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output, registered as "norm{i}"
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f"norm{i_layer}"
            self.add_module(layer_name, layer)

        self._freeze_stages()

    def _freeze_stages(self):
        """Put the frozen parts in eval mode and stop their gradients."""
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.requires_grad = False

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.

        When ``pretrained`` is None, every ``nn.Linear`` gets truncated-normal
        weights (std 0.02) with zero bias, and every ``nn.LayerNorm`` gets unit
        weight and zero bias.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None. NOTE(review): no loading is implemented in
                this method; a non-None value leaves the weights untouched --
                presumably checkpoints are loaded by the caller, confirm.
        """

        def _init_weights(m):
            if isinstance(m, nn.Linear):
                trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.bias, 0)
                nn.init.constant_(m.weight, 1.0)

        if pretrained is None:
            # The initializer was previously defined but never applied.
            self.apply(_init_weights)

    def forward(self, x):
        """Forward function.

        Returns:
            dict: maps ``"res{i + 2}"`` to the normalized (B, C, H, W) output of
            stage ``i`` for every ``i`` in ``out_indices``.
        """
        x = self.patch_embed(x)

        Wh, Ww = x.size(2), x.size(3)
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(
                self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
            )
            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
        else:
            x = x.flatten(2).transpose(1, 2)
        x = self.pos_drop(x)

        outs = {}
        for i in range(self.num_layers):
            layer = self.layers[i]
            # x_out is the stage output; x is what feeds the next stage.
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)

            if i in self.out_indices:
                norm_layer = getattr(self, f"norm{i}")
                x_out = norm_layer(x_out)

                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
                outs["res{}".format(i + 2)] = out

        return outs

    def train(self, mode=True):
        """Convert the model into training mode while keeping frozen layers frozen."""
        super(SwinTransformer, self).train(mode)
        self._freeze_stages()
@BACKBONE_REGISTRY.register()
class D2SwinTransformer(SwinTransformer, Backbone):
    """Swin Transformer backbone configured from ``cfg.MODEL.SWIN``.

    Exposes the feature maps named in ``cfg.MODEL.SWIN.OUT_FEATURES`` together
    with their channel counts and strides.
    """

    def __init__(self, cfg, input_shape):
        swin_cfg = cfg.MODEL.SWIN

        super().__init__(
            pretrain_img_size=swin_cfg.PRETRAIN_IMG_SIZE,
            patch_size=swin_cfg.PATCH_SIZE,
            in_chans=3,
            embed_dim=swin_cfg.EMBED_DIM,
            depths=swin_cfg.DEPTHS,
            num_heads=swin_cfg.NUM_HEADS,
            window_size=swin_cfg.WINDOW_SIZE,
            mlp_ratio=swin_cfg.MLP_RATIO,
            qkv_bias=swin_cfg.QKV_BIAS,
            qk_scale=swin_cfg.QK_SCALE,
            drop_rate=swin_cfg.DROP_RATE,
            attn_drop_rate=swin_cfg.ATTN_DROP_RATE,
            drop_path_rate=swin_cfg.DROP_PATH_RATE,
            norm_layer=nn.LayerNorm,
            ape=swin_cfg.APE,
            patch_norm=swin_cfg.PATCH_NORM,
            use_checkpoint=swin_cfg.USE_CHECKPOINT,
        )

        self._out_features = cfg.MODEL.SWIN.OUT_FEATURES

        # Stage i is published as "res{i + 2}" with stride 2 ** (i + 2).
        self._out_feature_strides = {f"res{i + 2}": 2 ** (i + 2) for i in range(4)}
        self._out_feature_channels = {f"res{i + 2}": self.num_features[i] for i in range(4)}

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        assert (
            x.dim() == 4
        ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        features = super().forward(x)
        # Only hand back the feature maps that were requested in the config.
        return {name: feat for name, feat in features.items() if name in self._out_features}

    def output_shape(self):
        """Return a ``ShapeSpec`` (channels and stride) for every exposed feature."""
        shapes = {}
        for name in self._out_features:
            shapes[name] = ShapeSpec(
                channels=self._out_feature_channels[name],
                stride=self._out_feature_strides[name],
            )
        return shapes

    @property
    def size_divisibility(self):
        return 32
================================================
FILE: mfvis_nococo/mask2former/modeling/criterion.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
MaskFormer criterion.
"""
import logging
import torch
import torch.nn.functional as F
from torch import nn
from detectron2.utils.comm import get_world_size
from detectron2.projects.point_rend.point_features import (
get_uncertain_point_coords_with_randomness,
point_sample,
)
from ..utils.misc import is_dist_avail_and_initialized, nested_tensor_from_tensor_list
def unfold_wo_center(x, kernel_size, dilation):
    """Gather each pixel's neighbours in a dilated square window, excluding itself.

    Args:
        x: 4-D tensor (N, C, H, W).
        kernel_size (int): Odd side length of the neighbourhood.
        dilation (int): Spacing between neighbourhood samples.

    Returns:
        Tensor of shape (N, C, kernel_size**2 - 1, H, W); positions outside
        the image are filled by the zero padding of ``F.unfold``.
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    # using SAME padding so the output keeps the input's H and W
    pad = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    patches = F.unfold(x, kernel_size=kernel_size, padding=pad, dilation=dilation)
    patches = patches.reshape(x.size(0), x.size(1), -1, x.size(2), x.size(3))

    # remove the center pixels: the middle entry of the flattened window
    center = (kernel_size ** 2) // 2
    before = patches[:, :, :center]
    after = patches[:, :, center + 1:]
    return torch.cat((before, after), dim=2)
def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation):
    """Negative log-probability that a pixel and each neighbour get the same label.

    Args:
        mask_logits: 4-D tensor of mask logits; only channel 0 of the result
            is returned.
        pairwise_size (int): Neighbourhood side length.
        pairwise_dilation (int): Neighbourhood dilation.

    Returns:
        Tensor with one loss value per (sample, neighbour, y, x).
    """
    assert mask_logits.dim() == 4

    log_fg = F.logsigmoid(mask_logits)
    log_bg = F.logsigmoid(-mask_logits)

    neighbour_log_fg = unfold_wo_center(
        log_fg, kernel_size=pairwise_size, dilation=pairwise_dilation
    )
    neighbour_log_bg = unfold_wo_center(
        log_bg, kernel_size=pairwise_size, dilation=pairwise_dilation
    )

    # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j)
    # evaluated in log space (log-sum-exp with the max factored out) for stability
    both_fg = log_fg[:, :, None] + neighbour_log_fg
    both_bg = log_bg[:, :, None] + neighbour_log_bg
    peak = torch.max(both_fg, both_bg)
    log_same = torch.log(torch.exp(both_fg - peak) + torch.exp(both_bg - peak)) + peak

    # loss = -log(prob)
    return -log_same[:, 0]
def get_incoherent_mask(input_masks, sfact):
    """Flag pixels that change when the mask is downsampled and upsampled again.

    Args:
        input_masks: 4-D mask tensor (converted to float internally).
        sfact (int): Downsampling factor applied to height and width.

    Returns:
        Float tensor of the input's shape: 1.0 where the round-tripped mask
        differs from the original by more than 0.01, else 0.0.
    """
    original = input_masks.float()
    h, w = input_masks.shape[-2], input_masks.shape[-1]

    coarse = F.interpolate(original, (h // sfact, w // sfact), mode='bilinear')
    restored = F.interpolate(coarse, (h, w), mode='bilinear')

    deviation = (original - restored).abs()
    return (deviation > 0.01).float()
def dice_coefficient(x, target):
    """Per-instance dice loss with squared terms in the denominator.

    Args:
        x: Predictions; the first dimension indexes instances.
        target: Targets with the same number of elements per instance as ``x``.

    Returns:
        1-D tensor with one loss value per instance.
    """
    eps = 1e-5
    count = x.size(0)
    pred = x.reshape(count, -1)
    ref = target.reshape(count, -1)

    overlap = (pred * ref).sum(dim=1)
    # eps keeps the ratio finite when both inputs are all zero
    denom = (pred ** 2.0).sum(dim=1) + (ref ** 2.0).sum(dim=1) + eps
    return 1. - (2 * overlap / denom)
def compute_project_term(mask_scores, gt_bitmasks):
    """Dice loss between max-projections of prediction and target.

    Both inputs are reduced with a max over dim 2 and, separately, over dim 3
    (presumably height and width of (N, 1, H, W) tensors -- confirm with the
    caller); the two per-instance dice losses are summed and averaged.

    Args:
        mask_scores: Predicted mask scores.
        gt_bitmasks: Target masks, same layout as ``mask_scores``.

    Returns:
        Scalar tensor.
    """
    per_axis_losses = []
    for axis in (3, 2):
        projected_scores = mask_scores.max(dim=axis, keepdim=True)[0]
        projected_target = gt_bitmasks.max(dim=axis, keepdim=True)[0]
        per_axis_losses.append(dice_coefficient(projected_scores, projected_target))

    loss_along_3, loss_along_2 = per_axis_losses
    return (loss_along_3 + loss_along_2).mean()
def dice_loss(
        inputs: torch.Tensor,
        targets: torch.Tensor,
        num_masks: float,
    ):
    """
    Compute the DICE loss, similar to generalized IOU for masks.

    Args:
        inputs: A float tensor of logits; a sigmoid is applied and all
                dimensions after the first are flattened.
        targets: A float tensor matching the flattened inputs. Stores the binary
                 classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
        num_masks: Normalizer the summed loss is divided by.
    """
    probs = inputs.sigmoid().flatten(1)
    overlap = (probs * targets).sum(-1)
    total = probs.sum(-1) + targets.sum(-1)
    # The +1 in numerator and denominator smooths the ratio for empty masks.
    per_mask = 1 - (2 * overlap + 1) / (total + 1)
    return per_mask.sum() / num_masks


dice_loss_jit = torch.jit.script(
    dice_loss
)  # type: torch.jit.ScriptModule
def sigmoid_ce_loss(
        inputs: torch.Tensor,
        targets: torch.Tensor,
        num_masks: float,
    ):
    """
    Binary cross-entropy on logits, averaged over dim 1 and summed over masks.

    Args:
        inputs: A float tensor of logits.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
        num_masks: Normalizer the summed loss is divided by.
    Returns:
        Loss tensor
    """
    per_element = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    per_mask = per_element.mean(1)
    return per_mask.sum() / num_masks


sigmoid_ce_loss_jit = torch.jit.script(
    sigmoid_ce_loss
)  # type: torch.jit.ScriptModule
def calculate_uncertainty(logits):
    """
    Estimate uncertainty as the negated L1 distance between 0.0 and the logit
    prediction, so logits close to zero score highest.

    Args:
        logits (Tensor): A tensor of shape (R, 1, ...) holding mask logits;
            the second dimension must have size 1.
    Returns:
        scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with
            the most uncertain locations having the highest uncertainty score.
    """
    assert logits.shape[1] == 1
    # Work on a copy so the caller's tensor is never touched.
    magnitudes = torch.abs(logits.clone())
    return -magnitudes
class SetCriterion(nn.Module):
"""This class computes the loss for DETR.
The process happens in two steps:
1) we compute hungarian assignment between ground truth boxes and the outputs of the model
2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
"""
def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses,
num_points, oversample_ratio, importance_sample_ratio):
"""Create the criterion.
Parameters:
num_classes: number of object categories, omitting the special no-object category
matcher: module able to compute a matching between targets and proposals
weight_dict: dict containing as key the names of the losses and as values their relative weight.
eos_coef: relative classification weight applied to the no-object category
losses: list of all the losses to be applied. See get_loss for list of available losses.
"""
super().__init__()
self.num_classes = num_classes
self.matcher = matcher
self.weight_dict = weight_dict
self.eos_coef = eos_coef
self.losses = losses
empty_weight = torch.ones(self.num_classes + 1)
empty_weight[-1] = self.eos_coef
self.register_buffer("empty_weight", empty_weight)
# pointwise mask loss parameters
self.num_points = num_points
self.oversample_ratio = oversample_ratio
self.importance_sample_ratio = importance_sample_ratio
self.laplacian_kernel = torch.tensor([-1, -1, -1, -1, 8, -1, -1, -1, -1], dtype=torch.float32).reshape(1, 1, 3, 3).requires_grad_(False)
self.register_buffer("_iter", torch.zeros([1]))
self._warmup_iters = 1000 #20000
def loss_labels(self, outputs, targets, indices, num_masks):
    """Classification loss (weighted cross-entropy).

    targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes].
    Queries without a matched target are assigned the class ``self.num_classes``.

    Returns:
        dict: ``{"loss_ce": <scalar tensor>}``
    """
    assert "pred_logits" in outputs
    logits = outputs["pred_logits"].float()

    matched_queries = self._get_src_permutation_idx(indices)

    # Labels of the matched targets, concatenated over the batch.
    matched_labels = []
    for target, (_, tgt_ids) in zip(targets, indices):
        matched_labels.append(target["labels"][tgt_ids])
    matched_labels = torch.cat(matched_labels)

    # Start from "unmatched" everywhere, then write in the matched labels.
    target_classes = torch.full(
        logits.shape[:2], self.num_classes, dtype=torch.int64, device=logits.device
    )
    target_classes[matched_queries] = matched_labels

    # cross_entropy wants the class dimension second.
    loss_ce = F.cross_entropy(logits.transpose(1, 2), target_classes, self.empty_weight)
    return {"loss_ce": loss_ce}
def loss_masks_proj(self, outputs, targets, indices, num_masks, images_lab_sim):
    """Mask losses made of a projection term and a warmed-up pairwise term.

    Matched predictions are compared with their targets (resized bilinearly to
    the prediction resolution) through ``compute_project_term``; the pairwise
    term from ``compute_pairwise_term`` is averaged over positions where
    ``images_lab_sim >= 0.3`` inside the target, scaled by 0.25 and by a linear
    warm-up factor. ``num_masks`` is not used.

    Side effect: ``self._iter`` is incremented on every call of this method.

    Returns:
        dict with keys "loss_mask" (projection term) and "loss_bound" (pairwise term).
    """
    assert "pred_masks" in outputs
    self._iter += 1

    pred_sel = self._get_src_permutation_idx(indices)
    gt_sel = self._get_tgt_permutation_idx(indices)

    pred = outputs["pred_masks"][pred_sel]

    # TODO use valid to mask invalid areas due to padding in loss
    gt, valid = nested_tensor_from_tensor_list([t["masks"] for t in targets]).decompose()
    gt = gt.to(pred)[gt_sel]

    # Pick, for every matched prediction, the similarity entry of its batch element.
    batch_of_match = pred_sel[0].tolist()
    if len(batch_of_match) > 0:
        images_lab_sim = torch.cat([images_lab_sim[b] for b in batch_of_match])

    # No need to upsample predictions as we are using normalized coordinates :)
    # N x 1 x H x W
    pred = pred[:, None]
    gt = F.interpolate(gt[:, None], (pred.shape[-2], pred.shape[-1]), mode='bilinear')

    if pred.shape[0] > 0:
        projection_loss = compute_project_term(pred.sigmoid(), gt)
        pairwise = compute_pairwise_term(pred, 3, 2)

        # NOTE: this mask is computed but does not enter the weights below.
        incoherent = get_incoherent_mask(pred.detach().sigmoid() > 0.5, 2)
        incoherent = F.conv2d(
            incoherent, self.laplacian_kernel.to(incoherent.device), padding=1
        ).abs()
        incoherent = (incoherent > 0.5).float()

        weights = (images_lab_sim >= 0.3).float() * gt.float()
        boundary_loss = ((pairwise * weights).sum() / weights.sum().clamp(min=1.0)) * 0.25

        # Linear ramp from 0 to 1 over the first _warmup_iters calls.
        warmup = min(self._iter.item() / float(self._warmup_iters), 1.0)
        boundary_loss = boundary_loss * warmup
    else:
        # Nothing matched: emit zeros that are still derived from the predictions.
        projection_loss = pred.sum() * 0.
        boundary_loss = pred.sum() * 0.

    return {
        "loss_mask": projection_loss,
        "loss_bound": boundary_loss,
    }
def loss_masks(self, outputs, targets, indices, num_masks):
    """Point-sampled mask losses: sigmoid cross-entropy and dice.

    Each dict in ``targets`` must contain a "masks" tensor. Matched predictions
    and targets are compared only at sampled point coordinates; the coordinates
    and the target samples are produced without gradient tracking.

    Returns:
        dict with keys "loss_mask" and "loss_dice".
    """
    assert "pred_masks" in outputs

    pred_sel = self._get_src_permutation_idx(indices)
    gt_sel = self._get_tgt_permutation_idx(indices)

    pred = outputs["pred_masks"][pred_sel]

    # TODO use valid to mask invalid areas due to padding in loss
    gt, valid = nested_tensor_from_tensor_list([t["masks"] for t in targets]).decompose()
    gt = gt.to(pred)[gt_sel]

    # No need to upsample predictions as we are using normalized coordinates :)
    # N x 1 x H x W
    pred = pred[:, None]
    gt = gt[:, None]

    with torch.no_grad():
        # Where to sample, chosen from the predictions' uncertainty.
        coords = get_uncertain_point_coords_with_randomness(
            pred,
            lambda logits: calculate_uncertainty(logits),
            self.num_points,
            self.oversample_ratio,
            self.importance_sample_ratio,
        )
        # Ground-truth values at the sampled points.
        point_labels = point_sample(gt, coords, align_corners=False).squeeze(1)

    # Predicted logits at the same points (gradients flow through these).
    point_logits = point_sample(pred, coords, align_corners=False).squeeze(1)

    return {
        "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks),
        "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks),
    }
def _get_src_permutation_idx(self, indices):
# permute predictions following indices
batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
src_idx = torch.cat([src for (src, _) in indices])
return batch_idx, src_idx
def _get_tgt_permutation_idx(self, indices):
# permute targets following indices
batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
tgt_idx = torch.cat([tgt for (_, tgt) in indices])
return batch_idx, tgt_idx
def get_loss(self, loss, outputs, targets, indices, num_masks, images_lab_sim):
loss_map = {
'labels': self.loss_labels,
'masks': self.loss_masks_proj,
}
assert loss in loss_map, f"do you really want to compute {loss} loss?"
if loss == 'masks':
return loss_map[loss](outputs, targets, indices, num_masks, images_lab_sim)
else:
return loss_map[loss](outputs, targets, indices, num_masks)
def forward(self, outputs, targets, images_lab_sim):
    """Compute every requested loss for the final and the auxiliary outputs.

    Parameters:
        outputs: dict of tensors produced by the model; may contain an
            "aux_outputs" list with one dict per intermediate layer.
        targets: list of dicts, one per batch element. Each must contain
            "labels"; other required keys depend on the losses applied.
        images_lab_sim: forwarded unchanged to the losses through get_loss.

    Returns:
        dict of losses. Losses of the i-th auxiliary output carry the suffix "_i".
    """
    # Match the final-layer predictions (auxiliary outputs excluded) to the targets.
    final_outputs = {name: value for name, value in outputs.items() if name != "aux_outputs"}
    indices = self.matcher(final_outputs, targets)

    # Average number of targets per process (at least 1), used for normalisation.
    total_targets = sum(len(t["labels"]) for t in targets)
    device = next(iter(outputs.values())).device
    num_masks = torch.as_tensor([total_targets], dtype=torch.float, device=device)
    if is_dist_avail_and_initialized():
        torch.distributed.all_reduce(num_masks)
    num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()

    losses = {}
    for loss in self.losses:
        losses.update(
            self.get_loss(loss, outputs, targets, indices, num_masks, images_lab_sim)
        )

    # Auxiliary outputs are matched and scored independently, layer by layer.
    if "aux_outputs" in outputs:
        for i, aux_outputs in enumerate(outputs["aux_outputs"]):
            indices = self.matcher(aux_outputs, targets)
            for loss in self.losses:
                layer_losses = self.get_loss(
                    loss, aux_outputs, targets, indices, num_masks, images_lab_sim
                )
                losses.update({k + f"_{i}": v for k, v in layer_losses.items()})

    return losses
def __repr__(self):
head = "Criterion " + self.__class__.__name__
body = [
"matcher: {}".format(self.matcher.__repr__(_repr_indent=8)),
"losses: {}".format(self.losses),
"weight_dict: {}".format(self.weight_dict),
"num_classes: {}".format(self.num_classes),
"eos_coef: {}".format(self.eos_coef),
"num_points: {}".format(self.num_points),
"oversample_ratio: {}".format(self.oversample_ratio),
"importance_sample_ratio: {}".format(self.importance_sample_ratio),
]
_repr_indent = 4
lines = [head] + [" " * _repr_indent + line for line in body]
return "\n".join(lines)
================================================
FILE: mfvis_nococo/mask2former/modeling/matcher.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
"""
Modules to compute the matching cost and solve the corresponding LSAP.
"""
import torch
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from torch import nn
from torch.cuda.amp import autocast
from detectron2.projects.point_rend.point_features import point_sample
from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, generalized_multi_box_iou
def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
    """
    Pairwise DICE loss between every prediction and every target.
    Args:
        inputs: A float tensor whose first dimension indexes predictions; all
                remaining dimensions are flattened. Values are used as given
                (no sigmoid is applied here).
        targets: A 2-D float tensor indexed [target, element], with as many
                 elements per row as a flattened prediction
                 (0 for the negative class and 1 for the positive class).
    Returns:
        Tensor indexed [prediction, target] holding 1 - (2*overlap + 1) / (sum + 1).
    """
    flat = inputs.flatten(1)
    overlap = torch.einsum("nc,mc->nm", flat, targets)
    total = flat.sum(-1)[:, None] + targets.sum(-1)[None, :]
    return 1 - (2 * overlap + 1) / (total + 1)
# TorchScript-compiled version of batch_dice_loss, built once at import time.
batch_dice_loss_jit = torch.jit.script(
    batch_dice_loss
) # type: torch.jit.ScriptModule
def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
    """
    Pairwise binary cross-entropy between every prediction and every target.
    Args:
        inputs: A 2-D float tensor indexed [prediction, element]. It is passed
                straight to F.binary_cross_entropy, so it must already hold
                probabilities (no sigmoid is applied here).
        targets: A 2-D float tensor indexed [target, element]
                 (0 for the negative class and 1 for the positive class).
    Returns:
        Tensor indexed [prediction, target]: the cross-entropy summed over
        elements and divided by the number of elements per prediction.
    """
    num_elements = inputs.shape[1]

    # Element-wise cost of each prediction if the label were 1, resp. 0.
    cost_if_fg = F.binary_cross_entropy(inputs, torch.ones_like(inputs), reduction="none")
    cost_if_bg = F.binary_cross_entropy(inputs, torch.zeros_like(inputs), reduction="none")

    # Select the applicable cost per element with the actual labels.
    fg_term = torch.einsum("nc,mc->nm", cost_if_fg, targets)
    bg_term = torch.einsum("nc,mc->nm", cost_if_bg, (1 - targets))
    return (fg_term + bg_term) / num_elements
# TorchScript-compiled version of batch_sigmoid_ce_loss, built once at import time.
batch_sigmoid_ce_loss_jit = torch.jit.script(
    batch_sigmoid_ce_loss
) # type: torch.jit.ScriptModule
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
    """
    Fill the tight bounding box of every mask with ones, in place.

    For each mask, the smallest axis-aligned rectangle containing all of its
    non-zero elements is set to 1.0. The rectangle includes its last row and
    last column. Masks without any non-zero element are left untouched.

    Args:
        masks (Tensor[N, H, W]): masks to transform where N is the number of masks
            and (H, W) are the spatial dimensions. Modified in place.

    Returns:
        Tensor[N, H, W]: the same tensor object that was passed in (not a
        tensor of box coordinates).
    """
    if masks.numel() == 0:
        return masks

    for index, mask in enumerate(masks):
        ys, xs = torch.where(mask != 0)
        if ys.numel() == 0:
            continue
        top, bottom = int(ys.min()), int(ys.max())
        left, right = int(xs.min()), int(xs.max())
        # Slices are half-open, so +1 is needed to cover the max row/column.
        masks[index, top:bottom + 1, left:right + 1] = 1.0

    return masks
def masks_to_boxes_cc(masks: torch.Tensor) -> torch.Tensor:
    """
    Compute normalized bounding boxes around the provided masks.

    Each box is ``(x1, y1, x2, y2)``: the min/max column index of the non-zero
    elements divided by W and the min/max row index divided by H. A mask with
    no non-zero element yields an all-zero box.

    Args:
        masks (Tensor[N, H, W]): masks to transform where N is the number of masks
            and (H, W) are the spatial dimensions.

    Returns:
        Tensor[N, 4]: float bounding boxes on the device of ``masks``; a
        [0, 4] tensor when ``masks`` has no elements.
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device, dtype=torch.float)

    count, height, width = masks.shape[0], masks.shape[1], masks.shape[2]
    boxes = torch.zeros((count, 4), device=masks.device, dtype=torch.float)

    for row, mask in enumerate(masks):
        ys, xs = torch.where(mask != 0)
        if len(xs) * len(ys) == 0:
            continue
        boxes[row] = torch.stack([
            xs.min() / float(width),
            ys.min() / float(height),
            xs.max() / float(width),
            ys.max() / float(height),
        ])

    return boxes
class HungarianMatcher(nn.Module):
    """Assigns predictions to targets by solving a linear sum assignment problem.

    Targets do not include a "no-object" entry, so there are usually more
    predictions than targets. The best predictions are matched one-to-one and
    the remaining ones stay unmatched.

    The matching cost implemented here combines the class probability, the
    distance between boxes derived from the masks, and the generalized IoU of
    those boxes.
    """

    def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0):
        """Creates the matcher

        Params:
            cost_class: weight of the classification term in the matching cost
            cost_mask: stored and reported by __repr__; not part of the cost computed in this class
            cost_dice: stored and reported by __repr__; not part of the cost computed in this class
            num_points: stored; not used by the matching computed in this class

        The weights of the box-distance term (5.0) and the GIoU term (2.0) are fixed.
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_mask = cost_mask
        self.cost_dice = cost_dice
        self.cost_giou = 2.0
        self.cost_bbox = 5.0

        assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0"

        self.num_points = num_points

    @torch.no_grad()
    def memory_efficient_forward(self, outputs, targets):
        """Match one batch element at a time to keep memory usage low."""
        pred_logits = outputs["pred_logits"]
        pred_masks = outputs["pred_masks"]
        batch_size, num_queries = pred_logits.shape[:2]

        assignments = []
        for b in range(batch_size):
            # Classification cost: negative probability of each target's class.
            # (1 - proba would differ only by a constant, which does not change the matching.)
            probs = pred_logits[b].softmax(-1)  # [num_queries, num_classes]
            cost_class = -probs[:, targets[b]["labels"]]

            # Boxes around the thresholded predicted masks.
            query_masks = pred_masks[b]  # [num_queries, H_pred, W_pred]
            query_boxes = masks_to_boxes_cc((query_masks.sigmoid() > 0.5).float())

            # Boxes around the ground-truth masks (expected to be padded already
            # when the targets were prepared).
            gt_masks = targets[b]["masks"].to(query_masks)
            gt_boxes = masks_to_boxes_cc(gt_masks)

            with autocast(enabled=False):
                cost_bbox = torch.cdist(query_boxes, gt_boxes)
                cost_giou = -generalized_box_iou(query_boxes, gt_boxes)

            # Diagnostics for NaNs in the box costs.
            for label, cost in (('cost_bbox:', cost_bbox), ('cost_giou:', cost_giou)):
                if torch.isnan(cost).any():
                    print(label, cost)

            total_cost = (
                self.cost_bbox * cost_bbox
                + self.cost_class * cost_class
                + self.cost_giou * cost_giou
            )
            total_cost = total_cost.reshape(num_queries, -1).cpu()
            assignments.append(linear_sum_assignment(total_cost))

        matched = []
        for query_ids, target_ids in assignments:
            matched.append(
                (
                    torch.as_tensor(query_ids, dtype=torch.int64),
                    torch.as_tensor(target_ids, dtype=torch.int64),
                )
            )
        return matched

    @torch.no_grad()
    def forward(self, outputs, targets):
        """Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] containing the class labels
                 "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        return self.memory_efficient_forward(outputs, targets)

    def __repr__(self, _repr_indent=4):
        """Header line followed by one line per reported cost weight, indented by ``_repr_indent``."""
        pad = " " * _repr_indent
        entries = (
            "cost_class: {}".format(self.cost_class),
            "cost_mask: {}".format(self.cost_mask),
            "cost_dice: {}".format(self.cost_dice),
        )
        header = "Matcher " + self.__class__.__name__
        return "\n".join([header, *(pad + entry for entry in entries)])
================================================
FILE: mfvis_nococo/mask2former/modeling/meta_arch/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mfvis_nococo/mask2former/modeling/meta_arch/__init__.py.new
================================================
================================================
FILE: mfvis_nococo/mask2former/modeling/meta_arch/mask_former_head.py
================================================
import logging
from copy import deepcopy
from typing import Callable, Dict, List, Optional, Tuple, Union
import fvcore.nn.weight_init as weight_init
from torch import nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import Conv2d, ShapeSpec, get_norm
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder
from ..pixel_decoder.fpn import build_pixel_decoder
@SEM_SEG_HEADS_REGISTRY.register()
class MaskFormerHead(nn.Module):
    """Segmentation head that chains a pixel decoder and a transformer predictor."""

    _version = 2

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        """Checkpoint-loading hook for state dicts saved with version < 2.

        The key remapping is switched off in this file, so no key is renamed
        and the upgrade warning never fires.
        """
        version = local_metadata.get("version", None)
        if not (version is None or version < 2):
            return

        logger = logging.getLogger(__name__)
        # Do not warn if train from scratch
        converted = False
        for k in list(state_dict.keys()):
            newk = k
            # Disabled remapping (the original wrapped it in a string literal):
            #   if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
            #       newk = k.replace(prefix, prefix + "pixel_decoder.")
            if newk == k:
                continue
            state_dict[newk] = state_dict.pop(k)
            converted = True

        if converted:
            logger.warning(
                f"Weight format of {self.__class__.__name__} have changed! "
                "Please upgrade your models. Applying automatic conversion now ..."
            )

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        num_classes: int,
        pixel_decoder: nn.Module,
        loss_weight: float = 1.0,
        ignore_value: int = -1,
        # extra parameters
        transformer_predictor: nn.Module,
        transformer_in_feature: str,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            num_classes: number of classes to predict
            pixel_decoder: the pixel decoder module
            loss_weight: loss weight
            ignore_value: category id to be ignored during training.
            transformer_predictor: the transformer decoder that makes prediction
            transformer_in_feature: input feature name to the transformer_predictor
        """
        super().__init__()
        # Feature names ordered from the smallest to the largest stride.
        by_stride = sorted(input_shape.items(), key=lambda item: item[1].stride)
        self.in_features = [name for name, _ in by_stride]

        self.ignore_value = ignore_value
        self.common_stride = 4
        self.loss_weight = loss_weight

        self.pixel_decoder = pixel_decoder
        self.predictor = transformer_predictor
        self.transformer_in_feature = transformer_in_feature

        self.num_classes = num_classes

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Translate ``cfg`` into the keyword arguments of ``__init__``."""
        head_cfg = cfg.MODEL.SEM_SEG_HEAD
        in_feature = cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE

        # figure out in_channels to transformer predictor
        if in_feature in ("transformer_encoder", "multi_scale_pixel_decoder"):
            # "multi_scale_pixel_decoder" is the maskformer2 setting
            predictor_in_channels = head_cfg.CONVS_DIM
        elif in_feature == "pixel_embedding":
            predictor_in_channels = head_cfg.MASK_DIM
        else:
            predictor_in_channels = input_shape[in_feature].channels

        # The pixel decoder is built before the transformer decoder.
        return {
            "input_shape": {
                k: v for k, v in input_shape.items() if k in head_cfg.IN_FEATURES
            },
            "ignore_value": head_cfg.IGNORE_VALUE,
            "num_classes": head_cfg.NUM_CLASSES,
            "pixel_decoder": build_pixel_decoder(cfg, input_shape),
            "loss_weight": head_cfg.LOSS_WEIGHT,
            "transformer_in_feature": in_feature,
            "transformer_predictor": build_transformer_decoder(
                cfg,
                predictor_in_channels,
                mask_classification=True,
            ),
        }

    def forward(self, features, mask=None):
        """Run the head; see ``layers``."""
        return self.layers(features, mask)

    def layers(self, features, mask=None):
        """Decode ``features`` and feed the configured feature to the predictor.

        Which tensor is given to the predictor as its first argument depends on
        ``self.transformer_in_feature``; the mask features are always passed second.
        """
        mask_features, encoder_features, multi_scale_features = (
            self.pixel_decoder.forward_features(features)
        )
        source = self.transformer_in_feature

        if source == "multi_scale_pixel_decoder":
            return self.predictor(multi_scale_features, mask_features, mask)
        if source == "transformer_encoder":
            assert (
                encoder_features is not None
            ), "Please use the TransformerEncoderPixelDecoder."
            return self.predictor(encoder_features, mask_features, mask)
        if source == "pixel_embedding":
            return self.predictor(mask_features, mask_features, mask)
        return self.predictor(features[source], mask_features, mask)
================================================
FILE: mfvis_nococo/mask2former/modeling/meta_arch/per_pixel_baseline.py
================================================
import logging
from typing import Callable, Dict, List, Optional, Tuple, Union
import fvcore.nn.weight_init as weight_init
from torch import nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import Conv2d, ShapeSpec, get_norm
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
from ..transformer_decoder.maskformer_transformer_decoder import StandardTransformerDecoder
from ..pixel_decoder.fpn import build_pixel_decoder
@SEM_SEG_HEADS_REGISTRY.register()
class PerPixelBaselineHead(nn.Module):
    """Per-pixel classification head: a pixel decoder followed by a 1x1 conv classifier."""

    _version = 2

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        """Rename keys of state dicts saved with version < 2 (or without a version).

        Keys that contain "sem_seg_head" and do not start with
        ``prefix + "predictor"`` get ``prefix`` replaced by
        ``prefix + "pixel_decoder."``. ``state_dict`` is edited in place and a
        warning is logged if at least one key was renamed.
        """
        version = local_metadata.get("version", None)
        if not (version is None or version < 2):
            return

        logger = logging.getLogger(__name__)
        # Do not warn if train from scratch
        converted = False
        for old_key in list(state_dict.keys()):
            needs_move = "sem_seg_head" in old_key and not old_key.startswith(prefix + "predictor")
            new_key = old_key.replace(prefix, prefix + "pixel_decoder.") if needs_move else old_key
            if new_key == old_key:
                continue
            state_dict[new_key] = state_dict.pop(old_key)
            converted = True

        if converted:
            logger.warning(
                f"Weight format of {self.__class__.__name__} have changed! "
                "Please upgrade your models. Applying automatic conversion now ..."
            )

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        num_classes: int,
        pixel_decoder: nn.Module,
        loss_weight: float = 1.0,
        ignore_value: int = -1,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            num_classes: number of classes to predict
            pixel_decoder: the pixel decoder module
            loss_weight: loss weight
            ignore_value: category id to be ignored during training.
        """
        super().__init__()
        # Feature names ordered from the smallest to the largest stride.
        by_stride = sorted(input_shape.items(), key=lambda item: item[1].stride)
        self.in_features = [name for name, _ in by_stride]

        self.ignore_value = ignore_value
        self.common_stride = 4
        self.loss_weight = loss_weight

        self.pixel_decoder = pixel_decoder
        # 1x1 conv mapping the decoder's mask features to per-class scores.
        self.predictor = Conv2d(
            self.pixel_decoder.mask_dim, num_classes, kernel_size=1, stride=1, padding=0
        )
        weight_init.c2_msra_fill(self.predictor)

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Translate ``cfg`` into the keyword arguments of ``__init__``."""
        head_cfg = cfg.MODEL.SEM_SEG_HEAD
        return {
            "input_shape": {
                k: v for k, v in input_shape.items() if k in head_cfg.IN_FEATURES
            },
            "ignore_value": head_cfg.IGNORE_VALUE,
            "num_classes": head_cfg.NUM_CLASSES,
            "pixel_decoder": build_pixel_decoder(cfg, input_shape),
            "loss_weight": head_cfg.LOSS_WEIGHT,
        }

    def forward(self, features, targets=None):
        """
        Returns:
            In training, returns (None, dict of losses)
            In inference, returns (CxHxW logits, {})
        """
        logits = self.layers(features)
        if self.training:
            return None, self.losses(logits, targets)
        # Inference: upsample the logits by the head's common stride.
        upsampled = F.interpolate(
            logits, scale_factor=self.common_stride, mode="bilinear", align_corners=False
        )
        return upsampled, {}

    def layers(self, features):
        """Pixel decoder followed by the classifier; returns the classifier output."""
        mask_features, _, _ = self.pixel_decoder.forward_features(features)
        return self.predictor(mask_features)

    def losses(self, predictions, targets):
        """Cross-entropy between the upsampled predictions and ``targets``.

        Returns a dict with the single key "loss_sem_seg", already scaled by
        ``self.loss_weight``.
        """
        # Cast to float first: https://github.com/pytorch/pytorch/issues/48163
        upsampled = F.interpolate(
            predictions.float(),
            scale_factor=self.common_stride,
            mode="bilinear",
            align_corners=False,
        )
        loss = F.cross_entropy(
            upsampled, targets, reduction="mean", ignore_index=self.ignore_value
        )
        return {"loss_sem_seg": loss * self.loss_weight}
@SEM_SEG_HEADS_REGISTRY.register()
class PerPixelBaselinePlusHead(PerPixelBaselineHead):
    """Per-pixel baseline whose classifier is replaced by a transformer predictor."""

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        """Rename keys of state dicts saved with version < 2 (or without a version).

        Keys that contain "sem_seg_head" and do not start with
        ``prefix + "predictor"`` get ``prefix`` replaced by
        ``prefix + "pixel_decoder."``; each rename is logged at debug level.
        ``state_dict`` is edited in place and a warning is logged if at least
        one key was renamed.
        """
        version = local_metadata.get("version", None)
        if not (version is None or version < 2):
            return

        logger = logging.getLogger(__name__)
        # Do not warn if train from scratch
        converted = False
        for k in list(state_dict.keys()):
            newk = k
            if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
                newk = k.replace(prefix, prefix + "pixel_decoder.")
                logger.debug(f"{k} ==> {newk}")
            if newk == k:
                continue
            state_dict[newk] = state_dict.pop(k)
            converted = True

        if converted:
            logger.warning(
                f"Weight format of {self.__class__.__name__} have changed! "
                "Please upgrade your models. Applying automatic conversion now ..."
            )

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        # extra parameters
        transformer_predictor: nn.Module,
        transformer_in_feature: str,
        deep_supervision: bool,
        # inherit parameters
        num_classes: int,
        pixel_decoder: nn.Module,
        loss_weight: float = 1.0,
        ignore_value: int = -1,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            transformer_predictor: the transformer decoder that makes prediction
            transformer_in_feature: input feature name to the transformer_predictor
            deep_supervision: whether or not to add supervision to the output of
                every transformer decoder layer
            num_classes: number of classes to predict
            pixel_decoder: the pixel decoder module
            loss_weight: loss weight
            ignore_value: category id to be ignored during training.
        """
        super().__init__(
            input_shape,
            num_classes=num_classes,
            pixel_decoder=pixel_decoder,
            loss_weight=loss_weight,
            ignore_value=ignore_value,
        )

        # Swap the conv classifier created by the parent for the transformer predictor.
        del self.predictor

        self.predictor = transformer_predictor
        self.transformer_in_feature = transformer_in_feature
        self.deep_supervision = deep_supervision

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Extend the parent's arguments with the transformer-specific ones."""
        ret = super().from_config(cfg, input_shape)
        in_feature = cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE
        ret["transformer_in_feature"] = in_feature

        if in_feature == "transformer_encoder":
            in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
        else:
            in_channels = input_shape[ret["transformer_in_feature"]].channels

        ret["transformer_predictor"] = StandardTransformerDecoder(
            cfg, in_channels, mask_classification=False
        )
        ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
        return ret

    def forward(self, features, targets=None):
        """
        Returns:
            In training, returns (None, dict of losses)
            In inference, returns (CxHxW logits, {})
        """
        x, aux_outputs = self.layers(features)

        if not self.training:
            upsampled = F.interpolate(
                x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
            )
            return upsampled, {}

        losses = self.losses(x, targets)
        if self.deep_supervision:
            # One extra loss per auxiliary output, suffixed with its index.
            for i, aux_output in enumerate(aux_outputs):
                losses["loss_sem_seg" + f"_{i}"] = self.losses(
                    aux_output["pred_masks"], targets
                )["loss_sem_seg"]
        return None, losses

    def layers(self, features):
        """Run decoder and predictor.

        Returns ``(pred_masks, aux_outputs)``; ``aux_outputs`` is None unless
        deep supervision is enabled.
        """
        mask_features, encoder_features, _ = self.pixel_decoder.forward_features(features)

        if self.transformer_in_feature == "transformer_encoder":
            assert (
                encoder_features is not None
            ), "Please use the TransformerEncoderPixelDecoder."
            predictions = self.predictor(encoder_features, mask_features)
        else:
            predictions = self.predictor(features[self.transformer_in_feature], mask_features)

        aux_outputs = predictions["aux_outputs"] if self.deep_supervision else None
        return predictions["pred_masks"], aux_outputs
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/__init__.py.new
================================================
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/fpn.py
================================================
import logging
import numpy as np
from typing import Callable, Dict, List, Optional, Tuple, Union
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
from torch.cuda.amp import autocast
from detectron2.config import configurable
from detectron2.layers import Conv2d, DeformConv, ShapeSpec, get_norm
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
from ..transformer_decoder.position_encoding import PositionEmbeddingSine
from ..transformer_decoder.transformer import TransformerEncoder, TransformerEncoderLayer, _get_clones, _get_activation_fn
def build_pixel_decoder(cfg, input_shape):
    """
    Build a pixel decoder from `cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME`.

    The name is looked up in SEM_SEG_HEADS_REGISTRY and the registered class is
    instantiated with ``(cfg, input_shape)``.

    Raises:
        ValueError: if the built model has no callable ``forward_features``.
    """
    name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME
    decoder_cls = SEM_SEG_HEADS_REGISTRY.get(name)
    model = decoder_cls(cfg, input_shape)

    if callable(getattr(model, "forward_features", None)):
        return model

    raise ValueError(
        "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. "
        f"Please implement forward_features for {name} to only return mask features."
    )
# This is a modified FPN decoder.
@SEM_SEG_HEADS_REGISTRY.register()
class BasePixelDecoder(nn.Module):
    """FPN-style pixel decoder producing mask features and multi-scale features."""

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        conv_dim: int,
        mask_dim: int,
        norm: Optional[Union[str, Callable]] = None,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            conv_dim: number of output channels for the intermediate conv layers.
            mask_dim: number of output channels for the final conv layer.
            norm (str or callable): normalization for all conv layers
        """
        super().__init__()

        by_stride = sorted(input_shape.items(), key=lambda item: item[1].stride)
        self.in_features = [name for name, _ in by_stride]  # starting from "res2" to "res5"
        channels_per_level = [spec.channels for _, spec in by_stride]
        last_level = len(self.in_features) - 1

        # Convs carry a bias only when norm is the empty string.
        use_bias = norm == ""

        lateral_convs, output_convs = [], []
        for idx, in_channels in enumerate(channels_per_level):
            layer_name = "layer_{}".format(idx + 1)
            if idx == last_level:
                # The largest-stride level has no lateral connection.
                output_norm = get_norm(norm, conv_dim)
                output_conv = Conv2d(
                    in_channels,
                    conv_dim,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=use_bias,
                    norm=output_norm,
                    activation=F.relu,
                )
                weight_init.c2_xavier_fill(output_conv)
                self.add_module(layer_name, output_conv)
                lateral_conv = None
            else:
                lateral_norm = get_norm(norm, conv_dim)
                output_norm = get_norm(norm, conv_dim)
                lateral_conv = Conv2d(
                    in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm
                )
                output_conv = Conv2d(
                    conv_dim,
                    conv_dim,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=use_bias,
                    norm=output_norm,
                    activation=F.relu,
                )
                weight_init.c2_xavier_fill(lateral_conv)
                weight_init.c2_xavier_fill(output_conv)
                self.add_module("adapter_{}".format(idx + 1), lateral_conv)
                self.add_module(layer_name, output_conv)

            lateral_convs.append(lateral_conv)
            output_convs.append(output_conv)

        # Store the convs in top-down order (from low to high resolution)
        # so that forward_features can walk them front to back.
        self.lateral_convs = lateral_convs[::-1]
        self.output_convs = output_convs[::-1]

        self.mask_dim = mask_dim
        self.mask_features = Conv2d(
            conv_dim,
            mask_dim,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        weight_init.c2_xavier_fill(self.mask_features)

        self.maskformer_num_feature_levels = 3  # always use 3 scales

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Translate ``cfg`` into the keyword arguments of ``__init__``."""
        head_cfg = cfg.MODEL.SEM_SEG_HEAD
        return {
            "input_shape": {
                k: v for k, v in input_shape.items() if k in head_cfg.IN_FEATURES
            },
            "conv_dim": head_cfg.CONVS_DIM,
            "mask_dim": head_cfg.MASK_DIM,
            "norm": head_cfg.NORM,
        }

    def forward_features(self, features):
        """Top-down pass over the input features.

        Returns:
            (mask features, None, list of the first
            ``maskformer_num_feature_levels`` intermediate outputs).
        """
        multi_scale_features = []
        # Walk the feature maps in top-down order (from low to high resolution).
        levels = zip(self.in_features[::-1], self.lateral_convs, self.output_convs)
        for name, lateral_conv, output_conv in levels:
            x = features[name]
            if lateral_conv is None:
                y = output_conv(x)
            else:
                cur_fpn = lateral_conv(x)
                # Following FPN implementation, we use nearest upsampling here
                y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
                y = output_conv(y)
            if len(multi_scale_features) < self.maskformer_num_feature_levels:
                multi_scale_features.append(y)
        return self.mask_features(y), None, multi_scale_features

    def forward(self, features, targets=None):
        """Log a warning, then delegate to ``forward_features``."""
        logging.getLogger(__name__).warning(
            "Calling forward() may cause unpredicted behavior of PixelDecoder module."
        )
        return self.forward_features(features)
class TransformerEncoderOnly(nn.Module):
    """Transformer encoder stack applied to a 2-D feature map."""

    def __init__(
        self,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        """Build ``num_encoder_layers`` encoder layers.

        A final LayerNorm is added only when ``normalize_before`` is set.
        """
        super().__init__()

        layer = TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
        )
        final_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(layer, num_encoder_layers, final_norm)

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        """Xavier-uniform initialise every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def forward(self, src, mask, pos_embed):
        """Encode ``src`` and return a tensor with the same shape as ``src``."""
        bs, c, h, w = src.shape
        # flatten NxCxHxW to HWxNxC
        tokens = src.flatten(2).permute(2, 0, 1)
        pos_tokens = pos_embed.flatten(2).permute(2, 0, 1)
        padding_mask = None if mask is None else mask.flatten(1)

        memory = self.encoder(tokens, src_key_padding_mask=padding_mask, pos=pos_tokens)
        # Back to NxCxHxW.
        return memory.permute(1, 2, 0).view(bs, c, h, w)
# FPN-style pixel decoder whose lowest-resolution level is first passed through a Transformer encoder.
@SEM_SEG_HEADS_REGISTRY.register()
class TransformerEncoderPixelDecoder(BasePixelDecoder):
    """``BasePixelDecoder`` variant that runs a Transformer encoder on the top level."""

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        transformer_dropout: float,
        transformer_nheads: int,
        transformer_dim_feedforward: int,
        transformer_enc_layers: int,
        transformer_pre_norm: bool,
        conv_dim: int,
        mask_dim: int,
        norm: Optional[Union[str, Callable]] = None,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            transformer_dropout: dropout probability in transformer
            transformer_nheads: number of heads in transformer
            transformer_dim_feedforward: dimension of feedforward network
            transformer_enc_layers: number of transformer encoder layers
            transformer_pre_norm: whether to use pre-layernorm or not
            conv_dim: number of output channels for the intermediate conv layers.
            mask_dim: number of output channels for the final conv layer.
            norm (str or callable): normalization for all conv layers
        """
        super().__init__(input_shape, conv_dim=conv_dim, mask_dim=mask_dim, norm=norm)

        # Order the inputs by stride, smallest first (e.g. "res2" ... "res5").
        ordered = sorted(input_shape.items(), key=lambda item: item[1].stride)
        self.in_features = [name for name, _ in ordered]
        channels = [spec.channels for _, spec in ordered]

        # 1x1 projection of the largest-stride feature into the transformer width.
        self.input_proj = Conv2d(channels[-1], conv_dim, kernel_size=1)
        weight_init.c2_xavier_fill(self.input_proj)

        self.transformer = TransformerEncoderOnly(
            d_model=conv_dim,
            dropout=transformer_dropout,
            nhead=transformer_nheads,
            dim_feedforward=transformer_dim_feedforward,
            num_encoder_layers=transformer_enc_layers,
            normalize_before=transformer_pre_norm,
        )
        self.pe_layer = PositionEmbeddingSine(conv_dim // 2, normalize=True)

        # The top-level output conv now consumes conv_dim channels (the
        # transformer output), so the module registered by the parent under
        # "layer_<N>" is swapped for a freshly built one.
        refined_conv = Conv2d(
            conv_dim,
            conv_dim,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=(norm == ""),
            norm=get_norm(norm, conv_dim),
            activation=F.relu,
        )
        weight_init.c2_xavier_fill(refined_conv)

        top_name = "layer_{}".format(len(self.in_features))
        delattr(self, top_name)
        self.add_module(top_name, refined_conv)
        self.output_convs[0] = refined_conv

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Extend the parent's keyword dictionary with the transformer settings."""
        kwargs = super().from_config(cfg, input_shape)
        kwargs.update(
            {
                "transformer_dropout": cfg.MODEL.MASK_FORMER.DROPOUT,
                "transformer_nheads": cfg.MODEL.MASK_FORMER.NHEADS,
                "transformer_dim_feedforward": cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD,
                # the encoder depth is a separate config under SEM_SEG_HEAD
                "transformer_enc_layers": cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS,
                "transformer_pre_norm": cfg.MODEL.MASK_FORMER.PRE_NORM,
            }
        )
        return kwargs

    def forward_features(self, features):
        """Top-down pass with a Transformer encoder on the level that has no lateral conv.

        Returns:
            ``(mask_features, transformer_encoder_features, multi_scale_features)``:
            ``self.mask_features`` applied to the last level, the raw encoder
            output, and at most ``self.maskformer_num_feature_levels`` per-level
            outputs in the order they were produced.
        """
        collected = []
        # in_features is walked back-to-front, i.e. top-down
        # (from low to high resolution, per the original comment).
        for level, name in enumerate(reversed(self.in_features)):
            feat = features[name]
            lateral = self.lateral_convs[level]
            smooth = self.output_convs[level]
            if lateral is not None:
                skip = lateral(feat)
                # Following FPN, nearest upsampling to the lateral branch size.
                y = skip + F.interpolate(y, size=skip.shape[-2:], mode="nearest")
                y = smooth(y)
            else:
                encoded = self.input_proj(feat)
                pos = self.pe_layer(feat)
                encoded = self.transformer(encoded, None, pos)
                y = smooth(encoded)
                # keep the encoder output; it is returned for the Transformer decoder
                transformer_encoder_features = encoded
            if len(collected) < self.maskformer_num_feature_levels:
                collected.append(y)
        return self.mask_features(y), transformer_encoder_features, collected

    def forward(self, features, targets=None):
        """Log a warning, then delegate to ``forward_features``; ``targets`` is unused."""
        logging.getLogger(__name__).warning(
            "Calling forward() may cause unpredicted behavior of PixelDecoder module."
        )
        return self.forward_features(features)
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/msdeformattn.py
================================================
import logging
import numpy as np
from typing import Callable, Dict, List, Optional, Tuple, Union
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
from torch.cuda.amp import autocast
from detectron2.config import configurable
from detectron2.layers import Conv2d, ShapeSpec, get_norm
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
from ..transformer_decoder.position_encoding import PositionEmbeddingSine
from ..transformer_decoder.transformer import _get_clones, _get_activation_fn
from .ops.modules import MSDeformAttn
# Encoder-only Transformer built from MSDeformAttn layers (as in Deformable DETR)
class MSDeformAttnTransformerEncoderOnly(nn.Module):
    """Flattens several feature levels and encodes them with deformable attention."""

    def __init__(self, d_model=256, nhead=8,
                 num_encoder_layers=6, dim_feedforward=1024, dropout=0.1,
                 activation="relu",
                 num_feature_levels=4, enc_n_points=4,
                 ):
        """Create the encoder stack and one learned embedding row per feature level."""
        super().__init__()

        self.d_model = d_model
        self.nhead = nhead

        layer = MSDeformAttnTransformerEncoderLayer(
            d_model, dim_feedforward, dropout, activation,
            num_feature_levels, nhead, enc_n_points,
        )
        self.encoder = MSDeformAttnTransformerEncoder(layer, num_encoder_layers)

        # Allocated uninitialised; filled by _reset_parameters below.
        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-init multi-dim parameters, then re-run the MSDeformAttn and level-embed inits."""
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)
        # MSDeformAttn has its own scheme, which must win over the generic pass above.
        for module in self.modules():
            if isinstance(module, MSDeformAttn):
                module._reset_parameters()
        normal_(self.level_embed)

    def get_valid_ratio(self, mask):
        """Return per-sample ``(w_ratio, h_ratio)`` of non-masked extent.

        ``mask`` is unpacked as 3-D; ``True`` entries are treated as padding.
        Only the first column and the first row are inspected.
        """
        _, height, width = mask.shape
        rows_valid = (~mask[:, :, 0]).sum(1)
        cols_valid = (~mask[:, 0, :]).sum(1)
        h_ratio = rows_valid.float() / height
        w_ratio = cols_valid.float() / width
        return torch.stack([w_ratio, h_ratio], -1)

    def forward(self, srcs, pos_embeds):
        """Encode the feature levels in ``srcs``.

        Args:
            srcs: sequence of 4-D tensors, one per level.
            pos_embeds: positional encodings, paired with ``srcs`` by position.

        Returns:
            ``(memory, spatial_shapes, level_start_index)`` where ``memory`` is
            the encoder output over the concatenation of all levels.
        """
        # No padding information is available here, so every mask is all-False.
        masks = []
        for feat in srcs:
            mask_size = (feat.size(0), feat.size(2), feat.size(3))
            masks.append(torch.zeros(mask_size, device=feat.device, dtype=torch.bool))

        # prepare input for encoder
        flat_srcs, flat_masks, flat_pos, shapes = [], [], [], []
        for level, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
            _, _, height, width = src.shape
            shapes.append((height, width))
            flat_srcs.append(src.flatten(2).transpose(1, 2))
            flat_masks.append(mask.flatten(1))
            level_pos = pos_embed.flatten(2).transpose(1, 2)
            # add this level's learned embedding to every position
            flat_pos.append(level_pos + self.level_embed[level].view(1, 1, -1))

        src_flatten = torch.cat(flat_srcs, 1)
        mask_flatten = torch.cat(flat_masks, 1)
        lvl_pos_embed_flatten = torch.cat(flat_pos, 1)

        spatial_shapes = torch.as_tensor(shapes, dtype=torch.long, device=src_flatten.device)
        # offset of each level inside the flattened sequence: 0, H0*W0, H0*W0+H1*W1, ...
        level_sizes = spatial_shapes.prod(1)
        level_start_index = torch.cat(
            (spatial_shapes.new_zeros((1, )), level_sizes.cumsum(0)[:-1])
        )
        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)

        # encoder
        memory = self.encoder(
            src_flatten, spatial_shapes, level_start_index,
            valid_ratios, lvl_pos_embed_flatten, mask_flatten,
        )
        return memory, spatial_shapes, level_start_index
class MSDeformAttnTransformerEncoderLayer(nn.Module):
    """One encoder layer: deformable self-attention followed by a feed-forward block.

    Both sub-blocks are residual and are followed by a LayerNorm.
    """

    def __init__(self,
                 d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4):
        super().__init__()

        # self attention
        self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # ffn
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is ``None``."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, src):
        """Apply linear -> activation -> dropout -> linear, add residual, normalise."""
        hidden = self.activation(self.linear1(src))
        update = self.linear2(self.dropout2(hidden))
        return self.norm2(src + self.dropout3(update))

    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None):
        """Run attention and the feed-forward block on ``src``.

        The attention query is ``src`` with ``pos`` added; the attended
        values come from ``src`` without the positional term.
        """
        # self attention
        query = self.with_pos_embed(src, pos)
        attended = self.self_attn(
            query, reference_points, src, spatial_shapes, level_start_index, padding_mask
        )
        src = self.norm1(src + self.dropout1(attended))

        # ffn
        return self.forward_ffn(src)
class MSDeformAttnTransformerEncoder(nn.Module):
    """Stack of ``num_layers`` clones of one deformable-attention encoder layer."""

    def __init__(self, encoder_layer, num_layers):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        """Build normalised reference points for every position of every level.

        Args:
            spatial_shapes: iterable of ``(H, W)`` pairs, one per level.
            valid_ratios: tensor indexed as ``[batch, level, (w, h)]``.
            device: device on which the grids are created.

        Returns:
            Tensor with a trailing ``(x, y)`` axis; the per-level points are
            concatenated along dim 1 and then broadcast against every level's
            valid ratio via an inserted dim 2.
        """
        per_level = []
        for level, (height, width) in enumerate(spatial_shapes):
            # pixel centres: 0.5, 1.5, ..., size - 0.5
            ys = torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device)
            xs = torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device)
            grid_y, grid_x = torch.meshgrid(ys, xs)
            norm_y = grid_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
            norm_x = grid_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
            per_level.append(torch.stack((norm_x, norm_y), -1))
        points = torch.cat(per_level, 1)
        return points[:, :, None] * valid_ratios[:, None]

    def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):
        """Feed ``src`` through every layer in order and return the final output."""
        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
        hidden = src
        for layer in self.layers:
            hidden = layer(hidden, pos, reference_points, spatial_shapes, level_start_index, padding_mask)
        return hidden
@SEM_SEG_HEADS_REGISTRY.register()
class MSDeformAttnPixelDecoder(nn.Module):
    """Pixel decoder: deformable-attention encoder plus extra FPN levels on top."""

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        transformer_dropout: float,
        transformer_nheads: int,
        transformer_dim_feedforward: int,
        transformer_enc_layers: int,
        conv_dim: int,
        mask_dim: int,
        norm: Optional[Union[str, Callable]] = None,
        # deformable transformer encoder args
        transformer_in_features: List[str],
        common_stride: int,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            transformer_dropout: dropout probability in transformer
            transformer_nheads: number of heads in transformer
            transformer_dim_feedforward: dimension of feedforward network
            transformer_enc_layers: number of transformer encoder layers
            conv_dim: number of output channels for the intermediate conv layers.
            mask_dim: number of output channels for the final conv layer.
            norm (str or callable): normalization for all conv layers
            transformer_in_features: names of the features fed to the
                deformable transformer encoder.
            common_stride: stride used to decide how many extra FPN levels
                are built (see ``num_fpn_levels`` below).
        """
        super().__init__()

        # Subset of inputs consumed by the encoder; taken before
        # ``input_shape`` is re-bound to a sorted list below.
        encoder_shape = {
            name: spec for name, spec in input_shape.items() if name in transformer_in_features
        }

        # Pixel-decoder inputs, ordered by stride (e.g. "res2" ... "res5").
        pixel_levels = sorted(input_shape.items(), key=lambda item: item[1].stride)
        self.in_features = [name for name, _ in pixel_levels]
        self.feature_strides = [spec.stride for _, spec in pixel_levels]
        self.feature_channels = [spec.channels for _, spec in pixel_levels]

        # Encoder inputs (possibly fewer than the pixel decoder's), same ordering.
        encoder_levels = sorted(encoder_shape.items(), key=lambda item: item[1].stride)
        self.transformer_in_features = [name for name, _ in encoder_levels]
        encoder_channels = [spec.channels for _, spec in encoder_levels]
        # kept to decide the number of extra FPN layers
        self.transformer_feature_strides = [spec.stride for _, spec in encoder_levels]

        self.transformer_num_feature_levels = len(self.transformer_in_features)
        if self.transformer_num_feature_levels > 1:
            # one projection per level, largest stride first
            proj_channels = encoder_channels[::-1]
        else:
            proj_channels = [encoder_channels[-1]]
        self.input_proj = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(channels, conv_dim, kernel_size=1),
                nn.GroupNorm(32, conv_dim),
            )
            for channels in proj_channels
        ])
        for proj in self.input_proj:
            nn.init.xavier_uniform_(proj[0].weight, gain=1)
            nn.init.constant_(proj[0].bias, 0)

        self.transformer = MSDeformAttnTransformerEncoderOnly(
            d_model=conv_dim,
            dropout=transformer_dropout,
            nhead=transformer_nheads,
            dim_feedforward=transformer_dim_feedforward,
            num_encoder_layers=transformer_enc_layers,
            num_feature_levels=self.transformer_num_feature_levels,
        )
        self.pe_layer = PositionEmbeddingSine(conv_dim // 2, normalize=True)

        self.mask_dim = mask_dim
        # the mask projection is a 1x1 conv
        self.mask_features = Conv2d(
            conv_dim,
            mask_dim,
            kernel_size=1,
            stride=1,
            padding=0,
        )
        weight_init.c2_xavier_fill(self.mask_features)

        self.maskformer_num_feature_levels = 3  # always use 3 scales
        self.common_stride = common_stride

        # Extra FPN levels: one per factor of two between the finest encoder
        # stride and ``common_stride``.
        finest_stride = min(self.transformer_feature_strides)
        self.num_fpn_levels = int(np.log2(finest_stride) - np.log2(self.common_stride))

        use_bias = norm == ""
        laterals, outputs = [], []
        fpn_channels = self.feature_channels[:self.num_fpn_levels]
        for level, in_channels in enumerate(fpn_channels, start=1):
            lateral_norm = get_norm(norm, conv_dim)
            output_norm = get_norm(norm, conv_dim)

            lateral = Conv2d(
                in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm
            )
            output = Conv2d(
                conv_dim,
                conv_dim,
                kernel_size=3,
                stride=1,
                padding=1,
                bias=use_bias,
                norm=output_norm,
                activation=F.relu,
            )
            weight_init.c2_xavier_fill(lateral)
            weight_init.c2_xavier_fill(output)
            self.add_module("adapter_{}".format(level), lateral)
            self.add_module("layer_{}".format(level), output)

            laterals.append(lateral)
            outputs.append(output)
        # Stored in top-down order (from low to high resolution) so that
        # forward_features can index them while walking the features in reverse.
        self.lateral_convs = list(reversed(laterals))
        self.output_convs = list(reversed(outputs))

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Build the constructor keyword dictionary from a config node."""
        selected = {
            name: spec
            for name, spec in input_shape.items()
            if name in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
        }
        return {
            "input_shape": selected,
            "conv_dim": cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM,
            "mask_dim": cfg.MODEL.SEM_SEG_HEAD.MASK_DIM,
            "norm": cfg.MODEL.SEM_SEG_HEAD.NORM,
            "transformer_dropout": cfg.MODEL.MASK_FORMER.DROPOUT,
            "transformer_nheads": cfg.MODEL.MASK_FORMER.NHEADS,
            # fixed at 1024 for the deformable transformer encoder rather
            # than read from cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
            "transformer_dim_feedforward": 1024,
            # the encoder depth is a separate config under SEM_SEG_HEAD
            "transformer_enc_layers": cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS,
            "transformer_in_features": cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES,
            "common_stride": cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE,
        }

    @autocast(enabled=False)
    def forward_features(self, features):
        """Encode the selected levels, then extend them with the extra FPN levels.

        Runs with autocast disabled and casts inputs to float32.

        Returns:
            ``(mask_features, out[0], multi_scale_features)`` where ``out`` is
            the list of encoder levels followed by the FPN levels,
            ``mask_features`` is ``self.mask_features`` applied to its last
            entry, and ``multi_scale_features`` holds its first
            ``self.maskformer_num_feature_levels`` entries.
        """
        srcs = []
        pos = []
        # Encoder inputs are visited top-down (from low to high resolution).
        for idx, name in enumerate(reversed(self.transformer_in_features)):
            feat = features[name].float()  # deformable detr does not support half precision
            srcs.append(self.input_proj[idx](feat))
            pos.append(self.pe_layer(feat))

        memory, spatial_shapes, level_start_index = self.transformer(srcs, pos)
        batch = memory.shape[0]

        # Length of each level inside the flattened encoder output; the last
        # level runs to the end of the sequence.
        n_levels = self.transformer_num_feature_levels
        level_lengths = [
            (level_start_index[i + 1] if i < n_levels - 1 else memory.shape[1]) - level_start_index[i]
            for i in range(n_levels)
        ]
        chunks = torch.split(memory, level_lengths, dim=1)

        # Restore each level to a 4-D map using its recorded spatial shape.
        out = [
            chunk.transpose(1, 2).view(batch, -1, spatial_shapes[i][0], spatial_shapes[i][1])
            for i, chunk in enumerate(chunks)
        ]

        # append `out` with extra FPN levels, again walking top-down
        for idx, name in enumerate(reversed(self.in_features[:self.num_fpn_levels])):
            feat = features[name].float()
            skip = self.lateral_convs[idx](feat)
            # The previous level is upsampled bilinearly to the lateral branch size.
            merged = skip + F.interpolate(
                out[-1], size=skip.shape[-2:], mode="bilinear", align_corners=False
            )
            out.append(self.output_convs[idx](merged))

        multi_scale_features = [
            level for i, level in enumerate(out) if i < self.maskformer_num_feature_levels
        ]
        return self.mask_features(out[-1]), out[0], multi_scale_features
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/functions/__init__.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from .ms_deform_attn_func import MSDeformAttnFunction
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
try:
import MultiScaleDeformableAttention as MSDA
except ModuleNotFoundError as e:
info_string = (
"\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
"\t`cd mask2former/modeling/pixel_decoder/ops`\n"
"\t`sh make.sh`\n"
)
raise ModuleNotFoundError(info_string)
class MSDeformAttnFunction(Function):
    """Autograd ``Function`` that dispatches to the compiled ``MSDA`` extension."""

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        """Call ``MSDA.ms_deform_attn_forward`` and stash the inputs for backward."""
        ctx.im2col_step = im2col_step
        tensors = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        result = MSDA.ms_deform_attn_forward(*tensors, ctx.im2col_step)
        ctx.save_for_backward(*tensors)
        return result

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Call ``MSDA.ms_deform_attn_backward`` on the saved inputs.

        Gradients are returned for ``value``, ``sampling_locations`` and
        ``attention_weights``; the other three forward arguments get ``None``.
        """
        saved = ctx.saved_tensors
        grads = MSDA.ms_deform_attn_backward(*saved, grad_output, ctx.im2col_step)
        grad_value, grad_sampling_loc, grad_attn_weight = grads
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    """Pure-PyTorch multi-scale deformable attention (for debug and test only).

    The CUDA implementation should be used instead for real workloads.

    Args:
        value: tensor unpacked as (N, S, M, D); S is split per level using
            ``H * W`` of each entry in ``value_spatial_shapes``.
        value_spatial_shapes: iterable of ``(H, W)`` pairs, one per level.
        sampling_locations: tensor unpacked as (N, Lq, M, L, P, 2); values are
            mapped to ``grid_sample`` coordinates with ``2 * x - 1``.
        attention_weights: tensor reshaped from (N, Lq, M, L, P).

    Returns:
        Contiguous tensor of shape (N, Lq, M * D).
    """
    batch, _, heads, head_dim = value.shape
    _, num_queries, heads, num_levels, num_points, _ = sampling_locations.shape

    level_values = value.split([h * w for h, w in value_spatial_shapes], dim=1)
    # [0, 1] locations -> [-1, 1] grid_sample coordinates
    grids = 2 * sampling_locations - 1

    sampled = []
    for level, (height, width) in enumerate(value_spatial_shapes):
        # (N, H*W, M, D) -> (N, H*W, M*D) -> (N, M*D, H*W) -> (N*M, D, H, W)
        feat = level_values[level].flatten(2).transpose(1, 2)
        feat = feat.reshape(batch * heads, head_dim, height, width)
        # (N, Lq, M, P, 2) -> (N, M, Lq, P, 2) -> (N*M, Lq, P, 2)
        grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # result: (N*M, D, Lq, P)
        sampled.append(
            F.grid_sample(feat, grid, mode='bilinear', padding_mode='zeros', align_corners=False)
        )

    # (N, Lq, M, L, P) -> (N, M, Lq, L, P) -> (N*M, 1, Lq, L*P)
    weights = attention_weights.transpose(1, 2).reshape(
        batch * heads, 1, num_queries, num_levels * num_points
    )
    # stack levels next to the points axis, then merge them: (N*M, D, Lq, L*P)
    stacked = torch.stack(sampled, dim=-2).flatten(-2)
    out = (stacked * weights).sum(-1).view(batch, heads * head_dim, num_queries)
    return out.transpose(1, 2).contiguous()
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/make.sh
================================================
#!/usr/bin/env bash
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
# Build the extension described by setup.py in this directory and install it
# using whichever `python` is first on PATH.
python setup.py build install
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/modules/__init__.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from .ms_deform_attn import MSDeformAttn
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import warnings
import math
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_
from ..functions import MSDeformAttnFunction
from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
return (n & (n-1) == 0) and n != 0
class MSDeformAttn(nn.Module):
    """Multi-scale deformable attention layer.

    For every query the layer predicts ``n_heads * n_levels * n_points``
    sampling offsets and softmax-normalised weights, samples the projected
    values at those locations and projects the weighted sum back to
    ``d_model``.
    """

    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
        """
        Multi-Scale Deformable Attention Module
        :param d_model      hidden dimension
        :param n_levels     number of feature levels
        :param n_heads      number of attention heads
        :param n_points     number of sampling points per attention head per feature level
        :raises ValueError  if d_model is not divisible by n_heads
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
        _d_per_head = d_model // n_heads
        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_head):
            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
                          "which is more efficient in our CUDA implementation.")

        # forwarded unchanged to the compiled op on every call
        self.im2col_step = 128

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the projections.

        The offset predictor starts with zero weights and a bias that points
        each head in its own direction (angle ``2*pi*h/n_heads``, normalised
        by the larger absolute component) and scales the i-th point by
        ``i + 1``. The attention-weight predictor starts at zero; the value
        and output projections use Xavier-uniform weights and zero bias.
        """
        constant_(self.sampling_offsets.weight.data, 0.)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.)
        constant_(self.attention_weights.bias.data, 0.)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)

    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
        # Raw docstring: the LaTeX-style backslashes below (\sum, \cdot) are
        # not valid escape sequences in a normal string literal.
        r"""
        :param query                       (N, Length_{query}, C)
        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements

        :return output                     (N, Length_{query}, C)
        :raises ValueError                 if the last dim of reference_points is neither 2 nor 4
        """
        N, Len_q, _ = query.shape
        N, Len_in, _ = input_flatten.shape
        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in

        value = self.value_proj(input_flatten)
        if input_padding_mask is not None:
            # padded positions must not contribute to any sample
            value = value.masked_fill(input_padding_mask[..., None], float(0))
        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
        # softmax jointly over all levels and points of a head
        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
        # N, Len_q, n_heads, n_levels, n_points, 2
        if reference_points.shape[-1] == 2:
            # offsets are divided by (W, H) of each level
            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            # offsets are scaled by half the reference box size
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                                 + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
        else:
            raise ValueError(
                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
        try:
            output = MSDeformAttnFunction.apply(
                value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
        except Exception:
            # Best-effort fallback (e.g. on CPU) to the pure-PyTorch kernel.
            # Only ``Exception`` is caught so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
        # # For FLOPs calculation only
        # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
        output = self.output_proj(output)
        return output
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/setup.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
import os
import glob
import torch
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
# NOTE(review): not referenced anywhere else in this setup script (it is not
# passed to setup() below) -- confirm whether it should be wired in or removed.
requirements = ["torch", "torchvision"]
def get_extensions():
    """Describe the ``MultiScaleDeformableAttention`` CUDA extension.

    Sources are collected from ``src/*.cpp``, ``src/cpu/*.cpp`` and
    ``src/cuda/*.cu`` next to this file.

    Returns:
        A one-element list holding the ``CUDAExtension`` to build.

    Raises:
        NotImplementedError: if ``CUDA_HOME`` is ``None``, or if neither the
            ``FORCE_CUDA`` environment variable is set nor
            ``torch.cuda.is_available()`` is true. A CPU-only build is never
            produced.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    src_root = os.path.join(here, "src")

    sources = glob.glob(os.path.join(src_root, "*.cpp"))
    sources = sources + glob.glob(os.path.join(src_root, "cpu", "*.cpp"))
    cuda_sources = glob.glob(os.path.join(src_root, "cuda", "*.cu"))

    # Force cuda since torch ask for a device, not if cuda is in fact available.
    cuda_requested = os.environ.get('FORCE_CUDA') or torch.cuda.is_available()
    if CUDA_HOME is None:
        raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
    if not cuda_requested:
        raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')

    sources += cuda_sources
    compile_args = {
        "cxx": [],
        "nvcc": [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ],
    }

    return [
        CUDAExtension(
            "MultiScaleDeformableAttention",
            [os.path.join(src_root, path) for path in sources],
            include_dirs=[src_root],
            define_macros=[("WITH_CUDA", None)],
            extra_compile_args=compile_args,
        )
    ]
# Package entry point: executed whenever this script is run. The extension
# list comes from get_extensions(), which raises NotImplementedError when
# the CUDA prerequisites it checks are not met.
setup(
name="MultiScaleDeformableAttention",
version="1.0",
author="Weijie Su",
url="https://github.com/fundamentalvision/Deformable-DETR",
description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
packages=find_packages(exclude=("configs", "tests",)),
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include
#include
#include
// CPU entry point for the multi-scale deformable attention forward pass.
// No CPU implementation exists: the arguments are accepted but never read,
// and every call raises through AT_ERROR.
at::Tensor ms_deform_attn_cpu_forward(const at::Tensor &value,
                                      const at::Tensor &spatial_shapes,
                                      const at::Tensor &level_start_index,
                                      const at::Tensor &sampling_loc,
                                      const at::Tensor &attn_weight,
                                      const int im2col_step)
{
    AT_ERROR("Not implement on cpu");
}
std::vector
ms_deform_attn_cpu_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
AT_ERROR("Not implement on cpu");
}
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include
// Forward pass entry point for CPU tensors. The definition in
// cpu/ms_deform_attn_cpu.cpp raises unconditionally (no CPU kernel).
at::Tensor ms_deform_attn_cpu_forward(const at::Tensor &value,
                                      const at::Tensor &spatial_shapes,
                                      const at::Tensor &level_start_index,
                                      const at::Tensor &sampling_loc,
                                      const at::Tensor &attn_weight,
                                      const int im2col_step);
std::vector
ms_deform_attn_cpu_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step);
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include
#include "cuda/ms_deform_im2col_cuda.cuh"
#include
#include
#include
#include
/*!
 * Forward pass of multi-scale deformable attention on CUDA.
 *
 * Dimensions are read from the inputs as follows:
 *   value          : (batch, spatial_size, num_heads, channels)
 *   spatial_shapes : num_levels rows (read as int64 by the kernel)
 *   sampling_loc   : num_query at dim 1, num_point at dim 4
 *
 * The batch is processed in chunks of min(batch, im2col_step) samples, with
 * one ms_deformable_im2col_cuda launch per chunk on the current CUDA stream.
 *
 * Returns a tensor of shape (batch, num_query, num_heads * channels).
 *
 * Raises c10::Error when an input is not contiguous or not a CUDA tensor,
 * when im2col_step is not positive, or when the chunk size does not evenly
 * divide the batch.
 */
at::Tensor ms_deform_attn_cuda_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");

    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");

    const int batch = value.size(0);
    const int spatial_size = value.size(1);
    const int num_heads = value.size(2);
    const int channels = value.size(3);

    const int num_levels = spatial_shapes.size(0);

    const int num_query = sampling_loc.size(1);
    const int num_point = sampling_loc.size(4);

    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());

    // An empty batch has nothing to sample. Returning here also avoids the
    // modulo/division by zero that min(batch, im2col_step) == 0 would cause.
    if (batch == 0)
    {
        return output.view({batch, num_query, num_heads * channels});
    }

    TORCH_CHECK(im2col_step > 0, "im2col_step(", im2col_step, ") must be positive");
    const int im2col_step_ = std::min(batch, im2col_step);

    // TORCH_CHECK concatenates its message arguments; it does not interpret
    // printf-style placeholders.
    TORCH_CHECK(batch % im2col_step_ == 0,
                "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");

    const int batch_n = im2col_step_;
    auto output_n = output.view({batch / im2col_step_, batch_n, num_query, num_heads, channels});

    // Element counts of one sample in each flattened input.
    auto per_value_size = spatial_size * num_heads * channels;
    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;

    for (int n = 0; n < batch / im2col_step_; ++n)
    {
        // View onto the output rows of chunk n; the kernel writes into it in place.
        auto columns = output_n.select(0, n);
        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
                value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
                spatial_shapes.data_ptr<int64_t>(),
                level_start_index.data_ptr<int64_t>(),
                sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
                attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
                columns.data_ptr<scalar_t>());
        }));
    }

    output = output.view({batch, num_query, num_heads * channels});

    return output;
}
/*!
 * Backward pass of multi-scale deformable attention on CUDA.
 *
 * Dimensions are read from the inputs as follows:
 *   value          : (batch, spatial_size, num_heads, channels)
 *   spatial_shapes : num_levels rows (read as int64 by the kernel)
 *   sampling_loc   : num_query at dim 1, num_point at dim 4
 *   grad_output    : viewed as (batch / step, step, num_query, num_heads, channels)
 *
 * The batch is processed in chunks of min(batch, im2col_step) samples, with
 * one ms_deformable_col2im_cuda launch per chunk on the current CUDA stream.
 *
 * Returns {grad_value, grad_sampling_loc, grad_attn_weight}, each created
 * zero-filled with the shape of the corresponding input.
 *
 * Raises c10::Error when an input is not contiguous or not a CUDA tensor,
 * when im2col_step is not positive, or when the chunk size does not evenly
 * divide the batch.
 */
std::vector<at::Tensor> ms_deform_attn_cuda_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{
    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
    TORCH_CHECK(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");

    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
    TORCH_CHECK(grad_output.is_cuda(), "grad_output must be a CUDA tensor");

    const int batch = value.size(0);
    const int spatial_size = value.size(1);
    const int num_heads = value.size(2);
    const int channels = value.size(3);

    const int num_levels = spatial_shapes.size(0);

    const int num_query = sampling_loc.size(1);
    const int num_point = sampling_loc.size(4);

    auto grad_value = at::zeros_like(value);
    auto grad_sampling_loc = at::zeros_like(sampling_loc);
    auto grad_attn_weight = at::zeros_like(attn_weight);

    // An empty batch has no gradients to compute. Returning here also avoids
    // the modulo/division by zero that min(batch, im2col_step) == 0 would cause.
    if (batch == 0)
    {
        return {grad_value, grad_sampling_loc, grad_attn_weight};
    }

    TORCH_CHECK(im2col_step > 0, "im2col_step(", im2col_step, ") must be positive");
    const int im2col_step_ = std::min(batch, im2col_step);

    // TORCH_CHECK concatenates its message arguments; it does not interpret
    // printf-style placeholders.
    TORCH_CHECK(batch % im2col_step_ == 0,
                "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");

    const int batch_n = im2col_step_;

    // Element counts of one sample in each flattened input.
    auto per_value_size = spatial_size * num_heads * channels;
    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
    auto grad_output_n = grad_output.view({batch / im2col_step_, batch_n, num_query, num_heads, channels});

    for (int n = 0; n < batch / im2col_step_; ++n)
    {
        // Upstream gradient rows belonging to chunk n.
        auto grad_output_g = grad_output_n.select(0, n);
        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
                grad_output_g.data_ptr<scalar_t>(),
                value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
                spatial_shapes.data_ptr<int64_t>(),
                level_start_index.data_ptr<int64_t>(),
                sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
                attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
                grad_value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
                grad_sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
                grad_attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
        }));
    }

    return {
        grad_value, grad_sampling_loc, grad_attn_weight
    };
}
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include
// CUDA forward pass of multi-scale deformable attention
// (defined in cuda/ms_deform_attn_cuda.cu).
at::Tensor
ms_deform_attn_cuda_forward(const at::Tensor &value,
                            const at::Tensor &spatial_shapes,
                            const at::Tensor &level_start_index,
                            const at::Tensor &sampling_loc,
                            const at::Tensor &attn_weight,
                            const int im2col_step);
std::vector ms_deform_attn_cuda_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step);
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh
================================================
/*!
**************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************
* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
* Copyright (c) 2018 Microsoft
**************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include
#include
#include
#include
#include
#include
// Loop header used by the kernels in this file: each thread starts at its
// global index (blockIdx.x * blockDim.x + threadIdx.x) and advances by the
// total number of launched threads (blockDim.x * gridDim.x) while i < n.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// Thread-count constant; its users are outside this part of the file.
const int CUDA_NUM_THREADS = 1024;
// Number of blocks of `num_threads` threads needed to cover N items,
// i.e. N / num_threads rounded up.
inline int GET_BLOCKS(const int N, const int num_threads)
{
    const int padded_count = N + num_threads - 1;
    return padded_count / num_threads;
}
// Bilinearly interpolates one value (head m, channel c) at the fractional
// location (h, w) of a feature map.
//
// bottom_data is indexed as row * (width * nheads * channels)
//                         + col * (nheads * channels) + m * channels + c.
// Each of the four neighbouring corners is read only if it passes the bound
// test below; a skipped corner contributes 0. Only the lower bound of the
// low corner and the upper bound of the high corner are tested here, so the
// caller is expected to have range-checked (h, w) already.
//
// Returns the interpolated value.
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
                                                   const int &height, const int &width, const int &nheads, const int &channels,
                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
{
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    // Fractional offsets from the low corner and their complements.
    const scalar_t lh = h - h_low;
    const scalar_t lw = w - w_low;
    const scalar_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = nheads * channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = m * channels + c;

    scalar_t v1 = 0;
    if (h_low >= 0 && w_low >= 0)
    {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
    }
    scalar_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1)
    {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
    }
    scalar_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0)
    {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
    }
    scalar_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1)
    {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
    }

    // Bilinear weights of the four corners.
    const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

    const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    return val;
}
// Backward of the bilinear sample at (h, w) for head m, channel c.
//
// For every corner that passes its bound test:
//   - atomically adds (corner weight * top_grad * attn_weight) to grad_value
//     at that corner's offset;
//   - accumulates the corner's contribution to the derivative of the
//     interpolated value with respect to h and w.
// Afterwards it OVERWRITES (plain stores, no accumulation):
//   *grad_attn_weight       = top_grad * interpolated value
//   grad_sampling_loc[0]    = width  * d(val)/dw * top_grad * attn_weight
//   grad_sampling_loc[1]    = height * d(val)/dh * top_grad * attn_weight
// bottom_data / grad_value use the same indexing:
//   row * (width * nheads * channels) + col * (nheads * channels) + m * channels + c.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
                                               const int &height, const int &width, const int &nheads, const int &channels,
                                               const scalar_t &h, const scalar_t &w, const int &m, const int &c,
                                               const scalar_t &top_grad,
                                               const scalar_t &attn_weight,
                                               scalar_t* &grad_value,
                                               scalar_t* grad_sampling_loc,
                                               scalar_t* grad_attn_weight)
{
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    // Fractional offsets from the low corner and their complements.
    const scalar_t lh = h - h_low;
    const scalar_t lw = w - w_low;
    const scalar_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = nheads * channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = m * channels + c;

    const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    const scalar_t top_grad_value = top_grad * attn_weight;
    scalar_t grad_h_weight = 0, grad_w_weight = 0;

    scalar_t v1 = 0;
    if (h_low >= 0 && w_low >= 0)
    {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
        grad_h_weight -= hw * v1;
        grad_w_weight -= hh * v1;
        atomicAdd(grad_value+ptr1, w1*top_grad_value);
    }
    scalar_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1)
    {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
        grad_h_weight -= lw * v2;
        grad_w_weight += hh * v2;
        atomicAdd(grad_value+ptr2, w2*top_grad_value);
    }
    scalar_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0)
    {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
        grad_h_weight += hw * v3;
        grad_w_weight -= lh * v3;
        atomicAdd(grad_value+ptr3, w3*top_grad_value);
    }
    scalar_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1)
    {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
        grad_h_weight += lw * v4;
        grad_w_weight += lh * v4;
        atomicAdd(grad_value+ptr4, w4*top_grad_value);
    }

    const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    // Plain stores: the destination is expected to be private to this thread.
    *grad_attn_weight = top_grad * val;
    *grad_sampling_loc = width * grad_w_weight * top_grad_value;
    *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
}
// Backward of the bilinear sample at (h, w) for head m, channel c, with all
// results accumulated atomically.
//
// For every corner that passes its bound test:
//   - atomically adds (corner weight * top_grad * attn_weight) to grad_value
//     at that corner's offset;
//   - accumulates the corner's contribution to the derivative of the
//     interpolated value with respect to h and w.
// Afterwards it atomicAdd's into the destinations (so several threads may
// target the same addresses):
//   *grad_attn_weight       += top_grad * interpolated value
//   grad_sampling_loc[0]    += width  * d(val)/dw * top_grad * attn_weight
//   grad_sampling_loc[1]    += height * d(val)/dh * top_grad * attn_weight
// bottom_data / grad_value use the same indexing:
//   row * (width * nheads * channels) + col * (nheads * channels) + m * channels + c.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
                                                  const int &height, const int &width, const int &nheads, const int &channels,
                                                  const scalar_t &h, const scalar_t &w, const int &m, const int &c,
                                                  const scalar_t &top_grad,
                                                  const scalar_t &attn_weight,
                                                  scalar_t* &grad_value,
                                                  scalar_t* grad_sampling_loc,
                                                  scalar_t* grad_attn_weight)
{
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    // Fractional offsets from the low corner and their complements.
    const scalar_t lh = h - h_low;
    const scalar_t lw = w - w_low;
    const scalar_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = nheads * channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = m * channels + c;

    const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    const scalar_t top_grad_value = top_grad * attn_weight;
    scalar_t grad_h_weight = 0, grad_w_weight = 0;

    scalar_t v1 = 0;
    if (h_low >= 0 && w_low >= 0)
    {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
        grad_h_weight -= hw * v1;
        grad_w_weight -= hh * v1;
        atomicAdd(grad_value+ptr1, w1*top_grad_value);
    }
    scalar_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1)
    {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
        grad_h_weight -= lw * v2;
        grad_w_weight += hh * v2;
        atomicAdd(grad_value+ptr2, w2*top_grad_value);
    }
    scalar_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0)
    {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
        grad_h_weight += hw * v3;
        grad_w_weight -= lh * v3;
        atomicAdd(grad_value+ptr3, w3*top_grad_value);
    }
    scalar_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1)
    {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
        grad_h_weight += lw * v4;
        grad_w_weight += lh * v4;
        atomicAdd(grad_value+ptr4, w4*top_grad_value);
    }

    const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    atomicAdd(grad_attn_weight, top_grad * val);
    atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
    atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
}
// Forward kernel: one loop iteration produces one output element.
//
// The flat index is decomposed as
//   ((b * num_query + q) * num_heads + m) * channels + c,
// and the output element is the attention-weighted sum, over all levels and
// sampling points, of the bilinearly interpolated value at each sampling
// location. Sampling locations are scaled by the level's (height, width) and
// shifted by -0.5 before interpolation; locations outside
// (-1, spatial_h) x (-1, spatial_w) are skipped.
//
// data_spatial_shapes holds (height, width) pairs per level and
// data_level_start_index the per-level offset into the flattened spatial
// dimension of data_value. batch_size is not read by this kernel.
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(const int n,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *data_col)
{
    CUDA_KERNEL_LOOP(index, n)
    {
        int _temp = index;
        const int c_col = _temp % channels;
        _temp /= channels;
        // index / channels identifies one (batch, query, head) triple.
        const int sampling_index = _temp;
        const int m_col = _temp % num_heads;
        _temp /= num_heads;
        // The query index itself is not needed, only the batch index.
        const int b_col = _temp / num_query;

        scalar_t *data_col_ptr = data_col + index;
        int data_weight_ptr = sampling_index * num_levels * num_point;
        // Two coordinates (w, h) per attention weight.
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int qid_stride = num_heads * channels;
        const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
        scalar_t col = 0;

        for (int l_col = 0; l_col < num_levels; ++l_col)
        {
            const int level_start_id = data_level_start_index[l_col];
            const int spatial_h_ptr = l_col << 1;
            const int spatial_h = data_spatial_shapes[spatial_h_ptr];
            const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
            const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
            for (int p_col = 0; p_col < num_point; ++p_col)
            {
                const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
                const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
                const scalar_t weight = data_attn_weight[data_weight_ptr];

                const scalar_t h_im = loc_h * spatial_h - 0.5;
                const scalar_t w_im = loc_w * spatial_w - 0.5;

                if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
                {
                    col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
                }

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
            }
        }
        *data_col_ptr = col;
    }
}
// Backward kernel using statically sized shared memory and a serial
// reduction performed by thread 0.
//
// Per (level, point), every thread writes its own partial gradients for the
// sampling location and attention weight into shared memory; after a block
// barrier, thread 0 sums all blockSize partials and stores the totals to
// grad_sampling_loc / grad_attn_weight. grad_value is updated with atomicAdd
// inside ms_deform_attn_col2im_bilinear.
//
// The shared arrays hold blockSize entries and are indexed by threadIdx.x,
// and the reduction sums every thread of the block into one output slot.
// NOTE(review): this presumes blockDim.x == blockSize and that all threads of
// a block share the same (batch, query, head), i.e. differ only in channel -
// confirm against the launch site. batch_size is not read by this kernel.
//
// The output pointers are derived per loop iteration into local variables;
// the kernel arguments themselves are never modified, so every pass of the
// outer loop starts from the true base addresses.
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
    CUDA_KERNEL_LOOP(index, n)
    {
        __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
        __shared__ scalar_t cache_grad_attn_weight[blockSize];
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % channels;
        _temp /= channels;
        // index / channels identifies one (batch, query, head) triple.
        const int sampling_index = _temp;
        const int m_col = _temp % num_heads;
        _temp /= num_heads;
        // The query index itself is not needed, only the batch index.
        const int b_col = _temp / num_query;

        const scalar_t top_grad = grad_col[index];

        int data_weight_ptr = sampling_index * num_levels * num_point;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        // Local output cursors for this iteration (two coordinates per weight).
        scalar_t *grad_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1);
        scalar_t *grad_weight_out = grad_attn_weight + grad_sampling_ptr;
        const int grad_weight_stride = 1;
        const int grad_loc_stride = 2;
        const int qid_stride = num_heads * channels;
        const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

        for (int l_col = 0; l_col < num_levels; ++l_col)
        {
            const int level_start_id = data_level_start_index[l_col];
            const int spatial_h_ptr = l_col << 1;
            const int spatial_h = data_spatial_shapes[spatial_h_ptr];
            const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
            const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
            const scalar_t *data_value_ptr = data_value + value_ptr_offset;
            scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

            for (int p_col = 0; p_col < num_point; ++p_col)
            {
                const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
                const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
                const scalar_t weight = data_attn_weight[data_weight_ptr];

                const scalar_t h_im = loc_h * spatial_h - 0.5;
                const scalar_t w_im = loc_w * spatial_w - 0.5;
                // Clear this thread's slots so skipped samples contribute 0.
                *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
                *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_attn_weight+threadIdx.x)=0;
                if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
                {
                    ms_deform_attn_col2im_bilinear(
                        data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
                        top_grad, weight, grad_value_ptr,
                        cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
                }

                // All partials must be visible before thread 0 reads them.
                __syncthreads();
                if (tid == 0)
                {
                    scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
                    int sid=2;
                    for (unsigned int t = 1; t < blockSize; ++t)
                    {
                        _grad_w += cache_grad_sampling_loc[sid];
                        _grad_h += cache_grad_sampling_loc[sid + 1];
                        _grad_a += cache_grad_attn_weight[t];
                        sid += 2;
                    }

                    *grad_loc_out = _grad_w;
                    *(grad_loc_out + 1) = _grad_h;
                    *grad_weight_out = _grad_a;
                }
                // Keep the cache intact until thread 0 has finished reading.
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_weight_out += grad_weight_stride;
                grad_loc_out += grad_loc_stride;
            }
        }
    }
}
// Backward kernel using statically sized shared memory and a pairwise
// halving reduction across the block.
//
// Per (level, point), every thread writes its own partial gradients for the
// sampling location and attention weight into shared memory; the partials
// are then summed by repeatedly adding the upper half of the active range
// onto the lower half (stride blockSize/2, blockSize/4, ... 1), and thread 0
// stores the totals to grad_sampling_loc / grad_attn_weight. grad_value is
// updated with atomicAdd inside ms_deform_attn_col2im_bilinear.
//
// The halving loop has no handling for odd remainders.
// NOTE(review): this presumes blockSize is a power of two, blockDim.x ==
// blockSize, and that all threads of a block share the same (batch, query,
// head) - confirm against the launch site. batch_size is not read here.
//
// The output pointers are derived per loop iteration into local variables;
// the kernel arguments themselves are never modified, so every pass of the
// outer loop starts from the true base addresses.
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
    CUDA_KERNEL_LOOP(index, n)
    {
        __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
        __shared__ scalar_t cache_grad_attn_weight[blockSize];
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % channels;
        _temp /= channels;
        // index / channels identifies one (batch, query, head) triple.
        const int sampling_index = _temp;
        const int m_col = _temp % num_heads;
        _temp /= num_heads;
        // The query index itself is not needed, only the batch index.
        const int b_col = _temp / num_query;

        const scalar_t top_grad = grad_col[index];

        int data_weight_ptr = sampling_index * num_levels * num_point;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        // Local output cursors for this iteration (two coordinates per weight).
        scalar_t *grad_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1);
        scalar_t *grad_weight_out = grad_attn_weight + grad_sampling_ptr;
        const int grad_weight_stride = 1;
        const int grad_loc_stride = 2;
        const int qid_stride = num_heads * channels;
        const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

        for (int l_col = 0; l_col < num_levels; ++l_col)
        {
            const int level_start_id = data_level_start_index[l_col];
            const int spatial_h_ptr = l_col << 1;
            const int spatial_h = data_spatial_shapes[spatial_h_ptr];
            const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
            const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
            const scalar_t *data_value_ptr = data_value + value_ptr_offset;
            scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

            for (int p_col = 0; p_col < num_point; ++p_col)
            {
                const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
                const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
                const scalar_t weight = data_attn_weight[data_weight_ptr];

                const scalar_t h_im = loc_h * spatial_h - 0.5;
                const scalar_t w_im = loc_w * spatial_w - 0.5;
                // Clear this thread's slots so skipped samples contribute 0.
                *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
                *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_attn_weight+threadIdx.x)=0;
                if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
                {
                    ms_deform_attn_col2im_bilinear(
                        data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
                        top_grad, weight, grad_value_ptr,
                        cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
                }

                // All partials must be visible before the reduction starts.
                __syncthreads();

                for (unsigned int s=blockSize/2; s>0; s>>=1)
                {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
                        cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
                        cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
                    }
                    __syncthreads();
                }

                if (tid == 0)
                {
                    *grad_loc_out = cache_grad_sampling_loc[0];
                    *(grad_loc_out + 1) = cache_grad_sampling_loc[1];
                    *grad_weight_out = cache_grad_attn_weight[0];
                }
                // Keep the cache intact until thread 0 has finished reading.
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_weight_out += grad_weight_stride;
                grad_loc_out += grad_loc_stride;
            }
        }
    }
}
// Backward kernel using dynamically sized shared memory and a serial
// reduction performed by thread 0.
//
// The dynamic shared buffer is split into 2 * blockDim.x entries for the
// sampling-location partials followed by blockDim.x entries for the
// attention-weight partials, so the launch must provide at least
// 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic shared memory.
//
// Per (level, point), every thread writes its own partial gradients into
// shared memory; after a block barrier, thread 0 sums all blockDim.x
// partials and stores the totals to grad_sampling_loc / grad_attn_weight.
// grad_value is updated with atomicAdd inside ms_deform_attn_col2im_bilinear.
//
// NOTE(review): the reduction sums every thread of the block into one output
// slot, which presumes all threads of a block share the same (batch, query,
// head) - confirm against the launch site. batch_size is not read here.
//
// The output pointers are derived per loop iteration into local variables;
// the kernel arguments themselves are never modified, so every pass of the
// outer loop starts from the true base addresses.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
    CUDA_KERNEL_LOOP(index, n)
    {
        extern __shared__ int _s[];
        scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
        scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % channels;
        _temp /= channels;
        // index / channels identifies one (batch, query, head) triple.
        const int sampling_index = _temp;
        const int m_col = _temp % num_heads;
        _temp /= num_heads;
        // The query index itself is not needed, only the batch index.
        const int b_col = _temp / num_query;

        const scalar_t top_grad = grad_col[index];

        int data_weight_ptr = sampling_index * num_levels * num_point;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        // Local output cursors for this iteration (two coordinates per weight).
        scalar_t *grad_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1);
        scalar_t *grad_weight_out = grad_attn_weight + grad_sampling_ptr;
        const int grad_weight_stride = 1;
        const int grad_loc_stride = 2;
        const int qid_stride = num_heads * channels;
        const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

        for (int l_col = 0; l_col < num_levels; ++l_col)
        {
            const int level_start_id = data_level_start_index[l_col];
            const int spatial_h_ptr = l_col << 1;
            const int spatial_h = data_spatial_shapes[spatial_h_ptr];
            const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
            const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
            const scalar_t *data_value_ptr = data_value + value_ptr_offset;
            scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

            for (int p_col = 0; p_col < num_point; ++p_col)
            {
                const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
                const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
                const scalar_t weight = data_attn_weight[data_weight_ptr];

                const scalar_t h_im = loc_h * spatial_h - 0.5;
                const scalar_t w_im = loc_w * spatial_w - 0.5;
                // Clear this thread's slots so skipped samples contribute 0.
                *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
                *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_attn_weight+threadIdx.x)=0;
                if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
                {
                    ms_deform_attn_col2im_bilinear(
                        data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
                        top_grad, weight, grad_value_ptr,
                        cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
                }

                // All partials must be visible before thread 0 reads them.
                __syncthreads();
                if (tid == 0)
                {
                    scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
                    int sid=2;
                    for (unsigned int t = 1; t < blockDim.x; ++t)
                    {
                        _grad_w += cache_grad_sampling_loc[sid];
                        _grad_h += cache_grad_sampling_loc[sid + 1];
                        _grad_a += cache_grad_attn_weight[t];
                        sid += 2;
                    }

                    *grad_loc_out = _grad_w;
                    *(grad_loc_out + 1) = _grad_h;
                    *grad_weight_out = _grad_a;
                }
                // Keep the cache intact until thread 0 has finished reading.
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_weight_out += grad_weight_stride;
                grad_loc_out += grad_loc_stride;
            }
        }
    }
}
// Backward kernel using dynamically sized shared memory and a pairwise
// halving reduction that tolerates non-power-of-two block sizes.
//
// The dynamic shared buffer is split into 2 * blockDim.x entries for the
// sampling-location partials followed by blockDim.x entries for the
// attention-weight partials, so the launch must provide at least
// 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic shared memory.
//
// Per (level, point), every thread writes its own partial gradients into
// shared memory. The reduction halves the stride s each round while spre
// tracks the size of the range being folded; when that range is odd, the
// extra element at tid + 2*s (< spre) is folded in as well. Thread 0 then
// stores the totals to grad_sampling_loc / grad_attn_weight. grad_value is
// updated with atomicAdd inside ms_deform_attn_col2im_bilinear.
//
// NOTE(review): the reduction sums every thread of the block into one output
// slot, which presumes all threads of a block share the same (batch, query,
// head) - confirm against the launch site. batch_size is not read here.
//
// The output pointers are derived per loop iteration into local variables;
// the kernel arguments themselves are never modified, so every pass of the
// outer loop starts from the true base addresses.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
    CUDA_KERNEL_LOOP(index, n)
    {
        extern __shared__ int _s[];
        scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
        scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % channels;
        _temp /= channels;
        // index / channels identifies one (batch, query, head) triple.
        const int sampling_index = _temp;
        const int m_col = _temp % num_heads;
        _temp /= num_heads;
        // The query index itself is not needed, only the batch index.
        const int b_col = _temp / num_query;

        const scalar_t top_grad = grad_col[index];

        int data_weight_ptr = sampling_index * num_levels * num_point;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        // Local output cursors for this iteration (two coordinates per weight).
        scalar_t *grad_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1);
        scalar_t *grad_weight_out = grad_attn_weight + grad_sampling_ptr;
        const int grad_weight_stride = 1;
        const int grad_loc_stride = 2;
        const int qid_stride = num_heads * channels;
        const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

        for (int l_col = 0; l_col < num_levels; ++l_col)
        {
            const int level_start_id = data_level_start_index[l_col];
            const int spatial_h_ptr = l_col << 1;
            const int spatial_h = data_spatial_shapes[spatial_h_ptr];
            const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
            const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
            const scalar_t *data_value_ptr = data_value + value_ptr_offset;
            scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

            for (int p_col = 0; p_col < num_point; ++p_col)
            {
                const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
                const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
                const scalar_t weight = data_attn_weight[data_weight_ptr];

                const scalar_t h_im = loc_h * spatial_h - 0.5;
                const scalar_t w_im = loc_w * spatial_w - 0.5;
                // Clear this thread's slots so skipped samples contribute 0.
                *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
                *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_attn_weight+threadIdx.x)=0;
                if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
                {
                    ms_deform_attn_col2im_bilinear(
                        data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
                        top_grad, weight, grad_value_ptr,
                        cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
                }

                // All partials must be visible before the reduction starts.
                __syncthreads();

                for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
                {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
                        cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
                        cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
                        // Odd-sized range: fold in the leftover element too.
                        if (tid + (s << 1) < spre)
                        {
                            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
                            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
                            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
                        }
                    }
                    __syncthreads();
                }

                if (tid == 0)
                {
                    *grad_loc_out = cache_grad_sampling_loc[0];
                    *(grad_loc_out + 1) = cache_grad_sampling_loc[1];
                    *grad_weight_out = cache_grad_attn_weight[0];
                }
                // Keep the cache intact until thread 0 has finished reading.
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_weight_out += grad_weight_stride;
                grad_loc_out += grad_loc_stride;
            }
        }
    }
}
// Gradient kernel: for every flattened work item `index` it computes the contribution of one
// (batch, query, head, channel) element of grad_col to grad_value, grad_sampling_loc and
// grad_attn_weight. Per-thread contributions to the sampling-location / attention-weight
// gradients are staged in dynamic shared memory, summed across the block with a tree
// reduction, and the block total is added to global memory with atomicAdd by thread 0.
// NOTE(review): the template parameter list after `template` appears to be missing in this
// copy (scalar_t is used below but not declared in view) — confirm against the upstream file.
// NOTE(review): CUDA_KERNEL_LOOP and ms_deform_attn_col2im_bilinear are defined outside this
// view; their exact semantics are assumed from how they are used here.
template
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
// Dynamic shared memory, carved into two regions: 2 * blockDim.x entries for the
// sampling-location gradients followed by blockDim.x entries for the attention-weight
// gradients. The launch must therefore provide at least 3 * blockDim.x scalar_t elements.
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
// Decompose the flat index: channel is the fastest-varying component, then head, query, batch.
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
// index / channels: identifies the (batch, query, head) triple.
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
// q_col is computed but not referenced again in this kernel body.
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
// Offset of the first attention weight for this (batch, query, head); sampling locations
// use twice that offset because each point stores two coordinates.
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
// The output pointers (kernel parameters) are advanced in place.
// NOTE(review): if the CUDA_KERNEL_LOOP body ran more than once per thread these offsets
// would accumulate — confirm the launch guarantees a single iteration per thread.
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
// Per-level start position and (h, w) size; data_spatial_shapes stores h then w per level.
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
// Sampling location is stored as (w, h); it is scaled by the level size and shifted by 0.5.
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
// Clear this thread's shared-memory slots so out-of-range samples contribute zero.
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
// Only samples within one pixel of the feature map are passed to the bilinear helper.
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
// All threads must have written their slots before the reduction reads them.
__syncthreads();
// Tree reduction over the block: s is the current half-width, spre the width before halving.
for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
// When spre is odd, element 2*s has no partner; this condition holds only for tid == 0,
// which folds that leftover element in.
if (tid + (s << 1) < spre)
{
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
}
}
__syncthreads();
}
// Thread 0 holds the block total and accumulates it atomically into the global outputs
// (presumably because several blocks can target the same output element — confirm).
if (tid == 0)
{
atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
}
// Keep the shared buffers intact until thread 0 has read them, then move to the next point.
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
// Gradient kernel variant that uses no shared memory: each work item hands the global
// grad_sampling_loc / grad_attn_weight pointers directly to
// ms_deform_attn_col2im_bilinear_gm (defined outside this view; presumably it accumulates
// into global memory atomically — confirm).
// NOTE(review): the template parameter list after `template` appears to be missing in this
// copy (scalar_t is used below but not declared in view) — confirm against the upstream file.
template
__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
// Decompose the flat index: channel is the fastest-varying component, then head, query, batch.
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
// index / channels: identifies the (batch, query, head) triple.
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
// q_col is computed but not referenced again in this kernel body.
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
// Offset of the first attention weight for this (batch, query, head); sampling locations
// use twice that offset because each point stores two coordinates.
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
// The output pointers (kernel parameters) are advanced in place.
// NOTE(review): if the CUDA_KERNEL_LOOP body ran more than once per thread these offsets
// would accumulate — confirm the launch guarantees a single iteration per thread.
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
// Per-level start position and (h, w) size; data_spatial_shapes stores h then w per level.
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
// Sampling location is stored as (w, h); it is scaled by the level size and shifted by 0.5.
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
// Samples further than one pixel outside the feature map are skipped entirely.
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear_gm(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
grad_sampling_loc, grad_attn_weight);
}
// Advance to the next sampling point (one weight, two location coordinates).
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
// Host-side launcher for ms_deformable_im2col_gpu_kernel (the forward computation that
// writes data_col). One work item is issued per (batch, query, head, channel) element.
// A launch failure is only reported with printf; it is not propagated to the caller.
// NOTE(review): the template parameter list after `template` and the kernel launch
// configuration inside `<<>>` appear to have been stripped from this copy; as written,
// `stream`, `num_threads` and `num_actual_kernels` are never used — confirm against the
// upstream file.
template
void ms_deformable_im2col_cuda(cudaStream_t stream,
const scalar_t* data_value,
const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index,
const scalar_t* data_sampling_loc,
const scalar_t* data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* data_col)
{
// num_kernels and num_actual_kernels are computed from the same expression.
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
const int num_threads = CUDA_NUM_THREADS;
ms_deformable_im2col_gpu_kernel
<<>>(
num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
// Surface launch errors (the launch itself does not return a status).
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
}
}
// Host-side launcher for the backward (col2im) kernels. It picks a kernel variant from the
// per-head channel count:
//   * channels > 1024 and a multiple of 1024 -> ..._shm_reduce_v2_multi_blocks
//   * channels > 1024 otherwise              -> ..._gm
//   * channels in {1, 2, 4, 8, 16, 32}       -> ..._shm_blocksize_aware_reduce_v1
//   * channels in {64, 128, 256, 512, 1024}  -> ..._shm_blocksize_aware_reduce_v2
//   * any other channels < 64                -> ..._shm_reduce_v1
//   * any other channels in (64, 1024)       -> ..._shm_reduce_v2
// Every variant receives the same argument list. A launch failure is only reported with
// printf; it is not propagated to the caller.
// NOTE(review): the template parameter list after `template`, the kernel template arguments
// and the launch configuration inside each `<<>>` appear to have been stripped from this
// copy (the power-of-two cases are textually identical here; presumably each carried a
// distinct block-size template argument) — confirm against the upstream file.
template
void ms_deformable_col2im_cuda(cudaStream_t stream,
const scalar_t* grad_col,
const scalar_t* data_value,
const int64_t * data_spatial_shapes,
const int64_t * data_level_start_index,
const scalar_t * data_sampling_loc,
const scalar_t * data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
// Thread count is min(channels, CUDA_NUM_THREADS).
const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
// One work item per (batch, query, head, channel); both counters use the same expression.
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
if (channels > 1024)
{
// (channels & 1023) == 0 tests whether channels is a multiple of 1024.
if ((channels & 1023) == 0)
{
ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
else
{
// Fallback that does not rely on a shared-memory reduction.
ms_deformable_col2im_gpu_kernel_gm
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
}
else{
// channels <= 1024: power-of-two sizes get the "blocksize aware" kernels.
switch(channels)
{
case 1:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 2:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 4:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 8:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 16:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 32:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
// From 64 channels upward the v2 reduction kernel is used.
case 64:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 128:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 256:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 512:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 1024:
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
// Channel counts that are not one of the listed powers of two.
default:
if (channels < 64)
{
ms_deformable_col2im_gpu_kernel_shm_reduce_v1
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
else
{
ms_deformable_col2im_gpu_kernel_shm_reduce_v2
<<>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
}
}
// Surface launch errors (the launch itself does not return a status).
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
}
}
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include "cpu/ms_deform_attn_cpu.h"
#ifdef WITH_CUDA
#include "cuda/ms_deform_attn_cuda.h"
#endif
/*!
 * Device dispatcher for the multi-scale deformable attention forward pass.
 *
 * Forwards all arguments unchanged to ms_deform_attn_cuda_forward when `value` lives on a
 * CUDA device and the extension was built with WITH_CUDA.
 *
 * Raises (via AT_ERROR):
 *   - "Not compiled with GPU support" when `value` is a CUDA tensor but WITH_CUDA is undefined;
 *   - "Not implemented on the CPU" when `value` is not a CUDA tensor.
 */
at::Tensor
ms_deform_attn_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    // Tensor::is_cuda() replaces the deprecated Tensor::type().is_cuda() query.
    if (value.is_cuda())
    {
#ifdef WITH_CUDA
        return ms_deform_attn_cuda_forward(
            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
#else
        AT_ERROR("Not compiled with GPU support");
#endif
    }
    AT_ERROR("Not implemented on the CPU");
}
std::vector
ms_deform_attn_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
if (value.type().is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_backward(
value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/vision.cpp
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include "ms_deform_attn.h"
// Python bindings: exposes the two dispatcher functions declared in ms_deform_attn.h under
// the same names. TORCH_EXTENSION_NAME is supplied by the build (not defined in this file).
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// Third argument of each m.def is the Python-side docstring.
m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
}
================================================
FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/test.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import time
import torch
import torch.nn as nn
from torch.autograd import gradcheck
from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
# Problem sizes shared by every check below. `value` is built as (N, S, M, D) and the
# sampling locations as (N, Lq, M, L, P, 2); D is the per-head channel count.
N = 1
M = 2
D = 2
Lq = 2
L = 2
P = 2

# One (H, W) row per feature level, kept on the GPU.
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
# Start offset of each level inside the flattened value tensor: exclusive prefix sum of H*W.
level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
# Total number of spatial positions over all levels.
S = sum(int(height * width) for height, width in shapes)

# Fixed seed so the random inputs of the checks are reproducible.
torch.manual_seed(3)
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
    """Compare the CUDA forward op with the pure-PyTorch reference in float64.

    Random inputs are drawn on the GPU, converted to double, and pushed through both
    implementations; the result of ``torch.allclose`` and the maximum absolute and
    relative errors are printed.
    """
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise the weights so they sum to one over all levels and points.
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2

    value_d = value.double()
    locations_d = sampling_locations.double()
    weights_d = attention_weights.double()

    output_pytorch = ms_deform_attn_core_pytorch(value_d, shapes, locations_d, weights_d)
    output_pytorch = output_pytorch.detach().cpu()
    output_cuda = MSDeformAttnFunction.apply(
        value_d, shapes, level_start_index, locations_d, weights_d, im2col_step
    )
    output_cuda = output_cuda.detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch)
    abs_diff = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / output_pytorch.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
    """Compare the CUDA forward op with the pure-PyTorch reference in float32.

    Same procedure as the double-precision check, but the inputs keep their default
    dtype and the comparison uses looser tolerances (rtol=1e-2, atol=1e-3).
    """
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise the weights so they sum to one over all levels and points.
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2

    output_pytorch = ms_deform_attn_core_pytorch(
        value, shapes, sampling_locations, attention_weights
    )
    output_pytorch = output_pytorch.detach().cpu()
    output_cuda = MSDeformAttnFunction.apply(
        value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step
    )
    output_cuda = output_cuda.detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
    abs_diff = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / output_pytorch.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
    """Run ``torch.autograd.gradcheck`` on the CUDA op for a given channel count.

    Args:
        channels: size of the last dimension of ``value``.
        grad_value: whether ``value`` requires a gradient.
        grad_sampling_loc: whether the sampling locations require a gradient.
        grad_attn_weight: whether the attention weights require a gradient.

    The gradcheck result is printed together with the channel count.
    """
    value = torch.rand(N, S, M, channels).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise the weights so they sum to one over all levels and points.
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2
    func = MSDeformAttnFunction.apply

    value.requires_grad = grad_value
    sampling_locations.requires_grad = grad_sampling_loc
    attention_weights.requires_grad = grad_attn_weight

    # gradcheck is run on double-precision copies of the inputs.
    check_inputs = (
        value.double(),
        shapes,
        level_start_index,
        sampling_locations.double(),
        attention_weights.double(),
        im2col_step,
    )
    gradok = gradcheck(func, check_inputs)

    print(f'* {gradok} check_gradient_numerical(D={channels})')
if __name__ == '__main__':
    # Forward checks first (double, then float), then gradient checks over a range of
    # channel counts with all three gradients enabled.
    check_forward_equal_with_pytorch_double()
    check_forward_equal_with_pytorch_float()

    channel_counts = [30, 32, 64, 71, 1025, 2048, 3096]
    for channels in channel_counts:
        check_gradient_numerical(channels, True, True, True)
================================================
FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/__init__.py
================================================
from .maskformer_transformer_decoder import StandardTransformerDecoder
from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder
================================================
FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
import logging
import fvcore.nn.weight_init as weight_init
from typing import Optional
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import Conv2d
from .position_encoding import PositionEmbeddingSine
from .maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY
class SelfAttentionLayer(nn.Module):
    """Residual multi-head self-attention block with one LayerNorm.

    With ``normalize_before=False`` the block computes ``norm(tgt + dropout(attn(tgt)))``;
    with ``normalize_before=True`` it computes ``tgt + dropout(attn(norm(tgt)))``.
    The optional ``query_pos`` is added to queries and keys only, never to the values.
    """

    def __init__(self, d_model, nhead, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        # Resolved and stored, but not used by any forward path of this layer.
        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter that has more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt,
                     tgt_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        """Post-norm variant: attention, residual add, then LayerNorm."""
        # The same tensor object serves as both query and key.
        qk = self.with_pos_embed(tgt, query_pos)
        attn_result = self.self_attn(qk, qk, value=tgt, attn_mask=tgt_mask,
                                     key_padding_mask=tgt_key_padding_mask)
        residual = tgt + self.dropout(attn_result[0])
        return self.norm(residual)

    def forward_pre(self, tgt,
                    tgt_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        """Pre-norm variant: LayerNorm, attention, then residual add."""
        normed = self.norm(tgt)
        # The same tensor object serves as both query and key.
        qk = self.with_pos_embed(normed, query_pos)
        attn_result = self.self_attn(qk, qk, value=normed, attn_mask=tgt_mask,
                                     key_padding_mask=tgt_key_padding_mask)
        return tgt + self.dropout(attn_result[0])

    def forward(self, tgt,
                tgt_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Dispatch to the pre-norm or post-norm implementation."""
        run = self.forward_pre if self.normalize_before else self.forward_post
        return run(tgt, tgt_mask, tgt_key_padding_mask, query_pos)
class CrossAttentionLayer(nn.Module):
    """Residual multi-head cross-attention block with one LayerNorm.

    Queries come from ``tgt`` (plus ``query_pos``), keys from ``memory`` (plus ``pos``)
    and values from ``memory`` without any positional term. ``normalize_before`` selects
    between the post-norm ``norm(tgt + dropout(attn))`` and the pre-norm
    ``tgt + dropout(attn(norm(tgt)))`` arrangement.
    """

    def __init__(self, d_model, nhead, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        # Resolved and stored, but not used by any forward path of this layer.
        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter that has more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt, memory,
                     memory_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        """Post-norm variant: attention, residual add, then LayerNorm."""
        attn_result = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
                                          key=self.with_pos_embed(memory, pos),
                                          value=memory, attn_mask=memory_mask,
                                          key_padding_mask=memory_key_padding_mask)
        residual = tgt + self.dropout(attn_result[0])
        return self.norm(residual)

    def forward_pre(self, tgt, memory,
                    memory_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        """Pre-norm variant: LayerNorm on the queries, attention, then residual add."""
        normed = self.norm(tgt)
        attn_result = self.multihead_attn(query=self.with_pos_embed(normed, query_pos),
                                          key=self.with_pos_embed(memory, pos),
                                          value=memory, attn_mask=memory_mask,
                                          key_padding_mask=memory_key_padding_mask)
        return tgt + self.dropout(attn_result[0])

    def forward(self, tgt, memory,
                memory_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Dispatch to the pre-norm or post-norm implementation."""
        run = self.forward_pre if self.normalize_before else self.forward_post
        return run(tgt, memory, memory_mask,
                   memory_key_padding_mask, pos, query_pos)
class FFNLayer(nn.Module):
    """Residual two-layer feed-forward block with one LayerNorm.

    The inner network is ``linear2(dropout(activation(linear1(x))))``.
    ``normalize_before`` selects between post-norm ``norm(tgt + dropout(ffn(tgt)))``
    and pre-norm ``tgt + dropout(ffn(norm(tgt)))``.
    """

    def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        # Feed-forward network: d_model -> dim_feedforward -> d_model.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm = nn.LayerNorm(d_model)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter that has more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt):
        """Post-norm variant: feed-forward, residual add, then LayerNorm."""
        hidden = self.activation(self.linear1(tgt))
        update = self.linear2(self.dropout(hidden))
        residual = tgt + self.dropout(update)
        return self.norm(residual)

    def forward_pre(self, tgt):
        """Pre-norm variant: LayerNorm, feed-forward, then residual add."""
        normed = self.norm(tgt)
        hidden = self.activation(self.linear1(normed))
        update = self.linear2(self.dropout(hidden))
        return tgt + self.dropout(update)

    def forward(self, tgt):
        """Dispatch to the pre-norm or post-norm implementation."""
        run = self.forward_pre if self.normalize_before else self.forward_post
        return run(tgt)
def _get_activation_fn(activation):
    """Return an activation function given a string.

    Args:
        activation: name of the activation; one of "relu", "gelu" or "glu".

    Returns:
        The matching function from ``torch.nn.functional``.

    Raises:
        RuntimeError: if ``activation`` is not one of the supported names.
    """
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    # The message lists every accepted name, including "glu".
    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
class MLP(nn.Module):
    """Very simple multi-layer perceptron (also called FFN).

    Stacks ``num_layers`` linear layers: ``input_dim -> hidden_dim -> ... -> output_dim``.
    ReLU is applied after every layer except the last one.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer widths from input to output; consecutive pairs define each Linear.
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])
        )

    def forward(self, x):
        last_index = self.num_layers - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            # No non-linearity after the final projection.
            if index < last_index:
                x = F.relu(x)
        return x
@TRANSFORMER_DECODER_REGISTRY.register()
class MultiScaleMaskedTransformerDecoder(nn.Module):
_version = 2
def _load_from_state_dict(
    self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
    """Rename legacy checkpoint keys before loading.

    Checkpoints whose metadata has no ``version`` or a version below 2 may still
    contain keys with ``static_query``; each such key is renamed in place so that it
    uses ``query_feat`` instead. A warning is logged only when at least one key was
    renamed, so training from scratch stays silent.
    """
    version = local_metadata.get("version", None)
    if version is not None and version >= 2:
        return

    logger = logging.getLogger(__name__)

    # Snapshot the affected keys first because the dict is mutated below.
    legacy_keys = [key for key in list(state_dict.keys()) if "static_query" in key]
    for old_key in legacy_keys:
        new_key = old_key.replace("static_query", "query_feat")
        state_dict[new_key] = state_dict[old_key]
        del state_dict[old_key]

    if legacy_keys:
        logger.warning(
            f"Weight format of {self.__class__.__name__} have changed! "
            "Please upgrade your models. Applying automatic conversion now ..."
        )
@configurable
def __init__(
    self,
    in_channels,
    mask_classification=True,
    *,
    num_classes: int,
    hidden_dim: int,
    num_queries: int,
    nheads: int,
    dim_feedforward: int,
    dec_layers: int,
    pre_norm: bool,
    mask_dim: int,
    enforce_input_project: bool,
):
    """
    NOTE: this interface is experimental.
    Args:
        in_channels: channels of the input features
        mask_classification: whether to add mask classifier or not; must be
            truthy, only mask classification is supported
        num_classes: number of classes (the classifier predicts num_classes + 1 logits)
        hidden_dim: Transformer feature dimension
        num_queries: number of queries
        nheads: number of heads
        dim_feedforward: feature dimension in feedforward network
        dec_layers: number of Transformer decoder layers
        pre_norm: whether to use pre-LayerNorm or not
        mask_dim: mask feature dimension
        enforce_input_project: add input project 1x1 conv even if input
            channels and hidden dim is identical
    """
    super().__init__()

    assert mask_classification, "Only support mask classification model"
    self.mask_classification = mask_classification

    # Sine positional encoding; half of the hidden size is used per spatial axis.
    self.pe_layer = PositionEmbeddingSine(hidden_dim // 2, normalize=True)

    # Transformer decoder: every layer is a (self-attention, cross-attention, FFN) triple.
    self.num_heads = nheads
    self.num_layers = dec_layers
    self.transformer_self_attention_layers = nn.ModuleList()
    self.transformer_cross_attention_layers = nn.ModuleList()
    self.transformer_ffn_layers = nn.ModuleList()

    attention_kwargs = dict(
        d_model=hidden_dim,
        nhead=nheads,
        dropout=0.0,
        normalize_before=pre_norm,
    )
    ffn_kwargs = dict(
        d_model=hidden_dim,
        dim_feedforward=dim_feedforward,
        dropout=0.0,
        normalize_before=pre_norm,
    )
    # Modules are created in the same interleaved order as before so that random
    # initialisation consumes the RNG identically.
    for _ in range(self.num_layers):
        self.transformer_self_attention_layers.append(SelfAttentionLayer(**attention_kwargs))
        self.transformer_cross_attention_layers.append(CrossAttentionLayer(**attention_kwargs))
        self.transformer_ffn_layers.append(FFNLayer(**ffn_kwargs))

    self.decoder_norm = nn.LayerNorm(hidden_dim)

    self.num_queries = num_queries
    # Learnable query features.
    self.query_feat = nn.Embedding(num_queries, hidden_dim)
    # Learnable query positional embedding.
    self.query_embed = nn.Embedding(num_queries, hidden_dim)

    # Level embedding (we always use 3 scales).
    self.num_feature_levels = 3
    self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)

    # Per-level input projection: a 1x1 conv when the channel counts differ or when
    # forced by the config, otherwise an empty (identity) Sequential.
    needs_projection = in_channels != hidden_dim or enforce_input_project
    self.input_proj = nn.ModuleList()
    for _ in range(self.num_feature_levels):
        if needs_projection:
            projection = Conv2d(in_channels, hidden_dim, kernel_size=1)
            weight_init.c2_xavier_fill(projection)
        else:
            projection = nn.Sequential()
        self.input_proj.append(projection)

    # Output heads: class logits (with one extra class) and the mask embedding MLP.
    if self.mask_classification:
        self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
    self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
@classmethod
def from_config(cls, cfg, in_channels, mask_classification):
    """Translate a config node into the keyword arguments of ``__init__``.

    Args:
        cfg: config object exposing ``MODEL.MASK_FORMER`` and ``MODEL.SEM_SEG_HEAD``.
        in_channels: channels of the input features, passed through unchanged.
        mask_classification: passed through unchanged.

    Returns:
        dict of constructor arguments.
    """
    mask_former = cfg.MODEL.MASK_FORMER
    sem_seg_head = cfg.MODEL.SEM_SEG_HEAD

    # NOTE: because we add learnable query features which requires supervision,
    # we add minus 1 to decoder layers to be consistent with our loss
    # implementation: that is, number of auxiliary losses is always
    # equal to number of decoder layers. With learnable query features, the number of
    # auxiliary losses equals number of decoders plus 1.
    assert mask_former.DEC_LAYERS >= 1

    return {
        "in_channels": in_channels,
        "mask_classification": mask_classification,
        "num_classes": sem_seg_head.NUM_CLASSES,
        "hidden_dim": mask_former.HIDDEN_DIM,
        "num_queries": mask_former.NUM_OBJECT_QUERIES,
        # Transformer parameters:
        "nheads": mask_former.NHEADS,
        "dim_feedforward": mask_former.DIM_FEEDFORWARD,
        "dec_layers": mask_former.DEC_LAYERS - 1,
        "pre_norm": mask_former.PRE_NORM,
        "enforce_input_project": mask_former.ENFORCE_INPUT_PROJ,
        "mask_dim": sem_seg_head.MASK_DIM,
    }
def forward(self, x, mask_features, mask = None):
    """Run the masked-attention decoder over the multi-scale features.

    Args:
        x: list of ``self.num_feature_levels`` feature maps (indexed per level).
        mask_features: per-pixel features combined with the query embeddings
            in ``forward_prediction_heads`` to produce mask logits.
        mask: accepted for interface compatibility and discarded immediately.

    Returns:
        dict with ``pred_logits`` / ``pred_masks`` from the last prediction
        step and ``aux_outputs`` built by ``_set_aux_loss`` from the earlier
        steps.
    """
    # x is a list of multi-scale feature
    assert len(x) == self.num_feature_levels
    src = []
    pos = []
    size_list = []
    # disable mask, it does not affect performance
    del mask
    for i in range(self.num_feature_levels):
        # remember the spatial size of each level; it is the resolution the
        # attention mask is resized to when that level is attended
        size_list.append(x[i].shape[-2:])
        pos.append(self.pe_layer(x[i], None).flatten(2))
        # projected features plus a learned per-level embedding
        src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None])
        # flatten NxCxHxW to HWxNxC
        pos[-1] = pos[-1].permute(2, 0, 1)
        src[-1] = src[-1].permute(2, 0, 1)
    _, bs, _ = src[0].shape
    # QxNxC
    query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
    output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)
    predictions_class = []
    predictions_mask = []
    # prediction heads on learnable query features
    outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0])
    predictions_class.append(outputs_class)
    predictions_mask.append(outputs_mask)
    for i in range(self.num_layers):
        # feature levels are visited round-robin
        level_index = i % self.num_feature_levels
        # a row that masks out every key is reset to all-False (in place), so
        # that query attends to all positions instead of none
        attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False
        # attention: cross-attention first
        output = self.transformer_cross_attention_layers[i](
            output, src[level_index],
            memory_mask=attn_mask,
            memory_key_padding_mask=None,  # here we do not apply masking on padded region
            pos=pos[level_index], query_pos=query_embed
        )
        output = self.transformer_self_attention_layers[i](
            output, tgt_mask=None,
            tgt_key_padding_mask=None,
            query_pos=query_embed
        )
        # FFN
        output = self.transformer_ffn_layers[i](
            output
        )
        # predict again; the new attention mask is sized for the level the
        # next iteration will attend to
        outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels])
        predictions_class.append(outputs_class)
        predictions_mask.append(outputs_mask)
    # one prediction from the raw query features plus one per layer
    assert len(predictions_class) == self.num_layers + 1
    out = {
        'pred_logits': predictions_class[-1],
        'pred_masks': predictions_mask[-1],
        'aux_outputs': self._set_aux_loss(
            predictions_class if self.mask_classification else None, predictions_mask
        )
    }
    return out
def forward_prediction_heads(self, output, mask_features, attn_mask_target_size):
    """Turn query features into class logits, mask logits and an attention mask.

    Args:
        output: query features; normalised and then transposed on dims 0/1
            before the heads are applied.
        mask_features: per-pixel features contracted with the mask embeddings
            (``"bqc,bchw->bqhw"``).
        attn_mask_target_size: spatial size the mask logits are resized to
            before thresholding.

    Returns:
        tuple: ``(outputs_class, outputs_mask, attn_mask)`` where ``attn_mask``
        is a detached boolean tensor replicated ``self.num_heads`` times.
    """
    queries = self.decoder_norm(output).transpose(0, 1)
    outputs_class = self.class_embed(queries)
    outputs_mask = torch.einsum("bqc,bchw->bqhw", self.mask_embed(queries), mask_features)

    # NOTE: prediction is of higher-resolution
    # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW]
    resized_logits = F.interpolate(
        outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False
    )
    foreground_prob = resized_logits.sigmoid().flatten(2)
    per_head_prob = foreground_prob.unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1)

    # must use bool type
    # If a BoolTensor is provided, positions with ``True`` are not allowed to
    # attend while ``False`` values will be unchanged.
    attn_mask = (per_head_prob < 0.5).bool().detach()
    return outputs_class, outputs_mask, attn_mask
@torch.jit.unused
def _set_aux_loss(self, outputs_class, outputs_seg_masks):
    """Pair up all but the last prediction as a list of auxiliary outputs."""
    # this is a workaround to make torchscript happy, as torchscript
    # doesn't support dictionary with non-homogeneous values, such
    # as a dict having both a Tensor and a list.
    if not self.mask_classification:
        return [{"pred_masks": masks} for masks in outputs_seg_masks[:-1]]
    return [
        {"pred_logits": logits, "pred_masks": masks}
        for logits, masks in zip(outputs_class[:-1], outputs_seg_masks[:-1])
    ]
================================================
FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import Conv2d
from detectron2.utils.registry import Registry
from .position_encoding import PositionEmbeddingSine
from .transformer import Transformer
# Name -> class registry for transformer decoders. Classes in this file add
# themselves with ``@TRANSFORMER_DECODER_REGISTRY.register()`` and are looked
# up by name in ``build_transformer_decoder``.
TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE")
TRANSFORMER_DECODER_REGISTRY.__doc__ = """
Registry for transformer module in MaskFormer.
"""
def build_transformer_decoder(cfg, in_channels, mask_classification=True):
    """
    Build the transformer decoder registered under
    `cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME`.

    The registered object is called as ``(cfg, in_channels, mask_classification)``
    and its result is returned.
    """
    decoder_factory = TRANSFORMER_DECODER_REGISTRY.get(
        cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME
    )
    return decoder_factory(cfg, in_channels, mask_classification)
@TRANSFORMER_DECODER_REGISTRY.register()
class StandardTransformerDecoder(nn.Module):
    """Encoder-decoder ``Transformer`` wrapped with class and mask heads."""

    @configurable
    def __init__(
        self,
        in_channels,
        mask_classification=True,
        *,
        num_classes: int,
        hidden_dim: int,
        num_queries: int,
        nheads: int,
        dropout: float,
        dim_feedforward: int,
        enc_layers: int,
        dec_layers: int,
        pre_norm: bool,
        deep_supervision: bool,
        mask_dim: int,
        enforce_input_project: bool,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            in_channels: channels of the input features
            mask_classification: whether to add mask classifier or not
            num_classes: number of classes
            hidden_dim: Transformer feature dimension
            num_queries: number of queries
            nheads: number of heads
            dropout: dropout in Transformer
            dim_feedforward: feature dimension in feedforward network
            enc_layers: number of Transformer encoder layers
            dec_layers: number of Transformer decoder layers
            pre_norm: whether to use pre-LayerNorm or not
            deep_supervision: whether to add supervision to every decoder layers
            mask_dim: mask feature dimension
            enforce_input_project: add input project 1x1 conv even if input
                channels and hidden dim is identical
        """
        super().__init__()

        self.mask_classification = mask_classification

        # positional encoding: half of the channels per spatial axis
        self.pe_layer = PositionEmbeddingSine(hidden_dim // 2, normalize=True)

        self.num_queries = num_queries
        self.transformer = Transformer(
            d_model=hidden_dim,
            dropout=dropout,
            nhead=nheads,
            dim_feedforward=dim_feedforward,
            num_encoder_layers=enc_layers,
            num_decoder_layers=dec_layers,
            normalize_before=pre_norm,
            return_intermediate_dec=deep_supervision,
        )
        hidden_dim = self.transformer.d_model

        self.query_embed = nn.Embedding(num_queries, hidden_dim)

        # 1x1 projection only when the channel count differs (or when forced)
        if enforce_input_project or in_channels != hidden_dim:
            self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1)
            weight_init.c2_xavier_fill(self.input_proj)
        else:
            self.input_proj = nn.Sequential()
        self.aux_loss = deep_supervision

        # output FFNs
        if self.mask_classification:
            self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)

    @classmethod
    def from_config(cls, cfg, in_channels, mask_classification):
        """Map ``cfg`` onto the keyword arguments of ``__init__``."""
        head_cfg = cfg.MODEL.SEM_SEG_HEAD
        decoder_cfg = cfg.MODEL.MASK_FORMER
        return {
            "in_channels": in_channels,
            "mask_classification": mask_classification,
            "num_classes": head_cfg.NUM_CLASSES,
            "hidden_dim": decoder_cfg.HIDDEN_DIM,
            "num_queries": decoder_cfg.NUM_OBJECT_QUERIES,
            # Transformer parameters:
            "nheads": decoder_cfg.NHEADS,
            "dropout": decoder_cfg.DROPOUT,
            "dim_feedforward": decoder_cfg.DIM_FEEDFORWARD,
            "enc_layers": decoder_cfg.ENC_LAYERS,
            "dec_layers": decoder_cfg.DEC_LAYERS,
            "pre_norm": decoder_cfg.PRE_NORM,
            "deep_supervision": decoder_cfg.DEEP_SUPERVISION,
            "enforce_input_project": decoder_cfg.ENFORCE_INPUT_PROJ,
            "mask_dim": head_cfg.MASK_DIM,
        }

    def forward(self, x, mask_features, mask=None):
        """Decode ``x`` into class logits (optional) and mask logits.

        Returns a dict with ``pred_masks``, plus ``pred_logits`` when mask
        classification is enabled and ``aux_outputs`` when deep supervision
        is enabled.
        """
        if mask is not None:
            # bring the mask to the feature resolution
            resized = F.interpolate(mask[None].float(), size=x.shape[-2:])
            mask = resized.to(torch.bool)[0]
        pos = self.pe_layer(x, mask)

        hs, _memory = self.transformer(self.input_proj(x), mask, self.query_embed.weight, pos)

        out = {}
        outputs_class = None
        if self.mask_classification:
            outputs_class = self.class_embed(hs)
            out["pred_logits"] = outputs_class[-1]

        if not self.aux_loss:
            # FIXME h_boxes takes the last one computed, keep this in mind
            # [bs, queries, embed]
            last_embed = self.mask_embed(hs[-1])
            out["pred_masks"] = torch.einsum("bqc,bchw->bqhw", last_embed, mask_features)
            return out

        # [l, bs, queries, embed]
        per_layer_embed = self.mask_embed(hs)
        outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", per_layer_embed, mask_features)
        out["pred_masks"] = outputs_seg_masks[-1]
        out["aux_outputs"] = self._set_aux_loss(outputs_class, outputs_seg_masks)
        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_seg_masks):
        """Pair up all but the last prediction as a list of auxiliary outputs."""
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        if not self.mask_classification:
            return [{"pred_masks": masks} for masks in outputs_seg_masks[:-1]]
        return [
            {"pred_logits": logits, "pred_masks": masks}
            for logits, masks in zip(outputs_class[:-1], outputs_seg_masks[:-1])
        ]
class MLP(nn.Module):
    """Very simple multi-layer perceptron (also called FFN)"""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        """Stack linear layers ``input_dim -> hidden_dim ... -> output_dim``."""
        super().__init__()
        self.num_layers = num_layers
        # layer widths from input to output, e.g. [in, hid, hid, out]
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])
        )

    def forward(self, x):
        """Apply every layer, with ReLU after all but the last one."""
        last_index = self.num_layers - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            if index < last_index:
                x = F.relu(x)
        return x
================================================
FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/position_encoding.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
"""
Various positional encodings for the transformer.
"""
import math
import torch
from torch import nn
class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        """Store the encoding settings; ``scale`` defaults to ``2 * pi``.

        Raises:
            ValueError: if ``scale`` is given while ``normalize`` is ``False``.
        """
        super().__init__()
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, x, mask=None):
        """Return sine/cosine position features for every location of ``x``.

        ``mask`` marks positions to exclude; when omitted nothing is excluded.
        The result has ``2 * num_pos_feats`` channels (y features first, then
        x features) laid out as (batch, channel, height, width).
        """
        if mask is None:
            mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
        valid = ~mask
        # running count of valid positions along each spatial axis
        y_embed = valid.cumsum(1, dtype=torch.float32)
        x_embed = valid.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        def _sin_cos(embed):
            # sine on even feature slots, cosine on odd ones, interleaved
            angles = embed[:, :, :, None] / dim_t
            paired = torch.stack(
                (angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos()), dim=4
            )
            return paired.flatten(3)

        pos_x = _sin_cos(x_embed)
        pos_y = _sin_cos(y_embed)
        return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)

    def __repr__(self, _repr_indent=4):
        pad = " " * _repr_indent
        lines = ["Positional encoding " + self.__class__.__name__]
        for field in ("num_pos_feats", "temperature", "normalize", "scale"):
            lines.append(pad + "{}: {}".format(field, getattr(self, field)))
        return "\n".join(lines)
================================================
FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/transformer.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/transformer.py
"""
Transformer class.
Copy-paste from torch.nn.Transformer with modifications:
* positional encodings are passed in MHattention
* extra LN at the end of encoder is removed
* decoder returns a stack of activations from all decoding layers
"""
import copy
from typing import List, Optional
import torch
import torch.nn.functional as F
from torch import Tensor, nn
class Transformer(nn.Module):
    """Encoder-decoder transformer operating on flattened feature maps."""

    def __init__(
        self,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        num_decoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
        return_intermediate_dec=False,
    ):
        super().__init__()

        # both layer types take the same positional constructor arguments
        layer_args = (d_model, nhead, dim_feedforward, dropout, activation, normalize_before)

        self.encoder = TransformerEncoder(
            TransformerEncoderLayer(*layer_args),
            num_encoder_layers,
            # the encoder only gets a final norm in the pre-norm configuration
            nn.LayerNorm(d_model) if normalize_before else None,
        )
        self.decoder = TransformerDecoder(
            TransformerDecoderLayer(*layer_args),
            num_decoder_layers,
            nn.LayerNorm(d_model),
            return_intermediate=return_intermediate_dec,
        )

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        """Xavier-initialise every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def forward(self, src, mask, query_embed, pos_embed):
        """Encode ``src`` and decode it with the given queries.

        Returns the decoder output with dims 1 and 2 swapped, and the encoder
        memory reshaped back to the shape of ``src``.
        """
        bs, c, h, w = src.shape

        # flatten NxCxHxW to HWxNxC
        flat_src = src.flatten(2).permute(2, 0, 1)
        flat_pos = pos_embed.flatten(2).permute(2, 0, 1)
        # one copy of the queries per batch element
        queries = query_embed.unsqueeze(1).repeat(1, bs, 1)
        if mask is not None:
            mask = mask.flatten(1)

        # the decoder input starts at zero; queries enter as positional terms
        tgt = torch.zeros_like(queries)
        memory = self.encoder(flat_src, src_key_padding_mask=mask, pos=flat_pos)
        hs = self.decoder(
            tgt, memory, memory_key_padding_mask=mask, pos=flat_pos, query_pos=queries
        )
        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
class TransformerEncoder(nn.Module):
    """Stack of ``num_layers`` copies of an encoder layer with an optional final norm."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(
        self,
        src,
        mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        """Feed ``src`` through every layer in order, then through ``norm`` if set."""
        hidden = src
        for encoder_layer in self.layers:
            hidden = encoder_layer(
                hidden, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos
            )
        return hidden if self.norm is None else self.norm(hidden)
class TransformerDecoder(nn.Module):
    """Stack of ``num_layers`` copies of a decoder layer with an optional final norm.

    With ``return_intermediate=True`` the output of every layer is collected
    and returned stacked along a new leading dimension.
    """

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        """Run ``tgt`` through every decoder layer against ``memory``.

        Returns:
            The stacked per-layer outputs when ``return_intermediate`` is set,
            otherwise the final output with a leading dimension of size 1.
        """
        output = tgt
        intermediate = []

        for layer in self.layers:
            output = layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask,
                pos=pos,
                query_pos=query_pos,
            )
            if self.return_intermediate:
                # ``norm`` is optional: collect the raw layer output when it
                # is absent instead of calling ``None``.
                intermediate.append(output if self.norm is None else self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                # keep the last collected entry identical to the final output
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output.unsqueeze(0)
class TransformerEncoderLayer(nn.Module):
    """Self-attention followed by a feed-forward block, in post- or pre-norm form."""

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add ``pos`` to ``tensor`` unless ``pos`` is ``None``."""
        if pos is None:
            return tensor
        return tensor + pos

    def _feed_forward(self, x):
        """linear1 -> activation -> dropout -> linear2."""
        return self.linear2(self.dropout(self.activation(self.linear1(x))))

    def forward_post(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        """Post-norm variant: each LayerNorm follows its residual addition."""
        query = key = self.with_pos_embed(src, pos)
        attended = self.self_attn(
            query, key, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
        )[0]
        src = self.norm1(src + self.dropout1(attended))
        ffn_out = self._feed_forward(src)
        return self.norm2(src + self.dropout2(ffn_out))

    def forward_pre(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        """Pre-norm variant: each sub-block sees a normalised input; the residual stays raw."""
        normed = self.norm1(src)
        query = key = self.with_pos_embed(normed, pos)
        attended = self.self_attn(
            query, key, value=normed, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
        )[0]
        src = src + self.dropout1(attended)
        ffn_out = self._feed_forward(self.norm2(src))
        return src + self.dropout2(ffn_out)

    def forward(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        """Dispatch to the pre- or post-norm implementation."""
        run = self.forward_pre if self.normalize_before else self.forward_post
        return run(src, src_mask, src_key_padding_mask, pos)
class TransformerDecoderLayer(nn.Module):
    """Self-attention, cross-attention over ``memory`` and a feed-forward block."""

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add ``pos`` to ``tensor`` unless ``pos`` is ``None``."""
        if pos is None:
            return tensor
        return tensor + pos

    def _feed_forward(self, x):
        """linear1 -> activation -> dropout -> linear2."""
        return self.linear2(self.dropout(self.activation(self.linear1(x))))

    def forward_post(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        """Post-norm variant: each LayerNorm follows its residual addition."""
        # self-attention among the queries
        query = key = self.with_pos_embed(tgt, query_pos)
        self_attended = self.self_attn(
            query, key, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
        )[0]
        tgt = self.norm1(tgt + self.dropout1(self_attended))

        # cross-attention from the queries to the memory
        cross_attended = self.multihead_attn(
            query=self.with_pos_embed(tgt, query_pos),
            key=self.with_pos_embed(memory, pos),
            value=memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )[0]
        tgt = self.norm2(tgt + self.dropout2(cross_attended))

        # feed-forward
        ffn_out = self._feed_forward(tgt)
        return self.norm3(tgt + self.dropout3(ffn_out))

    def forward_pre(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        """Pre-norm variant: each sub-block sees a normalised input; the residual stays raw."""
        # self-attention among the queries
        normed = self.norm1(tgt)
        query = key = self.with_pos_embed(normed, query_pos)
        self_attended = self.self_attn(
            query, key, value=normed, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
        )[0]
        tgt = tgt + self.dropout1(self_attended)

        # cross-attention from the queries to the memory
        normed = self.norm2(tgt)
        cross_attended = self.multihead_attn(
            query=self.with_pos_embed(normed, query_pos),
            key=self.with_pos_embed(memory, pos),
            value=memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )[0]
        tgt = tgt + self.dropout2(cross_attended)

        # feed-forward
        ffn_out = self._feed_forward(self.norm3(tgt))
        return tgt + self.dropout3(ffn_out)

    def forward(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        """Dispatch to the pre- or post-norm implementation."""
        run = self.forward_pre if self.normalize_before else self.forward_post
        return run(
            tgt,
            memory,
            tgt_mask,
            memory_mask,
            tgt_key_padding_mask,
            memory_key_padding_mask,
            pos,
            query_pos,
        )
def _get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones
def _get_activation_fn(activation):
    """Return an activation function given a string

    Args:
        activation: one of ``"relu"``, ``"gelu"`` or ``"glu"``.

    Raises:
        RuntimeError: for any other value.
    """
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    # list every accepted name so the message matches the checks above
    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
================================================
FILE: mfvis_nococo/mask2former/test_time_augmentation.py
================================================
import copy
import logging
from itertools import count
import numpy as np
import torch
from fvcore.transforms import HFlipTransform
from torch import nn
from torch.nn.parallel import DistributedDataParallel
from detectron2.data.detection_utils import read_image
from detectron2.modeling import DatasetMapperTTA
__all__ = [
"SemanticSegmentorWithTTA",
]
class SemanticSegmentorWithTTA(nn.Module):
    """
    A SemanticSegmentor with test-time augmentation enabled.
    Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`.
    """

    def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
        """
        Args:
            cfg (CfgNode):
            model (SemanticSegmentor): a SemanticSegmentor to apply TTA on.
            tta_mapper (callable): takes a dataset dict and returns a list of
                augmented versions of the dataset dict. Defaults to
                `DatasetMapperTTA(cfg)`.
            batch_size (int): batch the augmented images into this batch size for inference.
        """
        super().__init__()
        if isinstance(model, DistributedDataParallel):
            # unwrap so the underlying model is called directly
            model = model.module
        self.cfg = cfg.clone()

        self.model = model

        if tta_mapper is None:
            tta_mapper = DatasetMapperTTA(cfg)
        self.tta_mapper = tta_mapper
        self.batch_size = batch_size

    def __call__(self, batched_inputs):
        """
        Same input/output format as :meth:`SemanticSegmentor.forward`
        """

        def _maybe_read_image(dataset_dict):
            # shallow copy so the caller's dict is not modified
            ret = copy.copy(dataset_dict)
            if "image" not in ret:
                image = read_image(ret.pop("file_name"), self.model.input_format)
                image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1)))  # CHW
                ret["image"] = image
            if "height" not in ret and "width" not in ret:
                # Take the size from ret["image"] so this also works when the
                # caller supplied the image tensor and nothing was read above.
                ret["height"] = ret["image"].shape[1]
                ret["width"] = ret["image"].shape[2]
            return ret

        return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs]

    def _inference_one_image(self, input):
        """
        Args:
            input (dict): one dataset dict with "image" field being a CHW tensor
        Returns:
            dict: one output dict
        Raises:
            ValueError: if the TTA mapper produced no augmented inputs.
        """
        augmented_inputs, tfms = self._get_augmented_inputs(input)

        final_predictions = None
        count_predictions = 0
        for augmented_input, tfm in zip(augmented_inputs, tfms):
            count_predictions += 1
            with torch.no_grad():
                prediction = self.model([augmented_input])[0].pop("sem_seg")
                if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
                    # undo the horizontal flip before accumulating
                    prediction = prediction.flip(dims=[2])
                if final_predictions is None:
                    final_predictions = prediction
                else:
                    final_predictions += prediction

        if final_predictions is None:
            raise ValueError("tta_mapper returned no augmented inputs to average over")

        # average over all augmented views
        final_predictions = final_predictions / count_predictions
        return {"sem_seg": final_predictions}

    def _get_augmented_inputs(self, input):
        """Run the TTA mapper and split each result from its ``"transforms"`` entry."""
        augmented_inputs = self.tta_mapper(input)
        tfms = [x.pop("transforms") for x in augmented_inputs]
        return augmented_inputs, tfms
================================================
FILE: mfvis_nococo/mask2former/utils/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mfvis_nococo/mask2former/utils/__init__.py.new
================================================
================================================
FILE: mfvis_nococo/mask2former/utils/misc.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py
"""
Misc functions, including distributed helpers.
Mostly copy-paste from torchvision references.
"""
from typing import List, Optional
import torch
import torch.distributed as dist
import torchvision
from torch import Tensor
def _max_by_axis(the_list):
    # type: (List[List[int]]) -> List[int]
    """Return the element-wise maximum over the sublists of ``the_list``.

    The result is a new list; the input sublists are left untouched.
    """
    # copy the first sublist so the running maximum does not overwrite it
    maxes = list(the_list[0])
    for sublist in the_list[1:]:
        for index, item in enumerate(sublist):
            maxes[index] = max(maxes[index], item)
    return maxes
class NestedTensor(object):
    """A batched tensor bundled with an optional mask."""

    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors
        self.mask = mask

    def to(self, device):
        # type: (Device) -> NestedTensor # noqa
        """Return a new ``NestedTensor`` whose members were moved with ``.to(device)``."""
        moved_tensors = self.tensors.to(device)
        moved_mask = None if self.mask is None else self.mask.to(device)
        return NestedTensor(moved_tensors, moved_mask)

    def decompose(self):
        """Return the ``(tensors, mask)`` pair."""
        return self.tensors, self.mask

    def __repr__(self):
        return str(self.tensors)
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
    """Zero-pad 3-D tensors to a common shape and batch them with a padding mask.

    The mask is ``True`` on padded positions and ``False`` where image data
    was copied in.

    Raises:
        ValueError: if the first tensor is not 3-dimensional.
    """
    # TODO make this more general
    first = tensor_list[0]
    if first.ndim != 3:
        raise ValueError("not supported")

    if torchvision._is_tracing():
        # nested_tensor_from_tensor_list() does not export well to ONNX
        # call _onnx_nested_tensor_from_tensor_list() instead
        return _onnx_nested_tensor_from_tensor_list(tensor_list)

    # TODO make it support different-sized images
    max_size = _max_by_axis([list(img.shape) for img in tensor_list])
    batch_shape = [len(tensor_list)] + max_size
    b, c, h, w = batch_shape

    tensor = torch.zeros(batch_shape, dtype=first.dtype, device=first.device)
    # start fully masked; the region covered by each image is cleared below
    mask = torch.ones((b, h, w), dtype=torch.bool, device=first.device)
    for img, padded, img_mask in zip(tensor_list, tensor, mask):
        channels, height, width = img.shape
        padded[:channels, :height, :width].copy_(img)
        img_mask[:height, :width] = False
    return NestedTensor(tensor, mask)
# _onnx_nested_tensor_from_tensor_list() is an implementation of
# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
@torch.jit.unused
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
    """Build the padded batch and mask using ``pad``/``stack`` instead of sliced writes.

    Args:
        tensor_list: tensors to batch; each is padded at the end of every
            dimension up to the per-dimension maximum over the list.

    Returns:
        NestedTensor: stacked padded tensors and a boolean mask that is
        ``True`` on padded positions.
    """
    # per-dimension maximum over all tensors, computed with tensor ops
    # NOTE(review): torch.stack over ``img.shape[i]`` presumably relies on
    # shapes being tensors while tracing - confirm
    max_size = []
    for i in range(tensor_list[0].dim()):
        max_size_i = torch.max(
            torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
        ).to(torch.int64)
        max_size.append(max_size_i)
    max_size = tuple(max_size)
    # work around for
    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
    # m[: img.shape[1], :img.shape[2]] = False
    # which is not yet supported in onnx
    padded_imgs = []
    padded_masks = []
    for img in tensor_list:
        # amount of padding needed per dimension to reach max_size
        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
        # F.pad takes (before, after) pairs starting from the last dimension
        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
        padded_imgs.append(padded_img)
        # zeros over the image area, padded with ones, then cast to bool
        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
        padded_masks.append(padded_mask.to(torch.bool))
    tensor = torch.stack(padded_imgs)
    mask = torch.stack(padded_masks)
    return NestedTensor(tensor, mask=mask)
def is_dist_avail_and_initialized():
    """Return ``True`` only when ``torch.distributed`` is available and initialized."""
    return dist.is_available() and dist.is_initialized()
================================================
FILE: mfvis_nococo/mask2former_video/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
from . import modeling
# config
from .config import add_maskformer2_video_config
# models
from .video_maskformer_model import VideoMaskFormer
# video
from .data_video import (
YTVISDatasetMapper,
YTVISEvaluator,
build_detection_train_loader,
build_detection_test_loader,
get_detection_dataset_dicts,
)
================================================
FILE: mfvis_nococo/mask2former_video/config.py
================================================
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
from detectron2.config import CfgNode as CN
def add_maskformer2_video_config(cfg):
    """Add the video data-loading defaults to ``cfg.INPUT`` in place."""
    # video data
    # DataLoader
    input_cfg = cfg.INPUT
    input_cfg.SAMPLING_FRAME_NUM = 5
    input_cfg.SAMPLING_FRAME_RANGE = 5
    input_cfg.SAMPLING_FRAME_SHUFFLE = True
    # "brightness", "contrast", "saturation", "rotation"
    input_cfg.AUGMENTATIONS = []
================================================
FILE: mfvis_nococo/mask2former_video/data_video/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper
from .build import *
from .datasets import *
from .ytvis_eval import YTVISEvaluator
================================================
FILE: mfvis_nococo/mask2former_video/data_video/augmentation.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import numpy as np
import logging
import sys
from fvcore.transforms.transform import (
HFlipTransform,
NoOpTransform,
VFlipTransform,
)
from PIL import Image
from detectron2.data import transforms as T
class ResizeShortestEdge(T.Augmentation):
    """
    Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge.
    If `max_size` is reached, then downscale so that the longer edge does not exceed max_size.
    """

    def __init__(
        self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1
    ):
        """
        Args:
            short_edge_length (list[int]): If ``sample_style`` contains ``"range"``,
                a [min, max] interval from which to sample the shortest edge length.
                Otherwise a list of shortest edge lengths to sample from.
                A single int is expanded to ``(value, value)``.
            max_size (int): maximum allowed longest edge length.
            sample_style (str): one of "range", "choice", "range_by_clip",
                "choice_by_clip".
            interp: interpolation mode passed on to the resize transform.
            clip_frame_cnt (int): a new size is sampled once every this many
                calls to :meth:`get_transform`.
        """
        super().__init__()
        assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style
        self.is_range = ("range" in sample_style)
        if isinstance(short_edge_length, int):
            short_edge_length = (short_edge_length, short_edge_length)
        if self.is_range:
            assert len(short_edge_length) == 2, (
                "short_edge_length must be two values using 'range' sample style."
                f" Got {short_edge_length}!"
            )
        self._cnt = 0
        # No extra locals are introduced above: everything in locals() is
        # handed to _init, which presumably stores the constructor arguments
        # as attributes (get_transform reads them from self).
        self._init(locals())

    def get_transform(self, image):
        """Return the resize transform for ``image``, sampling a new size at clip boundaries."""
        if self._cnt % self.clip_frame_cnt == 0:
            # first frame of a clip: draw the target short-edge length
            candidates = self.short_edge_length
            if self.is_range:
                self.size = np.random.randint(candidates[0], candidates[1] + 1)
            else:
                self.size = np.random.choice(candidates)
            if self.size == 0:
                return NoOpTransform()

            self._cnt = 0  # avoiding overflow
        self._cnt += 1

        h, w = image.shape[:2]
        target = self.size

        # scale so that the shorter side becomes exactly ``target``
        scale = target * 1.0 / min(h, w)
        if h < w:
            newh, neww = target, scale * w
        else:
            newh, neww = scale * h, target

        # shrink further if the longer side would exceed max_size
        longest = max(newh, neww)
        if longest > self.max_size:
            shrink = self.max_size * 1.0 / longest
            newh = newh * shrink
            neww = neww * shrink

        # round to the nearest integer pixel size
        neww = int(neww + 0.5)
        newh = int(newh + 0.5)
        return T.ResizeTransform(h, w, newh, neww, self.interp)
class RandomFlip(T.Augmentation):
    """
    Flip the image horizontally or vertically with the given probability.
    """

    def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1):
        """
        Args:
            prob (float): probability of flip.
            horizontal (boolean): whether to apply horizontal flipping
            vertical (boolean): whether to apply vertical flipping
            clip_frame_cnt (int): the flip decision is re-drawn once every this
                many calls to :meth:`get_transform`.
        """
        super().__init__()

        if horizontal and vertical:
            raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.")
        if not horizontal and not vertical:
            raise ValueError("At least one of horiz or vert has to be True!")
        self._cnt = 0

        # No extra locals are introduced above: everything in locals() is
        # handed to _init, which presumably stores the constructor arguments
        # as attributes (get_transform reads them from self).
        self._init(locals())

    def get_transform(self, image):
        """Return the flip (or no-op) transform for ``image``."""
        if self._cnt % self.clip_frame_cnt == 0:
            # first frame of a clip: decide once and reuse for the whole clip
            self.do = self._rand_range() < self.prob
            self._cnt = 0  # avoiding overflow
        self._cnt += 1

        h, w = image.shape[:2]

        if not self.do:
            return NoOpTransform()
        if self.horizontal:
            return HFlipTransform(w)
        if self.vertical:
            return VFlipTransform(h)
def build_augmentation(cfg, is_train):
    """
    Build the list of augmentations described by `cfg.INPUT`.

    Args:
        cfg: config node; only the ``cfg.INPUT.*`` keys read below are used.
        is_train (bool): when True build the training pipeline (optional crop,
            resize, optional flip, optional photometric / rotation
            augmentations); otherwise build the test-time resize only.

    Returns:
        list: augmentation objects, in the order they were appended.
    """
    aug_list = []
    if is_train:
        # Crop
        if cfg.INPUT.CROP.ENABLED:
            aug_list.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE))

        # Resize
        min_size = cfg.INPUT.MIN_SIZE_TRAIN
        max_size = cfg.INPUT.MAX_SIZE_TRAIN
        sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
        # "*_by_clip" styles keep one sampled size for SAMPLING_FRAME_NUM consecutive frames.
        ms_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1
        aug_list.append(ResizeShortestEdge(min_size, max_size, sample_style, clip_frame_cnt=ms_clip_frame_cnt))

        # Flip
        if cfg.INPUT.RANDOM_FLIP != "none":
            if cfg.INPUT.RANDOM_FLIP == "flip_by_clip":
                flip_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM
            else:
                flip_clip_frame_cnt = 1

            aug_list.append(
                # NOTE using RandomFlip modified for the support of flip maintenance
                RandomFlip(
                    horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"),
                    vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
                    clip_frame_cnt=flip_clip_frame_cnt,
                )
            )

        # Additional augmentations : brightness, contrast, saturation, rotation
        augmentations = cfg.INPUT.AUGMENTATIONS
        if "brightness" in augmentations:
            aug_list.append(T.RandomBrightness(0.9, 1.1))
        if "contrast" in augmentations:
            aug_list.append(T.RandomContrast(0.9, 1.1))
        if "saturation" in augmentations:
            aug_list.append(T.RandomSaturation(0.9, 1.1))
        if "rotation" in augmentations:
            aug_list.append(
                T.RandomRotation(
                    [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range"
                )
            )
    else:
        # Resize
        min_size = cfg.INPUT.MIN_SIZE_TEST
        max_size = cfg.INPUT.MAX_SIZE_TEST
        sample_style = "choice"
        aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style))

    return aug_list
================================================
FILE: mfvis_nococo/mask2former_video/data_video/build.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import itertools
import logging
import torch.utils.data
from detectron2.config import CfgNode, configurable
from detectron2.data.build import (
build_batch_data_loader,
load_proposals_into_dataset,
trivial_batch_collator,
)
from detectron2.data.catalog import DatasetCatalog
from detectron2.data.common import DatasetFromList, MapDataset
from detectron2.data.dataset_mapper import DatasetMapper
from detectron2.data.samplers import InferenceSampler, TrainingSampler
from detectron2.utils.comm import get_world_size
def _compute_num_images_per_worker(cfg: CfgNode):
    """
    Split ``cfg.SOLVER.IMS_PER_BATCH`` evenly over the distributed workers.

    Returns:
        int: number of images each worker handles per batch.
    """
    world_size = get_world_size()
    batch_size = cfg.SOLVER.IMS_PER_BATCH

    divisible_msg = "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({})."
    assert batch_size % world_size == 0, divisible_msg.format(batch_size, world_size)

    large_enough_msg = "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({})."
    assert batch_size >= world_size, large_enough_msg.format(batch_size, world_size)

    return batch_size // world_size
def filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names):
    """
    Drop entries that have no annotation at all or only crowd annotations.

    Each element of an entry's "annotations" may be a single annotation dict or
    a list of annotation dicts; an entry is kept as soon as one annotation with
    ``iscrowd == 0`` (the default when the key is missing) is found.

    Args:
        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
        dataset_names: not used by this function.

    Returns:
        list[dict]: the same format, but filtered.
    """

    def _has_non_crowd(anns):
        for entry in anns:
            group = entry if isinstance(entry, list) else [entry]
            if any(obj.get("iscrowd", 0) == 0 for obj in group):
                return True
        return False

    kept = [record for record in dataset_dicts if _has_non_crowd(record["annotations"])]
    removed = len(dataset_dicts) - len(kept)

    logging.getLogger(__name__).info(
        "Removed {} images with no usable annotations. {} images left.".format(removed, len(kept))
    )
    return kept
def get_detection_dataset_dicts(
    dataset_names, filter_empty=True, proposal_files=None
):
    """
    Load the named datasets from the catalog and merge them into one list.

    Args:
        dataset_names (str or list[str]): a dataset name or a list of dataset names
        filter_empty (bool): whether to filter out images without instance annotations
        proposal_files (list[str]): if given, a list of object proposal files
            that match each dataset in `dataset_names`.

    Returns:
        list[dict]: a list of dicts following the standard dataset dict format.
    """
    names = [dataset_names] if isinstance(dataset_names, str) else dataset_names
    assert len(names)

    per_dataset = list(map(DatasetCatalog.get, names))
    for name, dicts in zip(names, per_dataset):
        assert len(dicts), "Dataset '{}' is empty!".format(name)

    if proposal_files is not None:
        assert len(names) == len(proposal_files)
        # Attach the precomputed proposals of each file to its dataset.
        per_dataset = [
            load_proposals_into_dataset(dicts, proposal_file)
            for dicts, proposal_file in zip(per_dataset, proposal_files)
        ]

    merged = [record for dicts in per_dataset for record in dicts]

    # Only the first record is inspected to decide whether annotations exist.
    if filter_empty and "annotations" in merged[0]:
        merged = filter_images_with_only_crowd_annotations(merged, names)

    assert len(merged), "No valid data found in {}.".format(",".join(names))
    return merged
def _train_loader_from_config(cfg, mapper, *, dataset=None, sampler=None):
    """
    Translate `cfg` into the keyword arguments of `build_detection_train_loader`.

    Missing `dataset`, `mapper` and `sampler` are filled with defaults built
    from `cfg`; the sampler default is always a `TrainingSampler`.
    """
    if dataset is None:
        proposals = cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None
        dataset = get_detection_dataset_dicts(
            cfg.DATASETS.TRAIN,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            proposal_files=proposals,
        )

    if mapper is None:
        mapper = DatasetMapper(cfg, True)

    if sampler is None:
        # The configured name is only logged; it does not select the sampler class.
        configured_name = cfg.DATALOADER.SAMPLER_TRAIN
        logging.getLogger(__name__).info("Using training sampler {}".format(configured_name))
        sampler = TrainingSampler(len(dataset))

    loader_args = {"dataset": dataset, "sampler": sampler, "mapper": mapper}
    loader_args["total_batch_size"] = cfg.SOLVER.IMS_PER_BATCH
    loader_args["aspect_ratio_grouping"] = cfg.DATALOADER.ASPECT_RATIO_GROUPING
    loader_args["num_workers"] = cfg.DATALOADER.NUM_WORKERS
    return loader_args
# TODO can allow dataset as an iterable or IterableDataset to make this function more general
@configurable(from_config=_train_loader_from_config)
def build_detection_train_loader(
    dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0
):
    """
    Build the training dataloader.

    Args:
        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
            or a map-style pytorch dataset. A plain list is wrapped in
            ``DatasetFromList`` without copying.
        mapper (callable): takes a sample (dict) from dataset and returns the
            format to be consumed by the model. ``None`` leaves samples unmapped.
        sampler (torch.utils.data.sampler.Sampler or None): produces indices
            into ``dataset``. Defaults to a :class:`TrainingSampler` over the
            whole dataset.
        total_batch_size (int): total batch size across all workers.
        aspect_ratio_grouping (bool): forwarded to ``build_batch_data_loader``.
        num_workers (int): number of parallel data loading workers

    Returns:
        The dataloader produced by ``build_batch_data_loader``.
    """
    wrapped = DatasetFromList(dataset, copy=False) if isinstance(dataset, list) else dataset
    if mapper is not None:
        wrapped = MapDataset(wrapped, mapper)

    index_sampler = TrainingSampler(len(wrapped)) if sampler is None else sampler
    assert isinstance(index_sampler, torch.utils.data.sampler.Sampler)

    loader_options = {
        "aspect_ratio_grouping": aspect_ratio_grouping,
        "num_workers": num_workers,
    }
    return build_batch_data_loader(wrapped, index_sampler, total_batch_size, **loader_options)
def _test_loader_from_config(cfg, dataset_name, mapper=None):
    """
    Translate `cfg` into the keyword arguments of `build_detection_test_loader`.

    The dataset is taken from the `dataset_name` argument rather than from the
    names listed in cfg, so every test set is loaded on its own.
    """
    if cfg.MODEL.LOAD_PROPOSALS:
        # Pick the proposal file at the same position as the dataset in DATASETS.TEST.
        position = list(cfg.DATASETS.TEST).index(dataset_name)
        proposal_files = [cfg.DATASETS.PROPOSAL_FILES_TEST[position]]
    else:
        proposal_files = None

    dataset = get_detection_dataset_dicts(
        [dataset_name],
        filter_empty=False,
        proposal_files=proposal_files,
    )
    if mapper is None:
        mapper = DatasetMapper(cfg, False)

    loader_args = dict(dataset=dataset, mapper=mapper)
    loader_args["num_workers"] = cfg.DATALOADER.NUM_WORKERS
    return loader_args
@configurable(from_config=_test_loader_from_config)
def build_detection_test_loader(dataset, *, mapper, num_workers=0):
    """
    Build the inference dataloader; every batch holds exactly one element.

    Args:
        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
            or a map-style pytorch dataset. A plain list is wrapped in
            ``DatasetFromList`` without copying.
        mapper (callable): takes a sample (dict) from dataset and returns the
            format to be consumed by the model. ``None`` leaves samples unmapped.
        num_workers (int): number of parallel data loading workers

    Returns:
        DataLoader: a torch DataLoader over the dataset using an
        ``InferenceSampler`` and ``trivial_batch_collator``.

    Examples:
    ::
        data_loader = build_detection_test_loader(
            DatasetRegistry.get("my_test"),
            mapper=DatasetMapper(...))

        # or, instantiate with a CfgNode:
        data_loader = build_detection_test_loader(cfg, "my_test")
    """
    wrapped = DatasetFromList(dataset, copy=False) if isinstance(dataset, list) else dataset
    if mapper is not None:
        wrapped = MapDataset(wrapped, mapper)

    # One image per batch: the usual setting when inference time is reported.
    single_item_batches = torch.utils.data.sampler.BatchSampler(
        InferenceSampler(len(wrapped)), 1, drop_last=False
    )
    return torch.utils.data.DataLoader(
        wrapped,
        num_workers=num_workers,
        batch_sampler=single_item_batches,
        collate_fn=trivial_batch_collator,
    )
================================================
FILE: mfvis_nococo/mask2former_video/data_video/dataset_mapper.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import copy
import logging
import random
import numpy as np
from typing import List, Union
import torch
from detectron2.config import configurable
from detectron2.structures import (
BitMasks,
Boxes,
BoxMode,
Instances,
)
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from .augmentation import build_augmentation
import os
__all__ = ["YTVISDatasetMapper", "CocoClipDatasetMapper"]
def seed_everything(seed):
    """
    Seed the Python, NumPy and PyTorch random number generators with `seed`
    and export it as ``PYTHONHASHSEED``.

    CUDA-specific seeding and cuDNN flags are deliberately left untouched.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
def filter_empty_instances(instances, by_box=True, by_mask=True, box_threshold=1e-5):
    """
    Mark empty instances of an `Instances` object as invalid.

    No instance is removed: entries whose box and/or mask is empty get their
    ``gt_ids`` value overwritten with -1, in place.

    Args:
        instances (Instances):
        by_box (bool): whether to treat instances with empty boxes as empty
        by_mask (bool): whether to treat instances with empty masks as empty
            (only applied when `instances` has "gt_masks")
        box_threshold (float): minimum width and height to be considered non-empty

    Returns:
        Instances: the same object that was passed in.
    """
    assert by_box or by_mask

    keep = None
    if by_box:
        keep = instances.gt_boxes.nonempty(threshold=box_threshold)
    if instances.has("gt_masks") and by_mask:
        mask_keep = instances.gt_masks.nonempty()
        keep = mask_keep if keep is None else keep & mask_keep

    # No criterion was applicable: leave the ids untouched.
    if keep is None:
        return instances

    instances.gt_ids[~keep] = -1
    return instances
def _get_dummy_anno(num_classes):
    """
    Build a placeholder annotation: id -1, an all-zero box, a degenerate
    all-zero polygon and ``category_id`` set to `num_classes`.
    """
    placeholder = {"iscrowd": 0, "category_id": num_classes, "id": -1}
    placeholder["bbox"] = np.array([0, 0, 0, 0])
    placeholder["bbox_mode"] = BoxMode.XYXY_ABS
    placeholder["segmentation"] = [np.array([0.0] * 6)]
    return placeholder
def ytvis_annotations_to_instances(annos, image_size):
    """
    Pack the instance annotations of one image into an :class:`Instances` object.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes", "gt_ids" and, when
            the first annotation carries a "segmentation", "gt_masks".
    """
    xyxy_boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]

    target = Instances(image_size)
    target.gt_boxes = Boxes(xyxy_boxes)
    target.gt_classes = torch.tensor([int(obj["category_id"]) for obj in annos], dtype=torch.int64)
    target.gt_ids = torch.tensor([int(obj["id"]) for obj in annos], dtype=torch.int64)

    if not (len(annos) and "segmentation" in annos[0]):
        return target

    segms = [obj["segmentation"] for obj in annos]
    for segm in segms:
        # Every segmentation must already be a 2-D mask array.
        assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
            segm.ndim
        )

    # torch.from_numpy does not support array with negative stride.
    mask_tensors = [torch.from_numpy(np.ascontiguousarray(segm)) for segm in segms]
    target.gt_masks = BitMasks(torch.stack(mask_tensors))
    return target
class YTVISDatasetMapper:
    """
    A callable which takes a dataset dict in YouTube-VIS Dataset format,
    and map it into a format used by the model.

    In training a short clip is sampled around a random reference frame; in
    inference every frame of the video is returned.
    """

    @configurable
    def __init__(
        self,
        is_train: bool,
        *,
        augmentations: List[Union[T.Augmentation, T.Transform]],
        image_format: str,
        use_instance_mask: bool = False,
        sampling_frame_num: int = 2,
        sampling_frame_range: int = 5,
        sampling_frame_shuffle: bool = False,
        num_classes: int = 40,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: whether it's used in training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            use_instance_mask: whether to process instance segmentation annotations, if available
            sampling_frame_num: number of frames in one training clip
                (the reference frame plus ``sampling_frame_num - 1`` others)
            sampling_frame_range: maximum distance, in frames, between the
                reference frame and the other sampled frames
            sampling_frame_shuffle: whether to shuffle the sampled frame order
            num_classes: passed to ``_get_dummy_anno`` for instances that are
                missing in a frame
        """
        # fmt: off
        self.is_train               = is_train
        self.augmentations          = T.AugmentationList(augmentations)
        self.image_format           = image_format
        self.use_instance_mask      = use_instance_mask
        self.sampling_frame_num     = sampling_frame_num
        self.sampling_frame_range   = sampling_frame_range
        self.sampling_frame_shuffle = sampling_frame_shuffle
        self.num_classes            = num_classes
        # fmt: on
        logger = logging.getLogger(__name__)
        mode = "training" if is_train else "inference"
        logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}")
        # Re-seeds the global Python / NumPy / PyTorch RNGs with a fixed value
        # every time a mapper is constructed.
        seed_everything(29118357)

    @classmethod
    def from_config(cls, cfg, is_train: bool = True):
        """Translate `cfg` into the keyword arguments of ``__init__``."""
        augs = build_augmentation(cfg, is_train)

        sampling_frame_num = cfg.INPUT.SAMPLING_FRAME_NUM
        sampling_frame_range = cfg.INPUT.SAMPLING_FRAME_RANGE
        sampling_frame_shuffle = cfg.INPUT.SAMPLING_FRAME_SHUFFLE

        ret = {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "use_instance_mask": cfg.MODEL.MASK_ON,
            "sampling_frame_num": sampling_frame_num,
            "sampling_frame_range": sampling_frame_range,
            "sampling_frame_shuffle": sampling_frame_shuffle,
            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
        }

        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one video, in YTVIS Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept. It holds
            the per-frame lists "image", "instances" and "file_names".
        """
        # TODO consider examining below deepcopy as it costs huge amount of computations.
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below

        video_length = dataset_dict["length"]
        if self.is_train:
            ref_frame = random.randrange(video_length)

            start_idx = max(0, ref_frame - self.sampling_frame_range)
            end_idx = min(video_length, ref_frame + self.sampling_frame_range + 1)

            # Neighbouring frames of the reference frame, sampled with replacement.
            candidates = list(range(start_idx, ref_frame)) + list(range(ref_frame + 1, end_idx))
            num_extra = self.sampling_frame_num - 1
            if not candidates and num_extra > 0:
                # Single-frame video (or a zero sampling range): np.random.choice
                # raises on an empty population, so repeat the reference frame.
                extra_idx = [ref_frame] * num_extra
            else:
                extra_idx = np.random.choice(np.array(candidates), num_extra).tolist()
            selected_idx = sorted(extra_idx + [ref_frame])
            if self.sampling_frame_shuffle:
                random.shuffle(selected_idx)
        else:
            selected_idx = range(video_length)

        video_annos = dataset_dict.pop("annotations", None)
        file_names = dataset_dict.pop("file_names", None)

        if self.is_train:
            # Collect every instance id that appears in the sampled frames and
            # give each one a fixed slot, so all frames list instances in the
            # same order.
            _ids = set()
            for frame_idx in selected_idx:
                _ids.update([anno["id"] for anno in video_annos[frame_idx]])
            ids = {_id: i for i, _id in enumerate(_ids)}

        dataset_dict["image"] = []
        dataset_dict["instances"] = []
        dataset_dict["file_names"] = []
        for frame_idx in selected_idx:
            dataset_dict["file_names"].append(file_names[frame_idx])

            # Read image
            image = utils.read_image(file_names[frame_idx], format=self.image_format)
            utils.check_image_size(dataset_dict, image)

            aug_input = T.AugInput(image)
            transforms = self.augmentations(aug_input)
            image = aug_input.image

            image_shape = image.shape[:2]  # h, w
            # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
            # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
            # Therefore it's important to use torch.Tensor.
            dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))))

            if (video_annos is None) or (not self.is_train):
                continue

            # NOTE copy() is to prevent annotations getting changed from applying augmentations
            _frame_annos = []
            for anno in video_annos[frame_idx]:
                _anno = {}
                for k, v in anno.items():
                    _anno[k] = copy.deepcopy(v)
                _frame_annos.append(_anno)

            # USER: Implement additional transformations if you have other types of data
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in _frame_annos
                if obj.get("iscrowd", 0) == 0
            ]
            # Start from placeholders and overwrite the slots of the instances
            # that are actually annotated in this frame.
            sorted_annos = [_get_dummy_anno(self.num_classes) for _ in range(len(ids))]

            for _anno in annos:
                idx = ids[_anno["id"]]
                sorted_annos[idx] = _anno
            _gt_ids = [_anno["id"] for _anno in sorted_annos]

            instances = utils.annotations_to_instances(sorted_annos, image_shape, mask_format="bitmask")
            instances.gt_ids = torch.tensor(_gt_ids)
            if instances.has("gt_masks"):
                # Boxes are recomputed from the masks before empty instances are flagged.
                instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
                instances = filter_empty_instances(instances)
            else:
                instances.gt_masks = BitMasks(torch.empty((0, *image_shape)))
            dataset_dict["instances"].append(instances)

        return dataset_dict
class CocoClipDatasetMapper:
    """
    A callable that turns one COCO image into a pseudo clip: the same image is
    augmented `sampling_frame_num` times and each result is used as one frame.
    """

    @configurable
    def __init__(
        self,
        is_train: bool,
        *,
        augmentations: List[Union[T.Augmentation, T.Transform]],
        image_format: str,
        use_instance_mask: bool = False,
        sampling_frame_num: int = 2,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: whether it's used in training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            use_instance_mask: whether to process instance segmentation annotations, if available
            sampling_frame_num: number of frames generated from each image
        """
        self.is_train = is_train
        self.augmentations = T.AugmentationList(augmentations)
        self.image_format = image_format
        self.use_instance_mask = use_instance_mask
        self.sampling_frame_num = sampling_frame_num

        mode = "training" if is_train else "inference"
        logging.getLogger(__name__).info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}")

    @classmethod
    def from_config(cls, cfg, is_train: bool = True):
        """Translate `cfg` into the keyword arguments of ``__init__``."""
        augs = build_augmentation(cfg, is_train)
        return {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "use_instance_mask": cfg.MODEL.MASK_ON,
            "sampling_frame_num": cfg.INPUT.SAMPLING_FRAME_NUM,
        }

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept. It holds
            the per-frame lists "image", "instances" and "file_names".
        """
        record = copy.deepcopy(dataset_dict)  # the copy is modified below

        img_annos = record.pop("annotations", None)
        file_name = record.pop("file_name", None)
        original_image = utils.read_image(file_name, format=self.image_format)

        record["image"] = []
        record["instances"] = []
        record["file_names"] = [file_name] * self.sampling_frame_num

        for _ in range(self.sampling_frame_num):
            utils.check_image_size(record, original_image)

            aug_input = T.AugInput(original_image)
            transforms = self.augmentations(aug_input)
            image = aug_input.image
            image_shape = image.shape[:2]  # h, w

            # Frames are stored as contiguous channel-first torch tensors, which
            # the dataloader passes between processes efficiently.
            chw_image = np.ascontiguousarray(image.transpose(2, 0, 1))
            record["image"].append(torch.as_tensor(chw_image))

            if (img_annos is None) or (not self.is_train):
                continue

            # Work on per-frame copies so the augmentations never touch the
            # annotations shared by all frames.
            frame_annos = [
                {key: copy.deepcopy(value) for key, value in anno.items()}
                for anno in img_annos
            ]

            # USER: Implement additional transformations if you have other types of data
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in frame_annos
                if obj.get("iscrowd", 0) == 0
            ]
            _gt_ids = list(range(len(annos)))

            # An empty segmentation is replaced by a degenerate all-zero polygon.
            for anno in annos:
                if len(anno["segmentation"]) == 0:
                    anno["segmentation"] = [np.array([0.0] * 6)]

            instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask")
            instances.gt_ids = torch.tensor(_gt_ids)
            if instances.has("gt_masks"):
                instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
                instances = filter_empty_instances(instances)
            else:
                instances.gt_masks = BitMasks(torch.empty((0, *image_shape)))
            record["instances"].append(instances)

        return record
================================================
FILE: mfvis_nococo/mask2former_video/data_video/datasets/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
# Imported for its side effect only: the module registers the datasets when loaded.
from . import builtin  # ensure the builtin datasets are registered

# Export every module-level name that neither contains "builtin" nor starts with "_".
__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]
================================================
FILE: mfvis_nococo/mask2former_video/data_video/datasets/builtin.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import os
from .ytvis import (
register_ytvis_instances,
_get_ytvis_2019_instances_meta,
_get_ytvis_2021_instances_meta,
)
# ==== Predefined splits for YTVIS 2019 ===========
# Maps a dataset name to (image directory, annotation json); both paths are
# joined with the dataset root by `register_all_ytvis_2019`.
_PREDEFINED_SPLITS_YTVIS_2019 = {
    "ytvis_2019_train": ("ytvis_2019/train/JPEGImages",
                         "ytvis_2019/train.json"),
    "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages",
                       "ytvis_2019/valid.json"),
    "ytvis_2019_test": ("ytvis_2019/test/JPEGImages",
                        "ytvis_2019/test.json"),
}


# ==== Predefined splits for YTVIS 2021 ===========
# Same layout as the 2019 table; consumed by `register_all_ytvis_2021`.
_PREDEFINED_SPLITS_YTVIS_2021 = {
    "ytvis_2021_train": ("ytvis_2021/train/JPEGImages",
                         "ytvis_2021/train.json"),
    "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages",
                       "ytvis_2021/valid.json"),
    "ytvis_2021_test": ("ytvis_2021/test/JPEGImages",
                        "ytvis_2021/test.json"),
}
def register_all_ytvis_2019(root):
    """
    Register every predefined YTVIS 2019 split, resolving paths against `root`.

    An annotation path containing "://" is used as given instead of being
    joined with `root`.
    """
    for split_name, paths in _PREDEFINED_SPLITS_YTVIS_2019.items():
        image_dir, annotation_file = paths
        if "://" in annotation_file:
            json_path = annotation_file
        else:
            json_path = os.path.join(root, annotation_file)
        register_ytvis_instances(
            split_name,
            _get_ytvis_2019_instances_meta(),
            json_path,
            os.path.join(root, image_dir),
        )
def register_all_ytvis_2021(root):
    """
    Register every predefined YTVIS 2021 split, resolving paths against `root`.

    An annotation path containing "://" is used as given instead of being
    joined with `root`.
    """
    for split_name, paths in _PREDEFINED_SPLITS_YTVIS_2021.items():
        image_dir, annotation_file = paths
        if "://" in annotation_file:
            json_path = annotation_file
        else:
            json_path = os.path.join(root, annotation_file)
        register_ytvis_instances(
            split_name,
            _get_ytvis_2021_instances_meta(),
            json_path,
            os.path.join(root, image_dir),
        )
# Register the predefined datasets at import time, but only when this file is
# loaded as a module whose dotted name ends in ".builtin" (not when run as a script).
if __name__.endswith(".builtin"):
    # Assume pre-defined datasets live in `./datasets`.
    # The root can be overridden through the DETECTRON2_DATASETS environment variable.
    _root = os.getenv("DETECTRON2_DATASETS", "datasets")
    register_all_ytvis_2019(_root)
    register_all_ytvis_2021(_root)
================================================
FILE: mfvis_nococo/mask2former_video/data_video/datasets/ytvis.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import contextlib
import io
import json
import logging
import numpy as np
import os
import pycocotools.mask as mask_util
from fvcore.common.file_io import PathManager
from fvcore.common.timer import Timer
from detectron2.structures import Boxes, BoxMode, PolygonMasks
from detectron2.data import DatasetCatalog, MetadataCatalog
"""
This file contains functions to parse YTVIS dataset of
COCO-format annotations into dicts in "Detectron2 format".
"""
# Module-level logger shared by the loading helpers in this file.
logger = logging.getLogger(__name__)

# Public API of this module.
__all__ = ["load_ytvis_json", "register_ytvis_instances"]
# Category table for YouTube-VIS 2019: 40 entries with ids 1..40, each carrying
# a display color, an "isthing" flag and a name.
YTVIS_CATEGORIES_2019 = [
    {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
    {"color": [0, 82, 0], "isthing": 1, "id": 2, "name": "giant_panda"},
    {"color": [119, 11, 32], "isthing": 1, "id": 3, "name": "lizard"},
    {"color": [165, 42, 42], "isthing": 1, "id": 4, "name": "parrot"},
    {"color": [134, 134, 103], "isthing": 1, "id": 5, "name": "skateboard"},
    {"color": [0, 0, 142], "isthing": 1, "id": 6, "name": "sedan"},
    {"color": [255, 109, 65], "isthing": 1, "id": 7, "name": "ape"},
    {"color": [0, 226, 252], "isthing": 1, "id": 8, "name": "dog"},
    {"color": [5, 121, 0], "isthing": 1, "id": 9, "name": "snake"},
    {"color": [0, 60, 100], "isthing": 1, "id": 10, "name": "monkey"},
    {"color": [250, 170, 30], "isthing": 1, "id": 11, "name": "hand"},
    {"color": [100, 170, 30], "isthing": 1, "id": 12, "name": "rabbit"},
    {"color": [179, 0, 194], "isthing": 1, "id": 13, "name": "duck"},
    {"color": [255, 77, 255], "isthing": 1, "id": 14, "name": "cat"},
    {"color": [120, 166, 157], "isthing": 1, "id": 15, "name": "cow"},
    {"color": [73, 77, 174], "isthing": 1, "id": 16, "name": "fish"},
    {"color": [0, 80, 100], "isthing": 1, "id": 17, "name": "train"},
    {"color": [182, 182, 255], "isthing": 1, "id": 18, "name": "horse"},
    {"color": [0, 143, 149], "isthing": 1, "id": 19, "name": "turtle"},
    {"color": [174, 57, 255], "isthing": 1, "id": 20, "name": "bear"},
    {"color": [0, 0, 230], "isthing": 1, "id": 21, "name": "motorbike"},
    {"color": [72, 0, 118], "isthing": 1, "id": 22, "name": "giraffe"},
    {"color": [255, 179, 240], "isthing": 1, "id": 23, "name": "leopard"},
    {"color": [0, 125, 92], "isthing": 1, "id": 24, "name": "fox"},
    {"color": [209, 0, 151], "isthing": 1, "id": 25, "name": "deer"},
    {"color": [188, 208, 182], "isthing": 1, "id": 26, "name": "owl"},
    {"color": [145, 148, 174], "isthing": 1, "id": 27, "name": "surfboard"},
    {"color": [106, 0, 228], "isthing": 1, "id": 28, "name": "airplane"},
    {"color": [0, 0, 70], "isthing": 1, "id": 29, "name": "truck"},
    {"color": [199, 100, 0], "isthing": 1, "id": 30, "name": "zebra"},
    {"color": [166, 196, 102], "isthing": 1, "id": 31, "name": "tiger"},
    {"color": [110, 76, 0], "isthing": 1, "id": 32, "name": "elephant"},
    {"color": [133, 129, 255], "isthing": 1, "id": 33, "name": "snowboard"},
    {"color": [0, 0, 192], "isthing": 1, "id": 34, "name": "boat"},
    {"color": [183, 130, 88], "isthing": 1, "id": 35, "name": "shark"},
    {"color": [130, 114, 135], "isthing": 1, "id": 36, "name": "mouse"},
    {"color": [107, 142, 35], "isthing": 1, "id": 37, "name": "frog"},
    {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "eagle"},
    {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "earless_seal"},
    {"color": [255, 208, 186], "isthing": 1, "id": 40, "name": "tennis_racket"},
]
# Category table for YouTube-VIS 2021: 40 entries with ids 1..40, each carrying
# a display color, an "isthing" flag and a name. The id-to-name assignment
# differs from the 2019 table.
YTVIS_CATEGORIES_2021 = [
    {"color": [106, 0, 228], "isthing": 1, "id": 1, "name": "airplane"},
    {"color": [174, 57, 255], "isthing": 1, "id": 2, "name": "bear"},
    {"color": [255, 109, 65], "isthing": 1, "id": 3, "name": "bird"},
    {"color": [0, 0, 192], "isthing": 1, "id": 4, "name": "boat"},
    {"color": [0, 0, 142], "isthing": 1, "id": 5, "name": "car"},
    {"color": [255, 77, 255], "isthing": 1, "id": 6, "name": "cat"},
    {"color": [120, 166, 157], "isthing": 1, "id": 7, "name": "cow"},
    {"color": [209, 0, 151], "isthing": 1, "id": 8, "name": "deer"},
    {"color": [0, 226, 252], "isthing": 1, "id": 9, "name": "dog"},
    {"color": [179, 0, 194], "isthing": 1, "id": 10, "name": "duck"},
    {"color": [174, 255, 243], "isthing": 1, "id": 11, "name": "earless_seal"},
    {"color": [110, 76, 0], "isthing": 1, "id": 12, "name": "elephant"},
    {"color": [73, 77, 174], "isthing": 1, "id": 13, "name": "fish"},
    {"color": [250, 170, 30], "isthing": 1, "id": 14, "name": "flying_disc"},
    {"color": [0, 125, 92], "isthing": 1, "id": 15, "name": "fox"},
    {"color": [107, 142, 35], "isthing": 1, "id": 16, "name": "frog"},
    {"color": [0, 82, 0], "isthing": 1, "id": 17, "name": "giant_panda"},
    {"color": [72, 0, 118], "isthing": 1, "id": 18, "name": "giraffe"},
    {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
    {"color": [255, 179, 240], "isthing": 1, "id": 20, "name": "leopard"},
    {"color": [119, 11, 32], "isthing": 1, "id": 21, "name": "lizard"},
    {"color": [0, 60, 100], "isthing": 1, "id": 22, "name": "monkey"},
    {"color": [0, 0, 230], "isthing": 1, "id": 23, "name": "motorbike"},
    {"color": [130, 114, 135], "isthing": 1, "id": 24, "name": "mouse"},
    {"color": [165, 42, 42], "isthing": 1, "id": 25, "name": "parrot"},
    {"color": [220, 20, 60], "isthing": 1, "id": 26, "name": "person"},
    {"color": [100, 170, 30], "isthing": 1, "id": 27, "name": "rabbit"},
    {"color": [183, 130, 88], "isthing": 1, "id": 28, "name": "shark"},
    {"color": [134, 134, 103], "isthing": 1, "id": 29, "name": "skateboard"},
    {"color": [5, 121, 0], "isthing": 1, "id": 30, "name": "snake"},
    {"color": [133, 129, 255], "isthing": 1, "id": 31, "name": "snowboard"},
    {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "squirrel"},
    {"color": [145, 148, 174], "isthing": 1, "id": 33, "name": "surfboard"},
    {"color": [255, 208, 186], "isthing": 1, "id": 34, "name": "tennis_racket"},
    {"color": [166, 196, 102], "isthing": 1, "id": 35, "name": "tiger"},
    {"color": [0, 80, 100], "isthing": 1, "id": 36, "name": "train"},
    {"color": [0, 0, 70], "isthing": 1, "id": 37, "name": "truck"},
    {"color": [0, 143, 149], "isthing": 1, "id": 38, "name": "turtle"},
    {"color": [0, 228, 0], "isthing": 1, "id": 39, "name": "whale"},
    {"color": [199, 100, 0], "isthing": 1, "id": 40, "name": "zebra"},
]
def _get_ytvis_2019_instances_meta():
    """
    Build the metadata dict for YTVIS 2019 from ``YTVIS_CATEGORIES_2019``.

    Returns:
        dict: "thing_dataset_id_to_contiguous_id" (dataset id -> position among
        the thing categories), "thing_classes" (names) and "thing_colors".
    """
    things = [cat for cat in YTVIS_CATEGORIES_2019 if cat["isthing"] == 1]
    assert len(things) == 40, len(things)

    # Dataset category ids are mapped to their 0-based position, i.e. [0, 39].
    id_to_contiguous = {cat["id"]: position for position, cat in enumerate(things)}
    return {
        "thing_dataset_id_to_contiguous_id": id_to_contiguous,
        "thing_classes": [cat["name"] for cat in things],
        "thing_colors": [cat["color"] for cat in things],
    }
def _get_ytvis_2021_instances_meta():
    """Assemble the metadata dict for the YouTube-VIS 2021 "thing" categories.

    Returns:
        dict: with keys ``thing_dataset_id_to_contiguous_id`` (dataset category
        id -> position in ``[0, 39]``), ``thing_classes`` (category names) and
        ``thing_colors`` (category colors), all following the order of
        ``YTVIS_CATEGORIES_2021``.
    """
    things = [cat for cat in YTVIS_CATEGORIES_2021 if cat["isthing"] == 1]
    dataset_ids = [cat["id"] for cat in things]
    colors = [cat["color"] for cat in things]
    assert len(dataset_ids) == 40, len(dataset_ids)
    # Mapping from the incontiguous YTVIS category id to an id in [0, 39]
    contiguous_id_of = {}
    for position, dataset_id in enumerate(dataset_ids):
        contiguous_id_of[dataset_id] = position
    names = [cat["name"] for cat in things]
    return {
        "thing_dataset_id_to_contiguous_id": contiguous_id_of,
        "thing_classes": names,
        "thing_colors": colors,
    }
def load_ytvis_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None):
    """Load a YTVIS-format json annotation file into per-video dataset dicts.

    Args:
        json_file (str): path to the json annotation file.
        image_root (str): directory joined in front of every frame file name.
        dataset_name (str or None): when given, the metadata stored under this
            name receives ``thing_classes`` and
            ``thing_dataset_id_to_contiguous_id``, and every annotation's
            category id is translated through that mapping.
        extra_annotation_keys (list[str] or None): annotation fields copied
            into each object in addition to "iscrowd", "category_id" and "id".

    Returns:
        list[dict]: one record per video with keys "file_names", "height",
        "width", "length", "video_id" and "annotations"; "annotations" holds
        one list of object dicts per frame.
    """
    from .ytvis_api.ytvos import YTVOS

    timer = Timer()
    json_file = PathManager.get_local_path(json_file)
    # YTVOS prints progress messages; capture them instead of polluting stdout.
    with contextlib.redirect_stdout(io.StringIO()):
        ytvis_api = YTVOS(json_file)
    if timer.seconds() > 1:
        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))

    id_map = None
    if dataset_name is not None:
        meta = MetadataCatalog.get(dataset_name)
        cat_ids = sorted(ytvis_api.getCatIds())
        # The categories in a custom json file may not be sorted.
        ordered_cats = sorted(ytvis_api.loadCats(cat_ids), key=lambda x: x["id"])
        meta.thing_classes = [cat["name"] for cat in ordered_cats]

        # Category ids that are not exactly 1..#categories are translated to
        # contiguous ids. The translation is derived from the "categories"
        # field of the json; a warning is printed for datasets whose name does
        # not contain "coco".
        if (min(cat_ids) != 1 or max(cat_ids) != len(cat_ids)) and "coco" not in dataset_name:
            logger.warning(
                """
Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
"""
            )
        id_map = {v: i for i, v in enumerate(cat_ids)}
        meta.thing_dataset_id_to_contiguous_id = id_map

    # sort indices for reproducible results
    vid_ids = sorted(ytvis_api.vids.keys())
    # Each loaded video is a dict that looks something like:
    # {'license': 1,
    #  'flickr_url': ' ',
    #  'file_names': ['ff25f55852/00000.jpg', 'ff25f55852/00005.jpg', ..., 'ff25f55852/00175.jpg'],
    #  'height': 720,
    #  'width': 1280,
    #  'length': 36,
    #  'date_captured': '2019-04-11 00:55:41.903902',
    #  'id': 2232}
    vids = ytvis_api.loadVids(vid_ids)
    anns = [ytvis_api.vidToAnns[vid_id] for vid_id in vid_ids]

    total_num_valid_anns = sum([len(x) for x in anns])
    total_num_anns = len(ytvis_api.anns)
    if total_num_valid_anns < total_num_anns:
        logger.warning(
            f"{json_file} contains {total_num_anns} annotations, but only "
            f"{total_num_valid_anns} of them match to images in the file."
        )

    vids_anns = list(zip(vids, anns))
    logger.info("Loaded {} videos in YTVIS format from {}".format(len(vids_anns), json_file))

    dataset_dicts = []
    ann_keys = ["iscrowd", "category_id", "id"] + (extra_annotation_keys or [])
    num_instances_without_valid_segmentation = 0

    for vid_dict, anno_dict_list in vids_anns:
        num_frames = vid_dict["length"]
        record = {
            "file_names": [
                os.path.join(image_root, vid_dict["file_names"][i]) for i in range(num_frames)
            ],
            "height": vid_dict["height"],
            "width": vid_dict["width"],
            "length": num_frames,
            "video_id": vid_dict["id"],
        }
        video_id = record["video_id"]

        per_frame_objs = []
        for frame_idx in range(num_frames):
            objs_in_frame = []
            for anno in anno_dict_list:
                assert anno["video_id"] == video_id

                obj = {key: anno[key] for key in ann_keys if key in anno}

                all_boxes = anno.get("bboxes", None)
                all_segms = anno.get("segmentations", None)
                # Skip instances that have no box or no mask in this frame.
                if not (all_boxes and all_segms and all_boxes[frame_idx] and all_segms[frame_idx]):
                    continue

                segm = all_segms[frame_idx]
                obj["bbox"] = all_boxes[frame_idx]
                obj["bbox_mode"] = BoxMode.XYWH_ABS

                if isinstance(segm, dict):
                    if isinstance(segm["counts"], list):
                        # convert to compressed RLE
                        segm = mask_util.frPyObjects(segm, *segm["size"])
                elif segm:
                    # filter out invalid polygons (< 3 points)
                    segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
                    if not segm:
                        num_instances_without_valid_segmentation += 1
                        continue  # ignore this instance
                obj["segmentation"] = segm

                if id_map:
                    obj["category_id"] = id_map[obj["category_id"]]
                objs_in_frame.append(obj)
            per_frame_objs.append(objs_in_frame)

        record["annotations"] = per_frame_objs
        dataset_dicts.append(record)

    if num_instances_without_valid_segmentation > 0:
        logger.warning(
            "Filtered out {} instances without valid segmentation. ".format(
                num_instances_without_valid_segmentation
            )
            + "There might be issues in your dataset generation process. "
            "A valid polygon should be a list[float] with even length >= 6."
        )
    return dataset_dicts
def register_ytvis_instances(name, metadata, json_file, image_root):
    """
    Register a YTVIS-format json annotation file as a dataset for
    instance tracking.

    Args:
        name (str): the name that identifies a dataset, e.g. "ytvis_train".
        metadata (dict): extra metadata associated with this dataset; it is
            expanded as keyword arguments, so an empty dict is fine.
        json_file (str): path to the json instance annotation file.
        image_root (str or path-like): directory which contains all the images.
    """
    assert isinstance(name, str), name
    assert isinstance(json_file, (str, os.PathLike)), json_file
    assert isinstance(image_root, (str, os.PathLike)), image_root

    def _load_dicts():
        # Deferred loader: the json is only parsed when the catalog asks for it.
        return load_ytvis_json(json_file, image_root, name)

    # 1. register a function which returns dicts
    DatasetCatalog.register(name, _load_dicts)

    # 2. attach metadata that may be useful for evaluation, visualization or logging
    catalog_entry = MetadataCatalog.get(name)
    catalog_entry.set(
        json_file=json_file, image_root=image_root, evaluator_type="ytvis", **metadata
    )
if __name__ == "__main__":
    """
    Test the YTVIS json dataset loader.
    """
    import copy
    import sys

    import detectron2.data.datasets  # noqa # add pre-defined metadata
    from detectron2.utils.logger import setup_logger
    from detectron2.utils.visualizer import Visualizer
    from PIL import Image

    logger = setup_logger(name=__name__)
    meta = MetadataCatalog.get("ytvis_2019_train")

    json_file = "./datasets/ytvis/instances_train_sub.json"
    image_root = "./datasets/ytvis/train/JPEGImages"
    dicts = load_ytvis_json(json_file, image_root, dataset_name="ytvis_2019_train")
    logger.info("Done loading {} samples.".format(len(dicts)))

    dirname = "ytvis-data-vis"
    os.makedirs(dirname, exist_ok=True)

    def extract_frame_dic(dic, frame_idx):
        # Copy the video record and keep only the annotations of one frame.
        single = copy.deepcopy(dic)
        per_frame = single.get("annotations", None)
        if per_frame:
            single["annotations"] = per_frame[frame_idx]
        return single

    for video in dicts:
        # The parent directory of the first frame names the output sub-folder.
        vid_name = video["file_names"][0].split('/')[-2]
        out_dir = os.path.join(dirname, vid_name)
        os.makedirs(out_dir, exist_ok=True)
        for frame_idx, frame_path in enumerate(video["file_names"]):
            frame = np.array(Image.open(frame_path))
            drawn = Visualizer(frame, metadata=meta).draw_dataset_dict(
                extract_frame_dic(video, frame_idx)
            )
            drawn.save(os.path.join(dirname, vid_name, frame_path.split('/')[-1]))
================================================
FILE: mfvis_nococo/mask2former_video/data_video/datasets/ytvis_api/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi
================================================
FILE: mfvis_nococo/mask2former_video/data_video/datasets/ytvis_api/ytvos.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi
__author__ = 'ychfan'
# Interface for accessing the YouTubeVIS dataset.
# The following API functions are defined:
# YTVOS - YTVOS api class that loads YouTubeVIS annotation file and prepare data structures.
# decodeMask - Decode binary mask M encoded via run-length encoding.
# encodeMask - Encode binary mask M using run-length encoding.
# getAnnIds - Get ann ids that satisfy given filter conditions.
# getCatIds - Get cat ids that satisfy given filter conditions.
# getVidIds - Get vid ids that satisfy given filter conditions.
# loadAnns - Load anns with the specified ids.
# loadCats - Load cats with the specified ids.
# loadVids - Load vids with the specified ids.
# annToMask - Convert segmentation in an annotation to binary mask.
# loadRes - Load algorithm results and create API for accessing them.
# Microsoft COCO Toolbox. version 2.0
# Data, paper, and tutorials available at: http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
# Licensed under the Simplified BSD License [see bsd.txt]
import json
import time
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon
import numpy as np
import copy
import itertools
from pycocotools import mask as maskUtils
import os
from collections import defaultdict
import sys
# Major version of the running interpreter; used to pick the module that
# provides urlretrieve.
PYTHON_VERSION = sys.version_info[0]
if PYTHON_VERSION == 2:
    # Python 2 exposes urlretrieve directly on urllib
    from urllib import urlretrieve
elif PYTHON_VERSION == 3:
    # Python 3 moved it to urllib.request
    from urllib.request import urlretrieve
def _isArrayLike(obj):
return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
class YTVOS:
    """Helper for loading and querying a YouTubeVIS-style annotation file.

    ``createIndex`` builds id -> object lookups from whichever of the
    'annotations', 'videos' and 'categories' keys the dataset dict contains.
    """

    def __init__(self, annotation_file=None):
        """
        Constructor: optionally loads an annotation file and builds the indices.
        :param annotation_file (str): location of annotation file; when None an
                                      empty helper is created
        :return:
        """
        # load dataset
        self.dataset,self.anns,self.cats,self.vids = dict(),dict(),dict(),dict()
        self.vidToAnns, self.catToVids = defaultdict(list), defaultdict(list)
        if annotation_file is not None:
            print('loading annotations into memory...')
            tic = time.time()
            # use a context manager so the file handle is always closed
            with open(annotation_file, 'r') as f:
                dataset = json.load(f)
            assert isinstance(dataset, dict), 'annotation file format {} not supported'.format(type(dataset))
            print('Done (t={:0.2f}s)'.format(time.time()- tic))
            self.dataset = dataset
            self.createIndex()

    def createIndex(self):
        """
        Rebuild anns / vids / cats / vidToAnns / catToVids from self.dataset.
        :return:
        """
        # create index
        print('creating index...')
        anns, cats, vids = {}, {}, {}
        vidToAnns,catToVids = defaultdict(list),defaultdict(list)
        if 'annotations' in self.dataset:
            for ann in self.dataset['annotations']:
                vidToAnns[ann['video_id']].append(ann)
                anns[ann['id']] = ann

        if 'videos' in self.dataset:
            for vid in self.dataset['videos']:
                vids[vid['id']] = vid

        if 'categories' in self.dataset:
            for cat in self.dataset['categories']:
                cats[cat['id']] = cat

        if 'annotations' in self.dataset and 'categories' in self.dataset:
            for ann in self.dataset['annotations']:
                catToVids[ann['category_id']].append(ann['video_id'])

        print('index created!')

        # create class members
        self.anns = anns
        self.vidToAnns = vidToAnns
        self.catToVids = catToVids
        self.vids = vids
        self.cats = cats

    def info(self):
        """
        Print information about the annotation file.
        :return:
        """
        for key, value in self.dataset['info'].items():
            print('{}: {}'.format(key, value))

    def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None):
        """
        Get ann ids that satisfy given filter conditions. default skips that filter
        :param vidIds  (int array)     : get anns for given vids
               catIds  (int array)     : get anns for given cats
               areaRng (float array)   : get anns whose 'avg_area' lies strictly inside the range (e.g. [0 inf])
               iscrowd (boolean)       : get anns for given crowd label (False or True)
        :return: ids (int array)       : integer array of ann ids
        """
        vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        if len(vidIds) == len(catIds) == len(areaRng) == 0:
            anns = self.dataset['annotations']
        else:
            if len(vidIds) != 0:
                lists = [self.vidToAnns[vidId] for vidId in vidIds if vidId in self.vidToAnns]
                anns = list(itertools.chain.from_iterable(lists))
            else:
                anns = self.dataset['annotations']
            anns = anns if len(catIds)  == 0 else [ann for ann in anns if ann['category_id'] in catIds]
            anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['avg_area'] > areaRng[0] and ann['avg_area'] < areaRng[1]]
        if iscrowd is not None:
            ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
        else:
            ids = [ann['id'] for ann in anns]
        return ids

    def getCatIds(self, catNms=[], supNms=[], catIds=[]):
        """
        filtering parameters. default skips that filter.
        :param catNms (str array)  : get cats for given cat names
        :param supNms (str array)  : get cats for given supercategory names
        :param catIds (int array)  : get cats for given cat ids
        :return: ids (int array)   : integer array of cat ids
        """
        catNms = catNms if _isArrayLike(catNms) else [catNms]
        supNms = supNms if _isArrayLike(supNms) else [supNms]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        # every filter is skipped when its list is empty
        cats = self.dataset['categories']
        cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name']          in catNms]
        cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms]
        cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id']            in catIds]
        ids = [cat['id'] for cat in cats]
        return ids

    def getVidIds(self, vidIds=[], catIds=[]):
        '''
        Get vid ids that satisfy given filter conditions.
        :param vidIds (int array) : get vids for given ids
        :param catIds (int array) : get vids with all given cats
        :return: ids (int array)  : integer array of vid ids
        '''
        vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        if len(vidIds) == len(catIds) == 0:
            ids = self.vids.keys()
        else:
            ids = set(vidIds)
            for i, catId in enumerate(catIds):
                if i == 0 and len(ids) == 0:
                    # no explicit vid filter: start from the first category's videos
                    ids = set(self.catToVids[catId])
                else:
                    ids &= set(self.catToVids[catId])
        return list(ids)

    def loadAnns(self, ids=[]):
        """
        Load anns with the specified ids.
        :param ids (int array)       : integer ids specifying anns (a single integer is accepted too)
        :return: anns (object array) : loaded ann objects; None for any other input type
        """
        if _isArrayLike(ids):
            return [self.anns[id_] for id_ in ids]
        elif isinstance(ids, (int, np.integer)):
            # numpy integers (e.g. produced by np.unique) are valid scalar ids too
            return [self.anns[ids]]

    def loadCats(self, ids=[]):
        """
        Load cats with the specified ids.
        :param ids (int array)       : integer ids specifying cats (a single integer is accepted too)
        :return: cats (object array) : loaded cat objects; None for any other input type
        """
        if _isArrayLike(ids):
            return [self.cats[id_] for id_ in ids]
        elif isinstance(ids, (int, np.integer)):
            return [self.cats[ids]]

    def loadVids(self, ids=[]):
        """
        Load vids with the specified ids.
        :param ids (int array)       : integer ids specifying vid (a single integer is accepted too)
        :return: vids (object array) : loaded vid objects; None for any other input type
        """
        if _isArrayLike(ids):
            return [self.vids[id_] for id_ in ids]
        elif isinstance(ids, (int, np.integer)):
            return [self.vids[ids]]

    def loadRes(self, resFile):
        """
        Load result file and return a result api object.
        :param resFile (str)     : file name of result file; an already loaded
                                   list of result dicts is accepted as well
        :return: res (obj)       : result api object
        """
        res = YTVOS()
        res.dataset['videos'] = list(self.dataset['videos'])

        print('Loading and preparing results...')
        tic = time.time()
        if isinstance(resFile, str) or (PYTHON_VERSION == 2 and isinstance(resFile, unicode)):  # noqa: F821
            # use a context manager so the file handle is always closed
            with open(resFile) as f:
                anns = json.load(f)
        elif type(resFile) == np.ndarray:
            # NOTE(review): loadNumpyAnnotations is not defined on this class, so
            # this branch raises AttributeError unless a subclass provides it.
            anns = self.loadNumpyAnnotations(resFile)
        else:
            anns = resFile
        assert type(anns) == list, 'results in not an array of objects'
        annsVidIds = [ann['video_id'] for ann in anns]
        assert set(annsVidIds) == (set(annsVidIds) & set(self.getVidIds())), \
               'Results do not correspond to current coco set'
        # an empty result list is valid input: there is simply nothing to prepare
        if anns and 'segmentations' in anns[0]:
            res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
            for idx, ann in enumerate(anns):
                ann['areas'] = []
                if 'bboxes' not in ann:
                    ann['bboxes'] = []
                for seg in ann['segmentations']:
                    # now only support compressed RLE format as segmentation results
                    if seg:
                        ann['areas'].append(maskUtils.area(seg))
                        if len(ann['bboxes']) < len(ann['areas']):
                            ann['bboxes'].append(maskUtils.toBbox(seg))
                    else:
                        # keep per-frame lists aligned for frames without a mask
                        ann['areas'].append(None)
                        if len(ann['bboxes']) < len(ann['areas']):
                            ann['bboxes'].append(None)
                ann['id'] = idx+1
                valid_areas = [a for a in ann['areas'] if a]
                if len(valid_areas)==0:
                    ann['avg_area'] = 0
                else:
                    ann['avg_area'] = np.array(valid_areas).mean()
                ann['iscrowd'] = 0
        print('DONE (t={:0.2f}s)'.format(time.time()- tic))

        res.dataset['annotations'] = anns
        res.createIndex()
        return res

    def annToRLE(self, ann, frameId):
        """
        Convert the segmentation of one frame, which can be polygons or uncompressed RLE, to RLE.
        :param ann (dict)    : annotation with 'video_id' and per-frame 'segmentations'
        :param frameId (int) : index into ann['segmentations']
        :return: rle (dict)  : run-length encoding of that frame's mask
        """
        t = self.vids[ann['video_id']]
        h, w = t['height'], t['width']
        segm = ann['segmentations'][frameId]
        if type(segm) == list:
            # polygon -- a single object might consist of multiple parts
            # we merge all parts into one mask rle code
            rles = maskUtils.frPyObjects(segm, h, w)
            rle = maskUtils.merge(rles)
        elif type(segm['counts']) == list:
            # uncompressed RLE
            rle = maskUtils.frPyObjects(segm, h, w)
        else:
            # rle
            rle = segm
        return rle

    def annToMask(self, ann, frameId):
        """
        Convert the segmentation of one frame, which can be polygons, uncompressed RLE, or RLE, to a binary mask.
        :return: binary mask (numpy 2D array)
        """
        rle = self.annToRLE(ann, frameId)
        m = maskUtils.decode(rle)
        return m
================================================
FILE: mfvis_nococo/mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi
__author__ = 'ychfan'
import numpy as np
import datetime
import time
from collections import defaultdict
from pycocotools import mask as maskUtils
import copy
class YTVOSeval:
# Interface for evaluating video instance segmentation on the YouTubeVIS dataset.
#
# The usage for YTVOSeval is as follows:
# cocoGt=..., cocoDt=... # load dataset and results
# E = YTVOSeval(cocoGt,cocoDt); # initialize YTVOSeval object
# E.params.recThrs = ...; # set parameters as desired
# E.evaluate(); # run per image evaluation
# E.accumulate(); # accumulate per image results
# E.summarize(); # display summary metrics of results
# For example usage see evalDemo.m and http://mscoco.org/.
#
# The evaluation parameters are as follows (defaults in brackets):
# vidIds - [all] N video ids to use for evaluation
# catIds - [all] K cat ids to use for evaluation
# iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation
# recThrs - [0:.01:1] R=101 recall thresholds for evaluation
# areaRng - [...] A=4 object area ranges for evaluation
# maxDets - [1 10 100] M=3 thresholds on max detections per image
# iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints'
# iouType replaced the now DEPRECATED useSegm parameter.
# useCats - [1] if true use category labels for evaluation
# Note: if useCats=0 category labels are ignored as in proposal scoring.
# Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
#
# evaluate(): evaluates detections on every image and every category and
# concats the results into the "evalImgs" with fields:
# dtIds - [1xD] id for each of the D detections (dt)
# gtIds - [1xG] id for each of the G ground truths (gt)
# dtMatches - [TxD] matching gt id at each IoU or 0
# gtMatches - [TxG] matching dt id at each IoU or 0
# dtScores - [1xD] confidence of each dt
# gtIgnore - [1xG] ignore flag for each gt
# dtIgnore - [TxD] ignore flag for each dt at each IoU
#
# accumulate(): accumulates the per-image, per-category evaluation
# results in "evalImgs" into the dictionary "eval" with fields:
# params - parameters used for evaluation
# date - date evaluation was performed
# counts - [T,R,K,A,M] parameter dimensions (see above)
# precision - [TxRxKxAxM] precision for every evaluation setting
# recall - [TxKxAxM] max recall for every evaluation setting
# Note: precision and recall==-1 for settings with no gt objects.
#
# See also coco, mask, pycocoDemo, pycocoEvalDemo
#
# Microsoft COCO Toolbox. version 2.0
# Data, paper, and tutorials available at: http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
# Licensed under the Simplified BSD License [see coco/license.txt]
def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
    '''
    Initialize the evaluator using the dataset APIs for gt and dt
    :param cocoGt: api object with ground truth annotations
    :param cocoDt: api object with detection results
    :param iouType: evaluation type handed to Params
    :return: None
    '''
    if not iouType:
        print('iouType not specified. use default iouType segm')
    self.cocoGt   = cocoGt              # ground truth API
    self.cocoDt   = cocoDt              # detections API
    self.evalVids = defaultdict(list)   # per-video per-category evaluation results [KxAxI] elements
    # evaluate() stores its results in evalImgs and accumulate() reads them;
    # initialise the attribute so accumulate() can print its hint instead of
    # failing with AttributeError when evaluate() has not run yet.
    self.evalImgs = defaultdict(list)
    self.eval     = {}                  # accumulated evaluation results
    self._gts = defaultdict(list)       # gt for evaluation
    self._dts = defaultdict(list)       # dt for evaluation
    self.params = Params(iouType=iouType) # parameters
    self._paramsEval = {}               # parameters for evaluation
    self.stats = []                     # result summarization
    self.ious = {}                      # ious between all gts and dts
    if cocoGt is not None:
        self.params.vidIds = sorted(cocoGt.getVidIds())
        self.params.catIds = sorted(cocoGt.getCatIds())
def _prepare(self):
'''
Prepare ._gts and ._dts for evaluation based on params
:return: None
'''
def _toMask(anns, coco):
# modify ann['segmentation'] by reference
for ann in anns:
for i, a in enumerate(ann['segmentations']):
if a:
rle = coco.annToRLE(ann, i)
ann['segmentations'][i] = rle
l = [a for a in ann['areas'] if a]
if len(l)==0:
ann['avg_area'] = 0
else:
ann['avg_area'] = np.array(l).mean()
p = self.params
if p.useCats:
gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds))
dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds))
else:
gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds))
dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds))
# convert ground truth to mask if iouType == 'segm'
if p.iouType == 'segm':
_toMask(gts, self.cocoGt)
_toMask(dts, self.cocoDt)
# set ignore flag
for gt in gts:
gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0
gt['ignore'] = 'iscrowd' in gt and gt['iscrowd']
if p.iouType == 'keypoints':
gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore']
self._gts = defaultdict(list) # gt for evaluation
self._dts = defaultdict(list) # dt for evaluation
for gt in gts:
self._gts[gt['video_id'], gt['category_id']].append(gt)
for dt in dts:
self._dts[dt['video_id'], dt['category_id']].append(dt)
self.evalVids = defaultdict(list) # per-image per-category evaluation results
self.eval = {} # accumulated evaluation results
def evaluate(self):
'''
Run per image evaluation on given images and store results (a list of dict) in self.evalVids
:return: None
'''
tic = time.time()
print('Running per image evaluation...')
p = self.params
# add backward compatibility if useSegm is specified in params
if not p.useSegm is None:
p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
print('Evaluate annotation type *{}*'.format(p.iouType))
p.vidIds = list(np.unique(p.vidIds))
if p.useCats:
p.catIds = list(np.unique(p.catIds))
p.maxDets = sorted(p.maxDets)
self.params=p
self._prepare()
# loop through images, area range, max detection number
catIds = p.catIds if p.useCats else [-1]
if p.iouType == 'segm' or p.iouType == 'bbox':
computeIoU = self.computeIoU
elif p.iouType == 'keypoints':
computeIoU = self.computeOks
self.ious = {(vidId, catId): computeIoU(vidId, catId) \
for vidId in p.vidIds
for catId in catIds}
evaluateVid = self.evaluateVid
maxDet = p.maxDets[-1]
self.evalImgs = [evaluateVid(vidId, catId, areaRng, maxDet)
for catId in catIds
for areaRng in p.areaRng
for vidId in p.vidIds
]
self._paramsEval = copy.deepcopy(self.params)
toc = time.time()
print('DONE (t={:0.2f}s).'.format(toc-tic))
def computeIoU(self, vidId, catId):
p = self.params
if p.useCats:
gt = self._gts[vidId,catId]
dt = self._dts[vidId,catId]
else:
gt = [_ for cId in p.catIds for _ in self._gts[vidId,cId]]
dt = [_ for cId in p.catIds for _ in self._dts[vidId,cId]]
if len(gt) == 0 and len(dt) ==0:
return []
inds = np.argsort([-d['score'] for d in dt], kind='mergesort')
dt = [dt[i] for i in inds]
if len(dt) > p.maxDets[-1]:
dt=dt[0:p.maxDets[-1]]
if p.iouType == 'segm':
g = [g['segmentations'] for g in gt]
d = [d['segmentations'] for d in dt]
elif p.iouType == 'bbox':
g = [g['bboxes'] for g in gt]
d = [d['bboxes'] for d in dt]
else:
raise Exception('unknown iouType for iou computation')
# compute iou between each dt and gt region
iscrowd = [int(o['iscrowd']) for o in gt]
#ious = maskUtils.iou(d,g,iscrowd)
def iou_seq(d_seq, g_seq):
i = .0
u = .0
for d, g in zip(d_seq, g_seq):
if d and g:
i += maskUtils.area(maskUtils.merge([d, g], True))
u += maskUtils.area(maskUtils.merge([d, g], False))
elif not d and g:
u += maskUtils.area(g)
elif d and not g:
u += maskUtils.area(d)
if not u > .0:
print("Mask sizes in video {} and category {} may not match!".format(vidId, catId))
iou = i / u if u > .0 else .0
return iou
ious = np.zeros([len(d), len(g)])
for i, j in np.ndindex(ious.shape):
ious[i, j] = iou_seq(d[i], g[j])
#print(vidId, catId, ious.shape, ious)
return ious
def computeOks(self, imgId, catId):
    '''
    Compute a keypoint similarity score between every detection and every
    ground truth stored under (imgId, catId)
    :param imgId: first component of the key into self._gts / self._dts
    :param catId: second component of the key into self._gts / self._dts
    :return: [] when there are no gts or no dts, else an array of shape
             [#dt, #gt] with detections ordered by descending score
    '''
    # NOTE(review): this method reads gt['keypoints'] and gt['bbox'], while the
    # other methods here use per-frame 'segmentations' / 'bboxes' lists --
    # confirm keypoint annotations are actually supported for this dataset.
    p = self.params
    # dimension here should be Nxm
    gts = self._gts[imgId, catId]
    dts = self._dts[imgId, catId]
    # sort detections by descending score and keep at most maxDets[-1] of them
    inds = np.argsort([-d['score'] for d in dts], kind='mergesort')
    dts = [dts[i] for i in inds]
    if len(dts) > p.maxDets[-1]:
        dts = dts[0:p.maxDets[-1]]
    # if len(gts) == 0 and len(dts) == 0:
    if len(gts) == 0 or len(dts) == 0:
        return []
    ious = np.zeros((len(dts), len(gts)))
    # 17 per-keypoint constants, so keypoint arrays are expected to hold 17 triplets
    sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0
    vars = (sigmas * 2)**2
    k = len(sigmas)
    # compute oks between each detection and ground truth object
    for j, gt in enumerate(gts):
        # create bounds for ignore regions(double the gt bbox)
        g = np.array(gt['keypoints'])
        # keypoints are stored flat; every third value is read as x, y, flag
        xg = g[0::3]; yg = g[1::3]; vg = g[2::3]
        # number of ground-truth keypoints with a positive flag
        k1 = np.count_nonzero(vg > 0)
        bb = gt['bbox']
        x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2
        y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2
        for i, dt in enumerate(dts):
            d = np.array(dt['keypoints'])
            xd = d[0::3]; yd = d[1::3]
            if k1>0:
                # measure the per-keypoint distance if keypoints visible
                dx = xd - xg
                dy = yd - yg
            else:
                # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
                z = np.zeros((k))
                dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0)
                dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0)
            # squared distance normalised by the per-keypoint constant and the gt area
            e = (dx**2 + dy**2) / vars / (gt['avg_area']+np.spacing(1)) / 2
            if k1 > 0:
                # only keypoints with a positive flag contribute
                e=e[vg > 0]
            ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
    return ious
def evaluateVid(self, vidId, catId, aRng, maxDet):
'''
perform evaluation for single category and image
:return: dict (single image results)
'''
p = self.params
if p.useCats:
gt = self._gts[vidId,catId]
dt = self._dts[vidId,catId]
else:
gt = [_ for cId in p.catIds for _ in self._gts[vidId,cId]]
dt = [_ for cId in p.catIds for _ in self._dts[vidId,cId]]
if len(gt) == 0 and len(dt) ==0:
return None
for g in gt:
if g['ignore'] or (g['avg_area']aRng[1]):
g['_ignore'] = 1
else:
g['_ignore'] = 0
# sort dt highest score first, sort gt ignore last
gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
gt = [gt[i] for i in gtind]
dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
dt = [dt[i] for i in dtind[0:maxDet]]
iscrowd = [int(o['iscrowd']) for o in gt]
# load computed ious
ious = self.ious[vidId, catId][:, gtind] if len(self.ious[vidId, catId]) > 0 else self.ious[vidId, catId]
T = len(p.iouThrs)
G = len(gt)
D = len(dt)
gtm = np.zeros((T,G))
dtm = np.zeros((T,D))
gtIg = np.array([g['_ignore'] for g in gt])
dtIg = np.zeros((T,D))
if not len(ious)==0:
for tind, t in enumerate(p.iouThrs):
for dind, d in enumerate(dt):
# information about best match so far (m=-1 -> unmatched)
iou = min([t,1-1e-10])
m = -1
for gind, g in enumerate(gt):
# if this gt already matched, and not a crowd, continue
if gtm[tind,gind]>0 and not iscrowd[gind]:
continue
# if dt matched to reg gt, and on ignore gt, stop
if m>-1 and gtIg[m]==0 and gtIg[gind]==1:
break
# continue to next gt unless better match made
if ious[dind,gind] < iou:
continue
# if match successful and best so far, store appropriately
iou=ious[dind,gind]
m=gind
# if match made store id of match for both dt and gt
if m ==-1:
continue
dtIg[tind,dind] = gtIg[m]
dtm[tind,dind] = gt[m]['id']
gtm[tind,m] = d['id']
# set unmatched detections outside of area range to ignore
a = np.array([d['avg_area']aRng[1] for d in dt]).reshape((1, len(dt)))
dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0)))
# store results for given image and category
return {
'video_id': vidId,
'category_id': catId,
'aRng': aRng,
'maxDet': maxDet,
'dtIds': [d['id'] for d in dt],
'gtIds': [g['id'] for g in gt],
'dtMatches': dtm,
'gtMatches': gtm,
'dtScores': [d['score'] for d in dt],
'gtIgnore': gtIg,
'dtIgnore': dtIg,
}
def accumulate(self, p = None):
'''
Accumulate per image evaluation results and store the result in self.eval
:param p: input params for evaluation
:return: None
'''
print('Accumulating evaluation results...')
tic = time.time()
if not self.evalImgs:
print('Please run evaluate() first')
# allows input customized parameters
if p is None:
p = self.params
p.catIds = p.catIds if p.useCats == 1 else [-1]
T = len(p.iouThrs)
R = len(p.recThrs)
K = len(p.catIds) if p.useCats else 1
A = len(p.areaRng)
M = len(p.maxDets)
precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories
recall = -np.ones((T,K,A,M))
scores = -np.ones((T,R,K,A,M))
# create dictionary for future indexing
_pe = self._paramsEval
catIds = _pe.catIds if _pe.useCats else [-1]
setK = set(catIds)
setA = set(map(tuple, _pe.areaRng))
setM = set(_pe.maxDets)
setI = set(_pe.vidIds)
# get inds to evaluate
k_list = [n for n, k in enumerate(p.catIds) if k in setK]
m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
i_list = [n for n, i in enumerate(p.vidIds) if i in setI]
I0 = len(_pe.vidIds)
A0 = len(_pe.areaRng)
# retrieve E at each category, area range, and max number of detections
for k, k0 in enumerate(k_list):
Nk = k0*A0*I0
for a, a0 in enumerate(a_list):
Na = a0*I0
for m, maxDet in enumerate(m_list):
E = [self.evalImgs[Nk + Na + i] for i in i_list]
E = [e for e in E if not e is None]
if len(E) == 0:
continue
dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])
# different sorting method generates slightly different results.
# mergesort is used to be consistent as Matlab implementation.
inds = np.argsort(-dtScores, kind='mergesort')
dtScoresSorted = dtScores[inds]
dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds]
gtIg = np.concatenate([e['gtIgnore'] for e in E])
npig = np.count_nonzero(gtIg==0 )
if npig == 0:
continue
tps = np.logical_and( dtm, np.logical_not(dtIg) )
fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) )
tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float)
fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float)
for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
tp = np.array(tp)
fp = np.array(fp)
nd = len(tp)
rc = tp / npig
pr = tp / (fp+tp+np.spacing(1))
q = np.zeros((R,))
ss = np.zeros((R,))
if nd:
recall[t,k,a,m] = rc[-1]
else:
recall[t,k,a,m] = 0
# numpy is slow without cython optimization for accessing elements
# use python array gets significant speed improvement
pr = pr.tolist(); q = q.tolist()
for i in range(nd-1, 0, -1):
if pr[i] > pr[i-1]:
pr[i-1] = pr[i]
inds = np.searchsorted(rc, p.recThrs, side='left')
try:
for ri, pi in enumerate(inds):
q[ri] = pr[pi]
ss[ri] = dtScoresSorted[pi]
except:
pass
precision[t,:,k,a,m] = np.array(q)
scores[t,:,k,a,m] = np.array(ss)
self.eval = {
'params': p,
'counts': [T, R, K, A, M],
'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
'precision': precision,
'recall': recall,
'scores': scores,
}
toc = time.time()
print('DONE (t={:0.2f}s).'.format( toc-tic))
def summarize(self):
    '''
    Compute and display summary metrics for evaluation results.

    One line is printed per metric and the resulting vector is stored in
    ``self.stats`` (12 entries for 'segm'/'bbox', 10 for 'keypoints').
    Note this function can *only* be applied on the default parameter setting.

    Raises:
        Exception: if ``self.eval`` is empty (``accumulate()`` was not run)
            or if ``self.params.iouType`` is not 'segm', 'bbox' or
            'keypoints'.
    '''
    def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100):
        # Average one slice of the accumulated precision (ap == 1) or
        # recall (otherwise) array, print it, and return the mean.
        p = self.params
        iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
        titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
        typeStr = '(AP)' if ap == 1 else '(AR)'
        iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
            if iouThr is None else '{:0.2f}'.format(iouThr)

        aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
        mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
        if ap == 1:
            # dimension of precision: [TxRxKxAxM]
            s = self.eval['precision']
        else:
            # dimension of recall: [TxKxAxM]
            s = self.eval['recall']
        if iouThr is not None:
            # Thresholds are floats; a tolerant comparison keeps the lookup
            # working when a threshold carries rounding noise.
            t = np.where(np.isclose(iouThr, p.iouThrs))[0]
            s = s[t]
        s = s[:, :, :, aind, mind] if ap == 1 else s[:, :, aind, mind]
        # Entries equal to -1 mark slots that were never filled.
        valid = s[s > -1]
        mean_s = -1 if len(valid) == 0 else np.mean(valid)
        print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
        return mean_s

    def _summarizeDets():
        max_dets = self.params.maxDets
        stats = np.zeros((12,))
        stats[0] = _summarize(1)
        stats[1] = _summarize(1, iouThr=.5, maxDets=max_dets[2])
        stats[2] = _summarize(1, iouThr=.75, maxDets=max_dets[2])
        stats[3] = _summarize(1, areaRng='small', maxDets=max_dets[2])
        stats[4] = _summarize(1, areaRng='medium', maxDets=max_dets[2])
        stats[5] = _summarize(1, areaRng='large', maxDets=max_dets[2])
        stats[6] = _summarize(0, maxDets=max_dets[0])
        stats[7] = _summarize(0, maxDets=max_dets[1])
        stats[8] = _summarize(0, maxDets=max_dets[2])
        stats[9] = _summarize(0, areaRng='small', maxDets=max_dets[2])
        stats[10] = _summarize(0, areaRng='medium', maxDets=max_dets[2])
        stats[11] = _summarize(0, areaRng='large', maxDets=max_dets[2])
        return stats

    def _summarizeKps():
        stats = np.zeros((10,))
        stats[0] = _summarize(1, maxDets=20)
        stats[1] = _summarize(1, maxDets=20, iouThr=.5)
        stats[2] = _summarize(1, maxDets=20, iouThr=.75)
        stats[3] = _summarize(1, maxDets=20, areaRng='medium')
        stats[4] = _summarize(1, maxDets=20, areaRng='large')
        stats[5] = _summarize(0, maxDets=20)
        stats[6] = _summarize(0, maxDets=20, iouThr=.5)
        stats[7] = _summarize(0, maxDets=20, iouThr=.75)
        stats[8] = _summarize(0, maxDets=20, areaRng='medium')
        stats[9] = _summarize(0, maxDets=20, areaRng='large')
        return stats

    if not self.eval:
        raise Exception('Please run accumulate() first')
    iouType = self.params.iouType
    if iouType == 'segm' or iouType == 'bbox':
        summarize_fn = _summarizeDets
    elif iouType == 'keypoints':
        summarize_fn = _summarizeKps
    else:
        # Previously this fell through and crashed with an unbound local.
        raise Exception('iouType not supported')
    self.stats = summarize_fn()
def __str__(self):
    '''
    Return the text printed by ``summarize()``.

    ``summarize()`` only prints and returns nothing, so forwarding its
    result made ``str(obj)`` fail with "__str__ returned non-string".
    The printed summary is captured and returned instead; ``summarize()``
    still runs, so its side effects (e.g. setting ``self.stats``) are kept.
    '''
    import contextlib
    import io

    captured = io.StringIO()
    with contextlib.redirect_stdout(captured):
        self.summarize()
    return captured.getvalue()
class Params:
    '''
    Params for coco evaluation api

    Attributes set by the constructor:
        vidIds, catIds: id filters, both initialised to empty lists.
        iouThrs: 10 evenly spaced IoU thresholds from 0.5 to 0.95.
        recThrs: 101 evenly spaced recall thresholds from 0.0 to 1.0.
        maxDets: detection-count cut-offs.
        areaRng / areaRngLbl: area intervals and their labels (same order).
        useCats: set to 1.
        iouType: the value passed to the constructor.
        useSegm: deprecated, always None.
    '''
    def setDetParams(self):
        '''Populate the defaults used for iouType 'segm' and 'bbox'.'''
        self.vidIds = []
        self.catIds = []
        # np.arange causes trouble. the data point on arange is slightly larger than the true value
        # np.linspace needs an integer sample count, hence the int(...) cast.
        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
        self.maxDets = [1, 10, 100]
        self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 128 ** 2], [128 ** 2, 256 ** 2], [256 ** 2, 1e5 ** 2]]
        self.areaRngLbl = ['all', 'small', 'medium', 'large']
        self.useCats = 1

    def setKpParams(self):
        '''Populate the defaults used for iouType 'keypoints'.'''
        self.vidIds = []
        self.catIds = []
        # np.arange causes trouble. the data point on arange is slightly larger than the true value
        # The sample count must be an int: a float count raises TypeError
        # on current NumPy releases (setDetParams already casts it).
        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
        self.maxDets = [20]
        self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
        self.areaRngLbl = ['all', 'medium', 'large']
        self.useCats = 1

    def __init__(self, iouType='segm'):
        '''
        Args:
            iouType (str): 'segm', 'bbox' or 'keypoints'.

        Raises:
            Exception: for any other ``iouType``.
        '''
        if iouType == 'segm' or iouType == 'bbox':
            self.setDetParams()
        elif iouType == 'keypoints':
            self.setKpParams()
        else:
            raise Exception('iouType not supported')
        self.iouType = iouType
        # useSegm is deprecated
        self.useSegm = None
================================================
FILE: mfvis_nococo/mask2former_video/data_video/ytvis_eval.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
import contextlib
import copy
import io
import itertools
import json
import logging
import numpy as np
import os
from collections import OrderedDict
import pycocotools.mask as mask_util
import torch
from .datasets.ytvis_api.ytvos import YTVOS
from .datasets.ytvis_api.ytvoseval import YTVOSeval
from tabulate import tabulate
import detectron2.utils.comm as comm
from detectron2.config import CfgNode
from detectron2.data import MetadataCatalog
from detectron2.evaluation import DatasetEvaluator
from detectron2.utils.file_io import PathManager
from detectron2.utils.logger import create_small_table
class YTVISEvaluator(DatasetEvaluator):
    """
    Evaluate video instance segmentation predictions with the YTVIS API.

    Predictions are collected per video through :meth:`process`, optionally
    gathered across ranks, dumped to disk, and scored with ``YTVOSeval``.
    The scores are reported under the "segm" key of the result dict.
    """

    def __init__(
        self,
        dataset_name,
        tasks=None,
        distributed=True,
        output_dir=None,
        *,
        use_fast_impl=True,
    ):
        """
        Args:
            dataset_name (str): name of the dataset to be evaluated. Its
                metadata must provide "json_file", the path to the
                annotation file loaded with ``YTVOS``.
            tasks (tuple[str] or CfgNode): stored on the instance; passing a
                CfgNode is deprecated and is replaced by None with a warning.
            distributed (bool): if True, will collect results from all ranks
                and run evaluation in the main process. Otherwise, will only
                evaluate the results in the current process.
            output_dir (str): optional, an output directory to dump all
                results predicted on the dataset. The dump contains two files:

                1. "instances_predictions.pth" a file in torch serialization
                   format that contains all the raw original predictions.
                2. "results.json" a json file with the predictions after
                   category ids were mapped back to dataset ids.
            use_fast_impl (bool): stored on the instance; not consulted by
                the methods of this class.
        """
        self._logger = logging.getLogger(__name__)
        self._distributed = distributed
        self._output_dir = output_dir
        self._use_fast_impl = use_fast_impl
        # Start with an empty buffer so process() works even when the caller
        # did not call reset() first.
        self._predictions = []

        if tasks is not None and isinstance(tasks, CfgNode):
            self._logger.warning(
                "COCO Evaluator instantiated using config, this is deprecated behavior."
                " Please pass in explicit arguments instead."
            )
            self._tasks = None  # Infering it from predictions should be better
        else:
            self._tasks = tasks

        self._cpu_device = torch.device("cpu")

        self._metadata = MetadataCatalog.get(dataset_name)

        json_file = PathManager.get_local_path(self._metadata.json_file)
        # The API prints while loading; keep that off stdout.
        with contextlib.redirect_stdout(io.StringIO()):
            self._ytvis_api = YTVOS(json_file)

        # Test set json files do not contain annotations (evaluation must be
        # performed using the evaluation server).
        self._do_evaluation = "annotations" in self._ytvis_api.dataset

    def reset(self):
        """Discard every prediction collected so far."""
        self._predictions = []

    def process(self, inputs, outputs):
        """
        Convert one batch of model outputs and append it to the buffer.

        Args:
            inputs: the inputs to the model, forwarded unchanged to
                ``instances_to_coco_json_video``.
            outputs: the outputs of the model, forwarded unchanged to
                ``instances_to_coco_json_video``.
        """
        prediction = instances_to_coco_json_video(inputs, outputs)
        self._predictions.extend(prediction)

    def evaluate(self):
        """
        Gather the collected predictions and compute the metrics.

        Returns:
            dict: a deep copy of the results (metrics under "segm" when
            annotations are available). An empty dict is returned on
            non-main ranks in distributed mode and when there are no
            predictions at all.
        """
        if self._distributed:
            comm.synchronize()
            predictions = comm.gather(self._predictions, dst=0)
            predictions = list(itertools.chain(*predictions))

            if not comm.is_main_process():
                return {}
        else:
            predictions = self._predictions

        if len(predictions) == 0:
            self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(predictions, f)

        self._results = OrderedDict()
        self._eval_predictions(predictions)
        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)

    def _eval_predictions(self, predictions):
        """
        Evaluate predictions. Fill self._results with the metrics of the tasks.

        Note that the "category_id" of every prediction dict is rewritten in
        place when the metadata defines a contiguous-id mapping.
        """
        self._logger.info("Preparing results for YTVIS format ...")

        # unmap the category ids back to the dataset ids
        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
            dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
            all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
            num_classes = len(all_contiguous_ids)
            assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1

            reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
            for result in predictions:
                category_id = result["category_id"]
                assert category_id < num_classes, (
                    f"A prediction has class={category_id}, "
                    f"but the dataset only has {num_classes} classes and "
                    f"predicted class id should be in [0, {num_classes - 1}]."
                )
                result["category_id"] = reverse_id_mapping[category_id]

        if self._output_dir:
            file_path = os.path.join(self._output_dir, "results.json")
            self._logger.info("Saving results to {}".format(file_path))
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(predictions))
                f.flush()

        if not self._do_evaluation:
            self._logger.info("Annotations are not available for evaluation.")
            return

        coco_eval = (
            _evaluate_predictions_on_coco(
                self._ytvis_api,
                predictions,
            )
            if len(predictions) > 0
            else None  # cocoapi does not handle empty results very well
        )

        res = self._derive_coco_results(
            coco_eval, class_names=self._metadata.get("thing_classes")
        )
        self._results["segm"] = res

    def _derive_coco_results(self, coco_eval, class_names=None):
        """
        Derive the desired score numbers from a summarized evaluation object.

        Args:
            coco_eval (None or YTVOSeval): None represents no predictions
                from the model.
            class_names (None or list[str]): if provided (with more than one
                name), per-category AP is reported as well.

        Returns:
            a dict of {metric name: score}
        """
        # Metric i is read from coco_eval.stats[i].
        metrics = ["AP", "AP50", "AP75", "APs", "APm", "APl", "AR1", "AR10"]

        if coco_eval is None:
            # Logger.warn is a deprecated alias (removed in Python 3.13).
            self._logger.warning("No predictions from the model!")
            return {metric: float("nan") for metric in metrics}

        # the standard metrics; negative stats mean "not computable"
        results = {
            metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
            for idx, metric in enumerate(metrics)
        }
        self._logger.info(
            "Evaluation results for {}: \n".format("segm") + create_small_table(results)
        )
        if not np.isfinite(sum(results.values())):
            self._logger.info("Some metrics cannot be computed and is shown as NaN.")

        if class_names is None or len(class_names) <= 1:
            return results
        # Compute per-category AP
        # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
        precisions = coco_eval.eval["precision"]
        # precision has dims (iou, recall, cls, area range, max dets)
        assert len(class_names) == precisions.shape[2]

        results_per_category = []
        for idx, name in enumerate(class_names):
            # area range index 0: all area ranges
            # max dets index -1: the largest detection cut-off
            precision = precisions[:, :, idx, 0, -1]
            precision = precision[precision > -1]
            ap = np.mean(precision) if precision.size else float("nan")
            results_per_category.append(("{}".format(name), float(ap * 100)))

        # tabulate it
        N_COLS = min(6, len(results_per_category) * 2)
        results_flatten = list(itertools.chain(*results_per_category))
        results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
        table = tabulate(
            results_2d,
            tablefmt="pipe",
            floatfmt=".3f",
            headers=["category", "AP"] * (N_COLS // 2),
            numalign="left",
        )
        self._logger.info("Per-category {} AP: \n".format("segm") + table)

        results.update({"AP-" + name: ap for name, ap in results_per_category})
        return results
def instances_to_coco_json_video(inputs, outputs):
    """
    Convert the predictions for a single video into a list of result dicts.

    Args:
        inputs (list[dict]): must hold exactly one element; its "video_id"
            entry is copied into every result.
        outputs (dict): must provide "pred_scores", "pred_labels" and
            "pred_masks", which are iterated in lockstep (one entry per
            predicted instance). Every entry of "pred_masks" is iterated
            again and each item is RLE-encoded as a 2-D mask.

    Returns:
        list[dict]: one dict per instance with the keys "video_id", "score",
        "category_id" and "segmentations" (a list of RLE dicts whose
        "counts" were decoded to ``str`` so the result is JSON-serializable).
    """
    assert len(inputs) == 1, "More than one inputs are loaded for inference!"

    video_id = inputs[0]["video_id"]

    scores = outputs["pred_scores"]
    labels = outputs["pred_labels"]
    masks = outputs["pred_masks"]

    ytvis_results = []
    for s, l, m in zip(scores, labels, masks):
        # encode() takes an (H, W, N) Fortran-ordered uint8 array; wrap each
        # mask as N == 1 and keep the single RLE it returns.
        segms = [
            mask_util.encode(np.array(_mask[:, :, None], order="F", dtype="uint8"))[0]
            for _mask in m
        ]
        for rle in segms:
            rle["counts"] = rle["counts"].decode("utf-8")

        ytvis_results.append({
            "video_id": video_id,
            "score": s,
            "category_id": l,
            "segmentations": segms,
        })

    return ytvis_results
def _evaluate_predictions_on_coco(
    coco_gt,
    coco_results,
    img_ids=None,
):
    """
    Evaluate the results using the YTVOSeval API.

    Args:
        coco_gt: ground-truth API object; its ``loadRes`` method is used to
            wrap the results.
        coco_results (list[dict]): non-empty list of result dicts. The list
            is deep-copied, so the caller's data is left untouched.
        img_ids (list or None): optional ids restricting the evaluation.

    Returns:
        YTVOSeval: the evaluator after evaluate(), accumulate() and
        summarize() have been run.
    """
    assert len(coco_results) > 0

    coco_results = copy.deepcopy(coco_results)
    # When evaluating mask AP, if the results contain bbox, cocoapi will
    # use the box area as the area of the instance, instead of the mask area.
    # This leads to a different definition of small/medium/large.
    # We remove the bbox field to let mask AP use mask area.
    for c in coco_results:
        c.pop("bbox", None)

    coco_dt = coco_gt.loadRes(coco_results)
    coco_eval = YTVOSeval(coco_gt, coco_dt)
    # For COCO, the default max_dets_per_image is [1, 10, 100].
    max_dets_per_image = [1, 10, 100]  # Default from COCOEval
    coco_eval.params.maxDets = max_dets_per_image

    if img_ids is not None:
        # The YTVOS Params object names its id filter "vidIds"; writing to
        # "imgIds" only created an unused attribute and filtered nothing.
        coco_eval.params.vidIds = img_ids

    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

    return coco_eval
================================================
FILE: mfvis_nococo/mask2former_video/modeling/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder
================================================
FILE: mfvis_nococo/mask2former_video/modeling/criterion.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
MaskFormer criterion.
"""
import logging
import torch
import torch.nn.functional as F
from torch import nn
from detectron2.utils.comm import get_world_size
from detectron2.projects.point_rend.point_features import (
get_uncertain_point_coords_with_randomness,
point_sample,
)
from mask2former.utils.misc import is_dist_avail_and_initialized
def unfold_wo_center(x, kernel_size, dilation):
    """
    Gather, for every pixel, its dilated neighbourhood minus the pixel itself.

    Args:
        x (Tensor): 4-D input (N, C, H, W).
        kernel_size (int): odd neighbourhood size.
        dilation (int): spacing between neighbourhood taps.

    Returns:
        Tensor: (N, C, kernel_size ** 2 - 1, H, W).
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    # Half of the dilated kernel extent keeps the spatial size unchanged.
    same_pad = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    patches = F.unfold(x, kernel_size=kernel_size, padding=same_pad, dilation=dilation)
    patches = patches.reshape(x.size(0), x.size(1), -1, x.size(2), x.size(3))

    # The middle tap is the pixel itself; drop it.
    center = (kernel_size ** 2) // 2
    before_center = patches[:, :, :center]
    after_center = patches[:, :, center + 1:]
    return torch.cat((before_center, after_center), dim=2)
def unfold_w_center(x, kernel_size, dilation):
    """
    Gather, for every pixel, its full dilated neighbourhood (centre included).

    Args:
        x (Tensor): 4-D input (N, C, H, W).
        kernel_size (int): odd neighbourhood size.
        dilation (int): spacing between neighbourhood taps.

    Returns:
        Tensor: (N, C, kernel_size ** 2, H, W).
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    # Half of the dilated kernel extent keeps the spatial size unchanged.
    same_pad = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    patches = F.unfold(x, kernel_size=kernel_size, padding=same_pad, dilation=dilation)
    return patches.reshape(x.size(0), x.size(1), -1, x.size(2), x.size(3))
def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation):
    """
    Negative log-probability that a pixel and each of its neighbours agree.

    Args:
        mask_logits (Tensor): 4-D mask logits; only channel 0 of the result
            is returned.
        pairwise_size (int): neighbourhood size passed to ``unfold_wo_center``.
        pairwise_dilation (int): neighbourhood dilation.

    Returns:
        Tensor: ``-log P(same label)`` for every (pixel, neighbour) pair.
    """
    assert mask_logits.dim() == 4

    fg_log = F.logsigmoid(mask_logits)
    bg_log = F.logsigmoid(-mask_logits)

    fg_log_nbrs = unfold_wo_center(
        fg_log, kernel_size=pairwise_size, dilation=pairwise_dilation
    )
    bg_log_nbrs = unfold_wo_center(
        bg_log, kernel_size=pairwise_size, dilation=pairwise_dilation
    )

    # P(same) = p_i * p_j + (1 - p_i) * (1 - p_j); both products are kept in
    # log space and combined with a max-shifted log-sum-exp for stability.
    both_fg = fg_log[:, :, None] + fg_log_nbrs
    both_bg = bg_log[:, :, None] + bg_log_nbrs
    pivot = torch.max(both_fg, both_bg)
    log_agree = torch.log(
        torch.exp(both_fg - pivot) + torch.exp(both_bg - pivot)
    ) + pivot

    # loss = -log(prob)
    return -log_agree[:, 0]
def compute_pairwise_term_neighbor(mask_logits, mask_logits_neighbor, pairwise_size, pairwise_dilation):
    """
    Negative log-probability that a pixel of ``mask_logits_neighbor`` agrees
    with each pixel in the corresponding window of ``mask_logits``.

    Args:
        mask_logits (Tensor): 4-D logits that are unfolded into windows
            (centre included).
        mask_logits_neighbor (Tensor): logits supplying the per-pixel
            reference probabilities.
        pairwise_size (int): window size passed to ``unfold_w_center``.
        pairwise_dilation (int): window dilation.

    Returns:
        Tensor: ``-log P(same label)`` for channel 0.
    """
    assert mask_logits.dim() == 4

    ref_fg_log = F.logsigmoid(mask_logits_neighbor)
    ref_bg_log = F.logsigmoid(-mask_logits_neighbor)

    fg_log = F.logsigmoid(mask_logits)
    bg_log = F.logsigmoid(-mask_logits)

    fg_log_win = unfold_w_center(
        fg_log, kernel_size=pairwise_size, dilation=pairwise_dilation
    )
    bg_log_win = unfold_w_center(
        bg_log, kernel_size=pairwise_size, dilation=pairwise_dilation
    )

    # P(same) = p_i * p_j + (1 - p_i) * (1 - p_j); both products are kept in
    # log space and combined with a max-shifted log-sum-exp for stability.
    both_fg = ref_fg_log[:, :, None] + fg_log_win
    both_bg = ref_bg_log[:, :, None] + bg_log_win
    pivot = torch.max(both_fg, both_bg)
    log_agree = torch.log(
        torch.exp(both_fg - pivot) + torch.exp(both_bg - pivot)
    ) + pivot

    return -log_agree[:, 0]
def dice_coefficient(x, target):
    """
    Per-instance dice loss with squared terms in the denominator.

    Both inputs are flattened to (n_inst, -1), where n_inst is ``x.size(0)``.

    Returns:
        Tensor: ``1 - 2 * <x, t> / (|x|^2 + |t|^2 + 1e-5)`` of shape (n_inst,).
    """
    eps = 1e-5
    rows = x.size(0)
    pred_flat = x.reshape(rows, -1)
    gt_flat = target.reshape(rows, -1)

    overlap = (pred_flat * gt_flat).sum(dim=1)
    denom = (pred_flat ** 2.0).sum(dim=1) + (gt_flat ** 2.0).sum(dim=1) + eps
    return 1. - (2 * overlap / denom)
def compute_project_term(mask_scores, gt_bitmasks):
    """
    Projection loss: dice loss between the max-projections of the prediction
    and of the target along dim 2 and along dim 3.

    Returns:
        Tensor: scalar mean over instances of the summed per-axis dice losses.
    """
    def _project(tensor, axis):
        # Max over one axis, keeping it as a singleton.
        return tensor.max(dim=axis, keepdim=True)[0]

    loss_dim2 = dice_coefficient(_project(mask_scores, 2), _project(gt_bitmasks, 2))
    loss_dim3 = dice_coefficient(_project(mask_scores, 3), _project(gt_bitmasks, 3))
    return (loss_dim3 + loss_dim2).mean()
def dice_loss(
        inputs: torch.Tensor,
        targets: torch.Tensor,
        num_masks: float,
    ):
    """
    Compute the DICE loss, similar to generalized IOU for masks.

    Args:
        inputs: A float tensor of logits; a sigmoid is applied and every
            dimension after the first is flattened.
        targets: A float tensor matching the flattened ``inputs``. Stores the
            binary classification label for each element
            (0 for the negative class and 1 for the positive class).
        num_masks: Normalizer the summed loss is divided by.
    """
    probs = inputs.sigmoid().flatten(1)
    overlap = 2 * (probs * targets).sum(-1)
    total = probs.sum(-1) + targets.sum(-1)
    # The +1 terms smooth the ratio.
    per_mask = 1 - (overlap + 1) / (total + 1)
    return per_mask.sum() / num_masks


dice_loss_jit = torch.jit.script(
    dice_loss
)  # type: torch.jit.ScriptModule
def sigmoid_ce_loss(
        inputs: torch.Tensor,
        targets: torch.Tensor,
        num_masks: float,
    ):
    """
    Binary cross-entropy on logits, averaged over dim 1 and summed over masks.

    Args:
        inputs: A float tensor of logits.
        targets: A float tensor with the same shape as inputs. Stores the
            binary classification label for each element in inputs
            (0 for the negative class and 1 for the positive class).
        num_masks: Normalizer the summed loss is divided by.
    Returns:
        Loss tensor
    """
    per_element = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    per_mask = per_element.mean(1)
    return per_mask.sum() / num_masks


sigmoid_ce_loss_jit = torch.jit.script(
    sigmoid_ce_loss
)  # type: torch.jit.ScriptModule
def calculate_uncertainty(logits):
    """
    Estimate uncertainty as the negated L1 distance between 0.0 and the
    logit prediction in ``logits``.

    Args:
        logits (Tensor): A tensor of shape (R, 1, ...) holding logits; the
            second dimension must be 1.
    Returns:
        scores (Tensor): A new tensor of the same shape with ``-|logits|``,
            so the locations closest to 0 have the highest score.
    """
    assert logits.shape[1] == 1
    # Work on a copy so the caller's tensor is never aliased.
    return -(logits.clone().abs())
class VideoSetCriterion(nn.Module):
"""This class computes the loss for DETR.
The process happens in two steps:
1) we compute hungarian assignment between ground truth boxes and the outputs of the model
2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
"""
def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses,
             num_points, oversample_ratio, importance_sample_ratio):
    """Create the criterion.
    Parameters:
        num_classes: number of object categories, omitting the special no-object category
        matcher: module able to compute a matching between targets and proposals
        weight_dict: dict containing as key the names of the losses and as values their relative weight.
        eos_coef: relative classification weight applied to the no-object category
        losses: list of all the losses to be applied. See get_loss for list of available losses.
        num_points, oversample_ratio, importance_sample_ratio: stored for the
            point-sampling mask loss.
    """
    super().__init__()
    self.num_classes = num_classes
    self.matcher = matcher
    self.weight_dict = weight_dict
    self.eos_coef = eos_coef
    self.losses = losses

    # Per-class weights: 1 everywhere, eos_coef for the trailing no-object slot.
    class_weights = torch.ones(num_classes + 1)
    class_weights[-1] = eos_coef
    self.register_buffer("empty_weight", class_weights)

    # pointwise mask loss parameters
    self.num_points = num_points
    self.oversample_ratio = oversample_ratio
    self.importance_sample_ratio = importance_sample_ratio

    # Call counter and ramp length used for the warm-up factor.
    self._warmup_iters = 2000
    self.register_buffer("_iter", torch.zeros([1]))
def loss_labels(self, outputs, targets, indices, num_masks):
    """Classification loss (weighted cross-entropy).
    targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]

    Unmatched queries are assigned the no-object class ``self.num_classes``.
    Returns a dict with the single key "loss_ce"; ``num_masks`` is unused.
    """
    assert "pred_logits" in outputs
    logits = outputs["pred_logits"].float()

    matched_queries = self._get_src_permutation_idx(indices)
    matched_labels = torch.cat(
        [tgt["labels"][tgt_ids] for tgt, (_, tgt_ids) in zip(targets, indices)]
    )

    # Start from "no object" everywhere, then fill in the matched queries.
    labels = torch.full(
        logits.shape[:2], self.num_classes, dtype=torch.int64, device=logits.device
    )
    labels[matched_queries] = matched_labels

    # cross_entropy expects the class dimension second.
    return {"loss_ce": F.cross_entropy(logits.transpose(1, 2), labels, self.empty_weight)}
def loss_masks(self, outputs, targets, indices, num_masks):
    """Point-sampled mask losses: sigmoid cross-entropy and dice.
    targets dicts must contain the key "masks"; the matched masks are
    concatenated and cast to the dtype/device of the predictions.

    Returns a dict with the keys "loss_mask" and "loss_dice".
    """
    assert "pred_masks" in outputs

    matched_queries = self._get_src_permutation_idx(indices)
    pred = outputs["pred_masks"][matched_queries]
    # Modified to handle video
    gt = torch.cat([tgt['masks'][ids] for tgt, (_, ids) in zip(targets, indices)]).to(pred)

    # No need to upsample predictions as we are using normalized coordinates :)
    # NT x 1 x H x W
    pred = pred.flatten(0, 1)[:, None]
    gt = gt.flatten(0, 1)[:, None]

    with torch.no_grad():
        # Pick the sample locations; no gradient flows through the choice.
        coords = get_uncertain_point_coords_with_randomness(
            pred,
            calculate_uncertainty,
            self.num_points,
            self.oversample_ratio,
            self.importance_sample_ratio,
        )
        # Ground-truth labels at the sampled locations.
        point_labels = point_sample(gt, coords, align_corners=False).squeeze(1)

    point_logits = point_sample(pred, coords, align_corners=False).squeeze(1)

    return {
        "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks),
        "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks),
    }
def topk_mask(self, images_lab_sim, k):
    """Keep the ``k`` largest entries along dim 1 and zero out the rest.

    Returns a new tensor with the shape of ``images_lab_sim``.
    """
    top_values, top_positions = torch.topk(images_lab_sim, k, dim=1)
    return torch.zeros_like(images_lab_sim).scatter(1, top_positions, top_values)
def loss_masks_proj(self, outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2, images_lab_sim_nei3, images_lab_sim_nei4):
    """Mask losses: projection, in-frame pairwise and cross-frame pairwise.

    targets dicts must contain the key "masks". Every similarity argument is
    a sequence of tensors that is concatenated along dim 0. The code is
    written for clips of exactly 5 frames (frame pairs 0-1, 1-2, 2-3, 3-4
    and 4-0).

    Each call increments ``self._iter``, which drives the linear warm-up of
    the cross-frame term. ``num_masks`` is unused.

    Returns a dict with the keys "loss_mask", "loss_bound" and
    "loss_bound_neighbor".
    """
    assert "pred_masks" in outputs

    self._iter += 1

    matched_queries = self._get_src_permutation_idx(indices)
    pred = outputs["pred_masks"][matched_queries]
    # Modified to handle video
    gt = torch.cat([tgt['masks'][ids] for tgt, (_, ids) in zip(targets, indices)]).to(pred)

    intra_sim = torch.cat(images_lab_sim, dim=0)
    inter_sims = [
        torch.cat(sim, dim=0)
        for sim in (
            images_lab_sim_nei,
            images_lab_sim_nei1,
            images_lab_sim_nei2,
            images_lab_sim_nei3,
            images_lab_sim_nei4,
        )
    ]

    # Regroup the in-frame similarity per clip using the target frame count.
    intra_sim = intra_sim.view(
        -1, gt.shape[1], intra_sim.shape[-3], intra_sim.shape[-2], intra_sim.shape[-1]
    )
    inter_sims = [sim.unsqueeze(1) for sim in inter_sims]

    batch_ids = matched_queries[0].tolist()
    if len(batch_ids) > 0:
        # Repeat each clip's similarity once per matched instance.
        intra_sim = torch.cat([intra_sim[b][None] for b in batch_ids]).flatten(0, 1)
        inter_sims = [
            self.topk_mask(torch.cat([sim[b][None] for b in batch_ids]).flatten(0, 1), 5)
            for sim in inter_sims
        ]

    k_size = 3

    if pred.shape[0] > 0:
        # Frame t is compared against frame (t + 1) % 5.
        frame_pairs = ((0, 1), (1, 2), (2, 3), (3, 4), (4, 0))
        neighbor_terms = [
            compute_pairwise_term_neighbor(
                pred[:, first:first + 1], pred[:, second:second + 1], k_size, 3
            )
            for first, second in frame_pairs
        ]

    pred = pred.flatten(0, 1)[:, None]
    gt = gt.flatten(0, 1)[:, None]
    gt = F.interpolate(gt, (pred.shape[-2], pred.shape[-1]), mode='bilinear')

    if pred.shape[0] > 0:
        loss_prj_term = compute_project_term(pred.sigmoid(), gt)

        intra_terms = compute_pairwise_term(pred, k_size, 2)
        intra_weights = (intra_sim >= 0.3).float() * gt.float()

        # Per-instance sum of the target over the 5 frames, thresholded at 1.
        covered = gt.reshape(
            neighbor_terms[0].shape[0], 5, gt.shape[-2], gt.shape[-1]
        ).sum(dim=1, keepdim=True)
        covered = (covered >= 1.0).float()
        inter_weights = [(sim >= 0.05).float() * covered for sim in inter_sims]

        warmup_factor = min(self._iter.item() / float(self._warmup_iters), 1.0)

        loss_pairwise = (intra_terms * intra_weights).sum() / intra_weights.sum().clamp(min=1.0)
        neighbor_losses = [
            (term * weight).sum() / weight.sum().clamp(min=1.0) * warmup_factor
            for term, weight in zip(neighbor_terms, inter_weights)
        ]
    else:
        # Nothing matched: emit zeros that stay attached to the graph.
        loss_prj_term = pred.sum() * 0.
        loss_pairwise = pred.sum() * 0.
        neighbor_losses = [pred.sum() * 0. for _ in range(5)]

    neighbor_total = neighbor_losses[0]
    for extra in neighbor_losses[1:]:
        neighbor_total = neighbor_total + extra

    return {
        "loss_mask": loss_prj_term,
        "loss_bound": loss_pairwise,
        "loss_bound_neighbor": neighbor_total * 0.1,
    }
def _get_src_permutation_idx(self, indices):
    """Flatten per-batch matches into (batch index, prediction index) tensors.

    ``indices`` is a sequence of (src, tgt) index tensors, one pair per
    batch element; only the ``src`` halves are used.
    """
    batch_parts = []
    src_parts = []
    for batch_pos, (src, _) in enumerate(indices):
        batch_parts.append(torch.full_like(src, batch_pos))
        src_parts.append(src)
    return torch.cat(batch_parts), torch.cat(src_parts)
def _get_tgt_permutation_idx(self, indices):
    """Flatten per-batch matches into (batch index, target index) tensors.

    ``indices`` is a sequence of (src, tgt) index tensors, one pair per
    batch element; only the ``tgt`` halves are used.
    """
    batch_parts = []
    tgt_parts = []
    for batch_pos, (_, tgt) in enumerate(indices):
        batch_parts.append(torch.full_like(tgt, batch_pos))
        tgt_parts.append(tgt)
    return torch.cat(batch_parts), torch.cat(tgt_parts)
def get_loss(self, loss, outputs, targets, indices, num_masks, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2, images_lab_sim_nei3, images_lab_sim_nei4):
    """Dispatch to the loss named ``loss`` ('labels' or 'masks').

    Only the 'masks' loss receives the similarity tensors. An unknown name
    fails an assertion.
    """
    loss_map = {
        'labels': self.loss_labels,
        'masks': self.loss_masks_proj,
    }
    assert loss in loss_map, f"do you really want to compute {loss} loss?"

    handler = loss_map[loss]
    if loss != 'masks':
        return handler(outputs, targets, indices, num_masks)
    return handler(
        outputs, targets, indices, num_masks,
        images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1,
        images_lab_sim_nei2, images_lab_sim_nei3, images_lab_sim_nei4,
    )
def forward(self, outputs, targets, images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1, images_lab_sim_nei2, images_lab_sim_nei3, images_lab_sim_nei4):
    """Compute every configured loss for the final and auxiliary predictions.

    Parameters:
        outputs: dict of tensors, see the output specification of the model
            for the format. An optional ``"aux_outputs"`` entry holds one
            prediction dict per intermediate layer.
        targets: list of dicts, such that len(targets) == batch_size.
            The expected keys in each dict depend on the losses applied,
            see each loss' doc.
        images_lab_sim, images_lab_sim_nei, images_lab_sim_nei1..4: forwarded
            untouched to ``get_loss`` for every loss.

    Returns:
        Dict of loss name -> value. Losses computed on the i-th auxiliary
        output carry the suffix ``_{i}``.
    """
    similarity_maps = (
        images_lab_sim,
        images_lab_sim_nei,
        images_lab_sim_nei1,
        images_lab_sim_nei2,
        images_lab_sim_nei3,
        images_lab_sim_nei4,
    )

    # Match the last layer's predictions against the targets.
    final_outputs = {name: value for name, value in outputs.items() if name != "aux_outputs"}
    indices = self.matcher(final_outputs, targets)

    # Average number of target masks across all nodes, used for normalisation.
    total_targets = sum(len(t["labels"]) for t in targets)
    device = next(iter(outputs.values())).device
    num_masks = torch.as_tensor([total_targets], dtype=torch.float, device=device)
    if is_dist_avail_and_initialized():
        torch.distributed.all_reduce(num_masks)
    num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()

    # Losses on the final prediction.
    losses = {}
    for loss in self.losses:
        losses.update(self.get_loss(loss, outputs, targets, indices, num_masks, *similarity_maps))

    # Auxiliary losses: re-match and repeat for each intermediate layer.
    if "aux_outputs" in outputs:
        for i, aux_outputs in enumerate(outputs["aux_outputs"]):
            indices = self.matcher(aux_outputs, targets)
            for loss in self.losses:
                layer_losses = self.get_loss(loss, aux_outputs, targets, indices, num_masks, *similarity_maps)
                losses.update({k + f"_{i}": v for k, v in layer_losses.items()})

    return losses
def __repr__(self):
head = "Criterion " + self.__class__.__name__
body = [
"matcher: {}".format(self.matcher.__repr__(_repr_indent=8)),
"losses: {}".format(self.losses),
"weight_dict: {}".format(self.weight_dict),
"num_classes: {}".format(self.num_classes),
"eos_coef: {}".format(self.eos_coef),
"num_points: {}".format(self.num_points),
"oversample_ratio: {}".format(self.oversample_ratio),
"importance_sample_ratio: {}".format(self.importance_sample_ratio),
]
_repr_indent = 4
lines = [head] + [" " * _repr_indent + line for line in body]
return "\n".join(lines)
================================================
FILE: mfvis_nococo/mask2former_video/modeling/matcher.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
"""
Modules to compute the matching cost and solve the corresponding LSAP.
"""
import torch
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from torch import nn
from torch.cuda.amp import autocast
from detectron2.projects.point_rend.point_features import point_sample
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
    """Replace every non-empty mask by its filled bounding box.

    Despite the name, no box coordinates are returned: each 2-D slice gets the
    tight axis-aligned rectangle around its non-zero pixels set to 1.0.

    Args:
        masks (Tensor[N, T, H, W]): masks to transform; the first two
            dimensions are flattened and every (H, W) slice is processed
            independently.

    Returns:
        Tensor[N, T, H, W]: the masks with each bounding rectangle filled.
        All-zero slices are left untouched, and an input with no elements is
        returned as is.

    NOTE(review): ``flatten(0, 1)`` may return a view of the input, in which
    case the rectangles are written into the caller's tensor - confirm that
    callers are fine with that.
    """
    if masks.numel() == 0:
        return masks

    num_instances = masks.shape[0]
    frames = masks.flatten(0, 1)
    for frame_idx, frame in enumerate(frames):
        rows, cols = torch.where(frame != 0)
        if rows.numel() == 0 or cols.numel() == 0:
            # Nothing marked in this slice: there is no box to draw.
            continue
        top, bottom = torch.min(rows), torch.max(rows)
        left, right = torch.min(cols), torch.max(cols)
        frames[frame_idx, top:bottom + 1, left:right + 1] = 1.0

    return frames.view(num_instances, -1, frames.shape[-2], frames.shape[-1])
def masks_to_boxes_new(masks: torch.Tensor) -> torch.Tensor:
    """Replace every mask by its filled bounding box, without a Python loop.

    Despite the name, no box coordinates are returned: the result is a mask
    tensor in which the tight axis-aligned rectangle around the non-zero
    pixels of each 2-D slice is 1.0 and everything else is 0.0.

    Args:
        masks (Tensor[N, T, H, W]): binary (0/1) float masks; the first two
            dimensions are flattened and every (H, W) slice is processed
            independently.

    Returns:
        Tensor[N, T, H, W]: float tensor holding the filled rectangles. A
        slice without any foreground pixel yields an all-zero slice. An input
        with no elements is returned as is.
    """
    if masks.numel() == 0:
        return masks

    n, _, h, w = masks.shape
    masks = masks.flatten(0, 1)
    occupied = masks.bool()

    # Coordinate ramps, broadcast against the (N*T, H, W) masks below.
    rows = torch.arange(0, h, dtype=torch.float, device=masks.device)
    cols = torch.arange(0, w, dtype=torch.float, device=masks.device)

    # Largest / smallest column index covered by each mask. Background pixels
    # are pushed to a huge value so they never win the minimum.
    x_mask = masks * cols.view(1, 1, w)
    x_max = x_mask.flatten(1).max(-1)[0] + 1
    x_min = x_mask.masked_fill(~occupied, 1e8).flatten(1).min(-1)[0]

    # Same for the row index.
    y_mask = masks * rows.view(1, h, 1)
    y_max = y_mask.flatten(1).max(-1)[0] + 1
    y_min = y_mask.masked_fill(~occupied, 1e8).flatten(1).min(-1)[0]

    # A pixel is outside the box when its row or its column falls outside the
    # half-open ranges [y_min, y_max) / [x_min, x_max).
    row_ids = rows.unsqueeze(0)
    col_ids = cols.unsqueeze(0)
    outside_rows = torch.logical_or(row_ids < y_min[:, None], row_ids >= y_max[:, None])
    outside_cols = torch.logical_or(col_ids < x_min[:, None], col_ids >= x_max[:, None])
    outside = torch.logical_or(outside_rows.unsqueeze(2), outside_cols.unsqueeze(1)).float()

    return 1.0 - outside.view(n, -1, h, w)
def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
    """
    Pairwise DICE loss between every prediction and every target.

    Args:
        inputs: A float tensor of logits, one prediction per row once
                flattened from dimension 1 onwards. A sigmoid is applied here.
        targets: A float tensor with one target per row and the same number
                 of columns as the flattened inputs
                 (0 for the negative class and 1 for the positive class).

    Returns:
        Tensor of shape [num_inputs, num_targets] with the loss of each pair.
    """
    probs = inputs.sigmoid().flatten(1)
    overlap = torch.einsum("nc,mc->nm", probs, targets)
    # Broadcast the per-prediction and per-target sums into a full pair grid.
    total_mass = probs.sum(-1).unsqueeze(1) + targets.sum(-1).unsqueeze(0)
    # The +1 terms smooth the ratio when both sums are small.
    return 1 - (2 * overlap + 1) / (total_mass + 1)
def batch_dice_loss_nosig(inputs: torch.Tensor, targets: torch.Tensor):
    """
    Pairwise DICE loss between every prediction and every target, computed on
    the inputs as given (no sigmoid is applied).

    Args:
        inputs: A float tensor with one prediction per row once flattened
                from dimension 1 onwards.
        targets: A float tensor with one target per row and the same number
                 of columns as the flattened inputs
                 (0 for the negative class and 1 for the positive class).

    Returns:
        Tensor of shape [num_inputs, num_targets] with the loss of each pair.
    """
    flat = inputs.flatten(1)
    overlap = torch.einsum("nc,mc->nm", flat, targets)
    # Broadcast the per-prediction and per-target sums into a full pair grid.
    total_mass = flat.sum(-1).unsqueeze(1) + targets.sum(-1).unsqueeze(0)
    # The +1 terms smooth the ratio when both sums are small.
    return 1 - (2 * overlap + 1) / (total_mass + 1)
# TorchScript-compiled version of batch_dice_loss (applies a sigmoid to its inputs).
batch_dice_loss_jit = torch.jit.script(
    batch_dice_loss
) # type: torch.jit.ScriptModule
# TorchScript-compiled version of batch_dice_loss_nosig (expects inputs that need
# no sigmoid); this is the variant VideoHungarianMatcher calls.
batch_dice_loss_jit_nosig = torch.jit.script(
    batch_dice_loss_nosig
) # type: torch.jit.ScriptModule
def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
    """
    Pairwise sigmoid cross-entropy between every prediction and every target.

    Args:
        inputs: A float tensor of logits with one prediction per row.
        targets: A float tensor with one target per row and the same number
                 of columns as inputs
                 (0 for the negative class and 1 for the positive class).
    Returns:
        Loss tensor of shape [num_inputs, num_targets], averaged over the
        second dimension of inputs.
    """
    num_points = inputs.shape[1]

    # Per-element cost of labelling each location positive / negative.
    cost_if_fg = F.binary_cross_entropy_with_logits(
        inputs, torch.ones_like(inputs), reduction="none"
    )
    cost_if_bg = F.binary_cross_entropy_with_logits(
        inputs, torch.zeros_like(inputs), reduction="none"
    )

    # Select the matching cost per location for every (prediction, target) pair.
    fg_term = torch.einsum("nc,mc->nm", cost_if_fg, targets)
    bg_term = torch.einsum("nc,mc->nm", cost_if_bg, (1 - targets))

    return (fg_term + bg_term) / num_points
# TorchScript-compiled version of batch_sigmoid_ce_loss.
batch_sigmoid_ce_loss_jit = torch.jit.script(
    batch_sigmoid_ce_loss
) # type: torch.jit.ScriptModule
class VideoHungarianMatcher(nn.Module):
    """Assign predictions to ground-truth instances with a 1-to-1 matching.

    The targets do not include a "no object" entry, so there are in general
    more predictions than targets. The best predictions are matched 1-to-1
    and the remaining ones stay un-matched (and are thus treated as
    non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0):
        """Creates the matcher

        Params:
            cost_class: relative weight of the classification error in the matching cost
            cost_mask: relative weight of the binary-mask loss in the matching cost;
                it is stored and reported by ``__repr__`` but does not enter the
                cost computed by ``memory_efficient_forward``
            cost_dice: relative weight of the dice loss of the binary mask in the matching cost
            num_points: number of random points sampled per clip to compare masks
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_mask = cost_mask
        self.cost_dice = cost_dice

        all_zero = cost_class == 0 and cost_mask == 0 and cost_dice == 0
        assert not all_zero, "all costs cant be 0"

        self.num_points = num_points

    @torch.no_grad()
    def memory_efficient_forward(self, outputs, targets):
        """Match one batch element at a time on box-filled, point-sampled masks."""
        pred_logits = outputs["pred_logits"]
        batch_size, num_queries = pred_logits.shape[:2]

        assignments = []
        for clip_idx in range(batch_size):
            # [num_queries, num_classes]
            class_prob = pred_logits[clip_idx].softmax(-1)
            gt_labels = targets[clip_idx]["labels"]

            # Classification cost. Contrary to the loss, the NLL is approximated
            # by 1 - proba[target class]; the constant 1 does not change the
            # matching and is omitted.
            class_cost = -class_prob[:, gt_labels]

            # [num_queries, T, H_pred, W_pred]: threshold the predictions and
            # turn them into filled boxes.
            pred_boxes = masks_to_boxes_new(
                (outputs["pred_masks"][clip_idx].sigmoid() > 0.5).float()
            )
            # [num_gts, T, H_pred, W_pred]: gt masks are already padded when
            # preparing targets.
            gt_boxes = masks_to_boxes(targets[clip_idx]["masks"].to(pred_boxes))

            # All masks share the same set of points for efficient matching.
            coords = torch.rand(1, self.num_points, 2, device=pred_boxes.device)

            gt_samples = point_sample(
                gt_boxes,
                coords.repeat(gt_boxes.shape[0], 1, 1),
                align_corners=False,
            ).flatten(1)
            pred_samples = point_sample(
                pred_boxes,
                coords.repeat(pred_boxes.shape[0], 1, 1),
                align_corners=False,
            ).flatten(1)

            # Evaluate the dice cost in full precision.
            with autocast(enabled=False):
                dice_cost = batch_dice_loss_jit_nosig(pred_samples.float(), gt_samples.float())

            total_cost = self.cost_class * class_cost + self.cost_dice * dice_cost
            total_cost = total_cost.reshape(num_queries, -1).cpu()
            assignments.append(linear_sum_assignment(total_cost))

        matched = []
        for pred_ids, gt_ids in assignments:
            matched.append(
                (torch.as_tensor(pred_ids, dtype=torch.int64), torch.as_tensor(gt_ids, dtype=torch.int64))
            )
        return matched

    @torch.no_grad()
    def forward(self, outputs, targets):
        """Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_masks": Tensor with the predicted masks, indexed first by batch element
            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "masks": Tensor containing the target masks, one entry per ground-truth object

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        return self.memory_efficient_forward(outputs, targets)

    def __repr__(self, _repr_indent=4):
        """Return a header line followed by one indented line per cost weight."""
        pad = " " * _repr_indent
        weights = (
            ("cost_class", self.cost_class),
            ("cost_mask", self.cost_mask),
            ("cost_dice", self.cost_dice),
        )
        lines = ["Matcher " + self.__class__.__name__]
        lines.extend(f"{pad}{label}: {value}" for label, value in weights)
        return "\n".join(lines)
================================================
FILE: mfvis_nococo/mask2former_video/modeling/transformer_decoder/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder
================================================
FILE: mfvis_nococo/mask2former_video/modeling/transformer_decoder/position_encoding.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
"""
Various positional encodings for the transformer.
"""
import math
import torch
from torch import nn
class PositionEmbeddingSine3D(nn.Module):
    """
    Sinusoidal position embedding over (time, height, width), in the spirit of
    the one used by the Attention is all you need paper, generalized to work on
    video clips.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def _frequencies(self, count, device):
        """Return ``count`` divisors; consecutive pairs share the same value."""
        steps = torch.arange(count, dtype=torch.float32, device=device)
        return self.temperature ** (2 * (steps // 2) / count)

    @staticmethod
    def _sin_cos(angles):
        """Interleave sin of the even channels with cos of the odd channels."""
        pair = (angles[:, :, :, :, 0::2].sin(), angles[:, :, :, :, 1::2].cos())
        return torch.stack(pair, dim=5).flatten(4)

    def forward(self, x, mask=None):
        """Compute the embedding for ``x`` of layout (b, t, c, h, w).

        Args:
            x: 5-dimensional tensor; only its sizes and device are used.
            mask: optional boolean tensor of shape (b, t, h, w); positions set
                to True do not advance the cumulative position counters. When
                omitted, nothing is masked.

        Returns:
            Tensor of shape (b, t, 2 * num_pos_feats, h, w).
        """
        assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead"

        if mask is None:
            mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool)
        valid = ~mask

        # 1-based running position along time, height and width.
        z_embed = valid.cumsum(1, dtype=torch.float32)
        y_embed = valid.cumsum(2, dtype=torch.float32)
        x_embed = valid.cumsum(3, dtype=torch.float32)

        if self.normalize:
            # Rescale each axis so its last position maps to (almost) self.scale.
            eps = 1e-6
            z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale
            y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale

        # The spatial axes get num_pos_feats channels each; time gets twice
        # as many so it can be added onto their concatenation.
        spatial_freq = self._frequencies(self.num_pos_feats, x.device)
        temporal_freq = self._frequencies(self.num_pos_feats * 2, x.device)

        pos_x = self._sin_cos(x_embed[:, :, :, :, None] / spatial_freq)
        pos_y = self._sin_cos(y_embed[:, :, :, :, None] / spatial_freq)
        pos_z = self._sin_cos(z_embed[:, :, :, :, None] / temporal_freq)

        combined = torch.cat((pos_y, pos_x), dim=4) + pos_z
        return combined.permute(0, 1, 4, 2, 3)  # b, t, c, h, w
================================================
FILE: mfvis_nococo/mask2former_video/modeling/transformer_decoder/video_mask2former_transformer_decoder.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
import logging
import fvcore.nn.weight_init as weight_init
from typing import Optional
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import Conv2d
from mask2former.modeling.transformer_decoder.maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY
from .position_encoding import PositionEmbeddingSine3D
class SelfAttentionLayer(nn.Module):
    """Residual multi-head self-attention block with pre- or post-LayerNorm."""

    def __init__(self, d_model, nhead, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        # Resolved and stored, but not used by the forward paths of this layer.
        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add the positional embedding when one is given."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt,
                     tgt_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        """Attention, residual add, then LayerNorm."""
        attn_input = self.with_pos_embed(tgt, query_pos)
        attended = self.self_attn(attn_input, attn_input, value=tgt, attn_mask=tgt_mask,
                                  key_padding_mask=tgt_key_padding_mask)[0]
        return self.norm(tgt + self.dropout(attended))

    def forward_pre(self, tgt,
                    tgt_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        """LayerNorm, attention on the normalised input, then residual add."""
        normed = self.norm(tgt)
        attn_input = self.with_pos_embed(normed, query_pos)
        attended = self.self_attn(attn_input, attn_input, value=normed, attn_mask=tgt_mask,
                                  key_padding_mask=tgt_key_padding_mask)[0]
        return tgt + self.dropout(attended)

    def forward(self, tgt,
                tgt_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Run the pre-norm or post-norm variant according to ``normalize_before``."""
        branch = self.forward_pre if self.normalize_before else self.forward_post
        return branch(tgt, tgt_mask, tgt_key_padding_mask, query_pos)
class CrossAttentionLayer(nn.Module):
    """Residual multi-head cross-attention block with pre- or post-LayerNorm."""

    def __init__(self, d_model, nhead, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        # Resolved and stored, but not used by the forward paths of this layer.
        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add the positional embedding when one is given."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt, memory,
                     memory_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        """Attend from ``tgt`` to ``memory``, residual add, then LayerNorm."""
        queries = self.with_pos_embed(tgt, query_pos)
        keys = self.with_pos_embed(memory, pos)
        attended = self.multihead_attn(query=queries,
                                       key=keys,
                                       value=memory, attn_mask=memory_mask,
                                       key_padding_mask=memory_key_padding_mask)[0]
        return self.norm(tgt + self.dropout(attended))

    def forward_pre(self, tgt, memory,
                    memory_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        """LayerNorm on ``tgt``, attend to ``memory``, then residual add."""
        queries = self.with_pos_embed(self.norm(tgt), query_pos)
        keys = self.with_pos_embed(memory, pos)
        attended = self.multihead_attn(query=queries,
                                       key=keys,
                                       value=memory, attn_mask=memory_mask,
                                       key_padding_mask=memory_key_padding_mask)[0]
        return tgt + self.dropout(attended)

    def forward(self, tgt, memory,
                memory_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Run the pre-norm or post-norm variant according to ``normalize_before``."""
        branch = self.forward_pre if self.normalize_before else self.forward_post
        return branch(tgt, memory, memory_mask,
                      memory_key_padding_mask, pos, query_pos)
class FFNLayer(nn.Module):
    """Residual two-layer feed-forward block with pre- or post-LayerNorm."""

    def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
                 activation="relu", normalize_before=False):
        super().__init__()
        # Feed-forward stack: expand to dim_feedforward, project back to d_model.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm = nn.LayerNorm(d_model)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add the positional embedding when one is given."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, tgt):
        """Feed-forward, residual add, then LayerNorm."""
        hidden = self.dropout(self.activation(self.linear1(tgt)))
        update = self.linear2(hidden)
        return self.norm(tgt + self.dropout(update))

    def forward_pre(self, tgt):
        """LayerNorm, feed-forward on the normalised input, then residual add."""
        normed = self.norm(tgt)
        hidden = self.dropout(self.activation(self.linear1(normed)))
        update = self.linear2(hidden)
        return tgt + self.dropout(update)

    def forward(self, tgt):
        """Run the pre-norm or post-norm variant according to ``normalize_before``."""
        branch = self.forward_pre if self.normalize_before else self.forward_post
        return branch(tgt)
def _get_activation_fn(activation):
"""Return an activation function given a string"""
if activation == "relu":
return F.relu
if activation == "gelu":
return F.gelu
if activation == "glu":
return F.glu
raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
class MLP(nn.Module):
    """Plain stack of linear layers with ReLU between them (also called FFN)."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        hidden_sizes = [hidden_dim] * (num_layers - 1)
        in_sizes = [input_dim] + hidden_sizes
        out_sizes = hidden_sizes + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(in_sizes, out_sizes)
        )

    def forward(self, x):
        """Apply every layer in order; ReLU follows all but the last one."""
        last = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last:
                x = F.relu(x)
        return x
@TRANSFORMER_DECODER_REGISTRY.register()
class VideoMultiScaleMaskedTransformerDecoder(nn.Module):
    """Masked-attention transformer decoder operating on clips of frames.

    Learnable queries are refined by ``dec_layers`` rounds of
    cross-attention (restricted by the mask predicted in the previous round),
    self-attention and a feed-forward block. The three input feature levels are
    visited round-robin. After every round, and once before the first, class
    logits and per-frame masks are predicted.
    """

    _version = 2

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        # Checkpoints without version metadata, or older than version 2, may
        # still name the query features "static_query"; rename such keys in
        # place to the current "query_feat".
        # NOTE(review): the parent implementation is not called here; this
        # looks fine as long as the module owns no parameters or buffers
        # directly (they all live in child modules) - confirm.
        version = local_metadata.get("version", None)
        if version is None or version < 2:
            # Do not warn if train from scratch
            scratch = True
            logger = logging.getLogger(__name__)
            for k in list(state_dict.keys()):
                newk = k
                if "static_query" in k:
                    newk = k.replace("static_query", "query_feat")
                if newk != k:
                    state_dict[newk] = state_dict[k]
                    del state_dict[k]
                    scratch = False
            if not scratch:
                logger.warning(
                    f"Weight format of {self.__class__.__name__} have changed! "
                    "Please upgrade your models. Applying automatic conversion now ..."
                )

    @configurable
    def __init__(
        self,
        in_channels,
        mask_classification=True,
        *,
        num_classes: int,
        hidden_dim: int,
        num_queries: int,
        nheads: int,
        dim_feedforward: int,
        dec_layers: int,
        pre_norm: bool,
        mask_dim: int,
        enforce_input_project: bool,
        # video related
        num_frames,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            in_channels: channels of the input features
            mask_classification: whether to add mask classifier or not;
                must be truthy, only mask classification is supported
            num_classes: number of classes
            hidden_dim: Transformer feature dimension
            num_queries: number of queries
            nheads: number of heads
            dim_feedforward: feature dimension in feedforward network
            dec_layers: number of Transformer decoder layers
            pre_norm: whether to use pre-LayerNorm or not
            mask_dim: mask feature dimension
            enforce_input_project: add input project 1x1 conv even if input
                channels and hidden dim is identical
            num_frames: number of frames per clip; used in training mode to
                split the frame axis of the inputs into (batch, time)
        """
        super().__init__()
        assert mask_classification, "Only support mask classification model"
        self.mask_classification = mask_classification
        self.num_frames = num_frames
        # positional encoding
        N_steps = hidden_dim // 2
        self.pe_layer = PositionEmbeddingSine3D(N_steps, normalize=True)
        # define Transformer decoder here
        self.num_heads = nheads
        self.num_layers = dec_layers
        self.transformer_self_attention_layers = nn.ModuleList()
        self.transformer_cross_attention_layers = nn.ModuleList()
        self.transformer_ffn_layers = nn.ModuleList()
        # One (self-attention, cross-attention, FFN) triple per decoder layer.
        for _ in range(self.num_layers):
            self.transformer_self_attention_layers.append(
                SelfAttentionLayer(
                    d_model=hidden_dim,
                    nhead=nheads,
                    dropout=0.0,
                    normalize_before=pre_norm,
                )
            )
            self.transformer_cross_attention_layers.append(
                CrossAttentionLayer(
                    d_model=hidden_dim,
                    nhead=nheads,
                    dropout=0.0,
                    normalize_before=pre_norm,
                )
            )
            self.transformer_ffn_layers.append(
                FFNLayer(
                    d_model=hidden_dim,
                    dim_feedforward=dim_feedforward,
                    dropout=0.0,
                    normalize_before=pre_norm,
                )
            )
        self.decoder_norm = nn.LayerNorm(hidden_dim)
        self.num_queries = num_queries
        # learnable query features
        self.query_feat = nn.Embedding(num_queries, hidden_dim)
        # learnable query p.e.
        self.query_embed = nn.Embedding(num_queries, hidden_dim)
        # level embedding (we always use 3 scales)
        self.num_feature_levels = 3
        self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)
        self.input_proj = nn.ModuleList()
        # A 1x1 conv per level when the channel count differs from hidden_dim
        # (or projection is enforced); otherwise an empty nn.Sequential.
        for _ in range(self.num_feature_levels):
            if in_channels != hidden_dim or enforce_input_project:
                self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1))
                weight_init.c2_xavier_fill(self.input_proj[-1])
            else:
                self.input_proj.append(nn.Sequential())
        # output FFNs
        if self.mask_classification:
            # num_classes + 1 outputs per query.
            self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)

    @classmethod
    def from_config(cls, cfg, in_channels, mask_classification):
        """Translate the config node ``cfg`` into the keyword arguments of ``__init__``."""
        ret = {}
        ret["in_channels"] = in_channels
        ret["mask_classification"] = mask_classification
        ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
        ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
        ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
        # Transformer parameters:
        ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
        ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
        # NOTE: because we add learnable query features which requires supervision,
        # we add minus 1 to decoder layers to be consistent with our loss
        # implementation: that is, number of auxiliary losses is always
        # equal to number of decoder layers. With learnable query features, the number of
        # auxiliary losses equals number of decoders plus 1.
        assert cfg.MODEL.MASK_FORMER.DEC_LAYERS >= 1
        ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS - 1
        ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
        ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ
        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
        ret["num_frames"] = cfg.INPUT.SAMPLING_FRAME_NUM
        return ret

    def forward(self, x, mask_features, mask = None):
        """Decode class logits and per-frame masks for every query.

        Args:
            x: list of ``num_feature_levels`` (3) feature maps, each with the
                frames stacked along the first dimension.
            mask_features: 4-D tensor (frames, channels, height, width) that
                the query mask embeddings are multiplied with.
            mask: ignored; it is deleted right away.

        Returns:
            Dict with ``pred_logits`` and ``pred_masks`` taken from the last
            prediction, and ``aux_outputs`` listing all earlier predictions.
        """
        bt, c_m, h_m, w_m = mask_features.shape
        # In training mode the frame axis holds bs clips of num_frames frames;
        # otherwise all frames are treated as one single clip.
        bs = bt // self.num_frames if self.training else 1
        t = bt // bs
        mask_features = mask_features.view(bs, t, c_m, h_m, w_m)
        # x is a list of multi-scale feature
        assert len(x) == self.num_feature_levels
        src = []
        pos = []
        size_list = []
        # disable mask, it does not affect performance
        del mask
        for i in range(self.num_feature_levels):
            size_list.append(x[i].shape[-2:])
            pos.append(self.pe_layer(x[i].view(bs, t, -1, size_list[-1][0], size_list[-1][1]), None).flatten(3))
            # Project to hidden_dim and add the embedding of this level.
            src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None])
            # NTxCxHW => NxTxCxHW => (TxHW)xNxC
            _, c, hw = src[-1].shape
            pos[-1] = pos[-1].view(bs, t, c, hw).permute(1, 3, 0, 2).flatten(0, 1)
            src[-1] = src[-1].view(bs, t, c, hw).permute(1, 3, 0, 2).flatten(0, 1)
        # QxNxC
        query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
        output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)
        predictions_class = []
        predictions_mask = []
        # prediction heads on learnable query features
        outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0])
        predictions_class.append(outputs_class)
        predictions_mask.append(outputs_mask)
        for i in range(self.num_layers):
            # Feature levels are visited round-robin.
            level_index = i % self.num_feature_levels
            # A row that masks out every position is cleared, so that query
            # may attend to all positions instead of to none.
            attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False
            # attention: cross-attention first
            output = self.transformer_cross_attention_layers[i](
                output, src[level_index],
                memory_mask=attn_mask,
                memory_key_padding_mask=None,  # here we do not apply masking on padded region
                pos=pos[level_index], query_pos=query_embed
            )
            output = self.transformer_self_attention_layers[i](
                output, tgt_mask=None,
                tgt_key_padding_mask=None,
                query_pos=query_embed
            )
            # FFN
            output = self.transformer_ffn_layers[i](
                output
            )
            # The new attention mask is sized for the level the next layer uses.
            outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels])
            predictions_class.append(outputs_class)
            predictions_mask.append(outputs_mask)
        # One prediction from the raw queries plus one per decoder layer.
        assert len(predictions_class) == self.num_layers + 1
        out = {
            'pred_logits': predictions_class[-1],
            'pred_masks': predictions_mask[-1],
            'aux_outputs': self._set_aux_loss(
                predictions_class if self.mask_classification else None, predictions_mask
            )
        }
        return out

    def forward_prediction_heads(self, output, mask_features, attn_mask_target_size):
        """Predict class logits, masks and the next attention mask from the queries.

        Returns:
            Tuple ``(outputs_class, outputs_mask, attn_mask)``; ``attn_mask``
            is a detached boolean tensor resized to ``attn_mask_target_size``
            and repeated once per attention head.
        """
        decoder_output = self.decoder_norm(output)
        # Swap the first two dimensions so the "bqc" einsum below applies.
        decoder_output = decoder_output.transpose(0, 1)
        outputs_class = self.class_embed(decoder_output)
        mask_embed = self.mask_embed(decoder_output)
        outputs_mask = torch.einsum("bqc,btchw->bqthw", mask_embed, mask_features)
        b, q, t, _, _ = outputs_mask.shape
        # NOTE: prediction is of higher-resolution
        # [B, Q, T, H, W] -> [B, Q, T*H*W] -> [B, h, Q, T*H*W] -> [B*h, Q, T*HW]
        attn_mask = F.interpolate(outputs_mask.flatten(0, 1), size=attn_mask_target_size, mode="bilinear", align_corners=False).view(
            b, q, t, attn_mask_target_size[0], attn_mask_target_size[1])
        # must use bool type
        # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged.
        attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool()
        attn_mask = attn_mask.detach()
        return outputs_class, outputs_mask, attn_mask

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_seg_masks):
        """Pack every prediction except the last into a list of per-layer dicts."""
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        if self.mask_classification:
            return [
                {"pred_logits": a, "pred_masks": b}
                for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])
            ]
        else:
            return [{"pred_masks": b} for b in outputs_seg_masks[:-1]]
================================================
FILE: mfvis_nococo/mask2former_video/utils/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
================================================
FILE: mfvis_nococo/mask2former_video/utils/memory.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
from contextlib import contextmanager
from functools import wraps
import torch
from torch.cuda.amp import autocast
__all__ = ["retry_if_cuda_oom"]
@contextmanager
def _ignore_torch_cuda_oom():
"""
A context which ignores CUDA OOM exception from pytorch.
"""
try:
yield
except RuntimeError as e:
# NOTE: the string may change?
if "CUDA out of memory. " in str(e):
pass
else:
raise
def retry_if_cuda_oom(func):
    """
    Wrap ``func`` so that it is retried after PyTorch's CUDA OOM error.

    The wrapper makes up to three attempts:

    1. a plain call;
    2. a second call after ``torch.cuda.empty_cache()``;
    3. a final call with the inputs copied to CPU (and cast to float32) and
       autocast disabled. In this case, ``func`` is expected to dispatch to a
       CPU implementation. The return values may become CPU tensors as well
       and it's the user's responsibility to convert them back to CUDA
       tensors if needed.

    Args:
        func: a stateless callable that takes tensor-like objects as arguments

    Returns:
        a callable which retries `func` if OOM is encountered.

    Examples:
    ::
        output = retry_if_cuda_oom(some_torch_function)(input1, input2)
        # output may be on CPU even if inputs are on GPU

    Note:
        1. When converting inputs to CPU, it will only look at each argument and check
           if it has `.device` and `.to` for conversion. Nested structures of tensors
           are not supported.
        2. Since the function might be called more than once, it has to be
           stateless.
    """

    def _offload(value):
        # Anything without a ``.device`` is passed through untouched.
        try:
            on_gpu = value.device.type == "cuda" and hasattr(value, "to")
        except AttributeError:
            return value
        if not on_gpu:
            return value
        return value.to(device="cpu").to(torch.float32)

    @wraps(func)
    def wrapped(*args, **kwargs):
        # First a plain attempt, then one after clearing the CUDA cache.
        for attempt in range(2):
            if attempt == 1:
                torch.cuda.empty_cache()
            with _ignore_torch_cuda_oom():
                return func(*args, **kwargs)

        # Try on CPU. This slows down the code significantly, therefore print a notice.
        logger = logging.getLogger(__name__)
        logger.info("Attempting to copy inputs to CPU due to CUDA OOM")
        cpu_args = (_offload(arg) for arg in args)
        cpu_kwargs = {name: _offload(value) for name, value in kwargs.items()}

        with autocast(enabled=False):
            return func(*cpu_args, **cpu_kwargs)

    return wrapped
================================================
FILE: mfvis_nococo/mask2former_video/video_maskformer_model.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import math
from typing import Tuple
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
from detectron2.modeling.backbone import Backbone
from detectron2.modeling.postprocessing import sem_seg_postprocess
from detectron2.structures import Boxes, ImageList, Instances, BitMasks
from .modeling.criterion import VideoSetCriterion
from .modeling.matcher import VideoHungarianMatcher
from .utils.memory import retry_if_cuda_oom
from skimage import color
import cv2
import numpy as np
def unfold_wo_center(x, kernel_size, dilation):
    """Collect, for every pixel, the values of its neighbours without the pixel itself.

    Args:
        x: 4-D tensor; its sizes are read as (N, C, H, W).
        kernel_size: odd side length of the square neighbourhood.
        dilation: spacing between the neighbourhood taps.

    Returns:
        Tensor of shape (N, C, kernel_size ** 2 - 1, H, W). Positions that fall
        outside the input are filled by the zero padding of ``F.unfold``.
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    # "SAME" padding, so the unfolded grid keeps the H x W layout of the input
    pad = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    batch, channels, height, width = x.shape
    patches = F.unfold(x, kernel_size=kernel_size, padding=pad, dilation=dilation)
    patches = patches.reshape(batch, channels, -1, height, width)

    # the centre tap sits in the middle of the flattened window; cut it out
    centre = (kernel_size ** 2) // 2
    before_centre = patches[:, :, :centre]
    after_centre = patches[:, :, centre + 1:]
    return torch.cat((before_centre, after_centre), dim=2)
def unfold_w_center(x, kernel_size, dilation):
    """Collect, for every pixel, the values of its full neighbourhood (centre included).

    Args:
        x: 4-D tensor; its sizes are read as (N, C, H, W).
        kernel_size: odd side length of the square neighbourhood.
        dilation: spacing between the neighbourhood taps.

    Returns:
        Tensor of shape (N, C, kernel_size ** 2, H, W). Positions that fall
        outside the input are filled by the zero padding of ``F.unfold``.
    """
    assert x.dim() == 4
    assert kernel_size % 2 == 1

    # "SAME" padding, so the unfolded grid keeps the H x W layout of the input
    pad = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
    batch, channels, height, width = x.shape
    patches = F.unfold(x, kernel_size=kernel_size, padding=pad, dilation=dilation)
    return patches.reshape(batch, channels, -1, height, width)
def get_images_color_similarity(images, kernel_size, dilation):
    """Similarity between each pixel and the other pixels of its own neighbourhood.

    Args:
        images: 4-D tensor with a leading dimension of exactly 1.
        kernel_size: odd window side length, forwarded to ``unfold_wo_center``.
        dilation: window dilation, forwarded to ``unfold_wo_center``.

    Returns:
        ``exp(-0.5 * d)`` where ``d`` is the norm, taken over dim 1, of the
        difference between a pixel and each of its non-centre neighbours.
    """
    assert images.dim() == 4
    assert images.size(0) == 1

    neighbours = unfold_wo_center(images, kernel_size=kernel_size, dilation=dilation)
    # broadcast every pixel against its window, then reduce over the channel axis
    distance = torch.norm(images[:, :, None] - neighbours, dim=1)
    return torch.exp(-distance * 0.5)
def get_neighbor_images_color_similarity(images, images_neighbor, kernel_size, dilation):
    """Similarity between each pixel of ``images_neighbor`` and a window of ``images``.

    Args:
        images: 4-D tensor with a leading dimension of exactly 1; its windows
            (centre included) are extracted with ``unfold_w_center``.
        images_neighbor: tensor whose pixels are compared against those windows.
        kernel_size: odd window side length.
        dilation: window dilation.

    Returns:
        ``exp(-0.5 * d)`` where ``d`` is the norm, taken over dim 1, of the
        difference between a pixel of ``images_neighbor`` and each window entry
        of ``images`` at the same location.
    """
    assert images.dim() == 4
    assert images.size(0) == 1

    windows = unfold_w_center(images, kernel_size=kernel_size, dilation=dilation)
    # broadcast every neighbour-frame pixel against the window, reduce over channels
    distance = torch.norm(images_neighbor[:, :, None] - windows, dim=1)
    return torch.exp(-distance * 0.5)
def get_neighbor_images_patch_color_similarity(images, images_neighbor, kernel_size, dilation):
    """Patch-level similarity between a frame and a neighbouring frame.

    Both inputs are first expanded so that every pixel carries the values of its
    ``kernel_size`` x ``kernel_size`` window (extracted with dilation 1). These
    per-pixel patch descriptors are then compared with
    ``get_neighbor_images_color_similarity`` using a 3 x 3 window and dilation 3.

    NOTE: ``dilation`` is accepted but not used. The patch-extraction dilation (1)
    and the comparison window (3, 3) are hard-coded.

    Args:
        images: 4-D tensor with a leading dimension of exactly 1.
        images_neighbor: tensor for the neighbouring frame, processed the same way.
        kernel_size: odd side length of the patch gathered around every pixel.
        dilation: unused (see note above).

    Returns:
        The similarity tensor produced by ``get_neighbor_images_color_similarity``.
    """
    assert images.dim() == 4
    assert images.size(0) == 1

    patch_dilation = 1  # fixed; the `dilation` argument is not consulted
    patches = unfold_w_center(images, kernel_size=kernel_size, dilation=patch_dilation)
    patches_neighbor = unfold_w_center(
        images_neighbor, kernel_size=kernel_size, dilation=patch_dilation
    )

    # merge the channel axis and the window axis into one descriptor axis
    descriptors = patches.flatten(1, 2)
    descriptors_neighbor = patches_neighbor.flatten(1, 2)
    return get_neighbor_images_color_similarity(descriptors, descriptors_neighbor, 3, 3)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
@META_ARCH_REGISTRY.register()
class VideoMaskFormer(nn.Module):
    """
    Mask-classification meta-architecture that operates on video clips.

    In training mode :meth:`forward` returns a dict of weighted losses. In eval mode it
    returns the top-scoring instance masks for the first video of the batch.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        sem_seg_head: nn.Module,
        criterion: nn.Module,
        num_queries: int,
        object_mask_threshold: float,
        overlap_threshold: float,
        metadata,
        size_divisibility: int,
        sem_seg_postprocess_before_inference: bool,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
        # video
        num_frames,
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            sem_seg_head: a module that predicts segmentation outputs from backbone features
            criterion: a module that defines the loss
            num_queries: int, number of queries
            object_mask_threshold: float, stored on the module (not read by the methods
                of this class)
            overlap_threshold: float, stored on the module (not read by the methods of
                this class)
            metadata: dataset metadata, stored on the module
            size_divisibility: Some backbones require the input height and width to be divisible by a
                specific integer. A negative value means "use ``backbone.size_divisibility``".
            sem_seg_postprocess_before_inference: bool, stored on the module (not read by
                the methods of this class)
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
            num_frames: number of frames per clip; used to size the per-video ground-truth
                mask tensor in :meth:`prepare_targets`
        """
        super().__init__()
        self.backbone = backbone
        self.sem_seg_head = sem_seg_head
        self.criterion = criterion
        self.num_queries = num_queries
        self.overlap_threshold = overlap_threshold
        self.object_mask_threshold = object_mask_threshold
        self.metadata = metadata
        if size_divisibility < 0:
            # use backbone size_divisibility if not set
            size_divisibility = self.backbone.size_divisibility
        self.size_divisibility = size_divisibility
        self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
        # non-persistent buffers: they follow the module's device but are not saved in checkpoints
        self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)

        self.num_frames = num_frames

    @classmethod
    def from_config(cls, cfg):
        """Translate a config node into the keyword arguments of ``__init__``."""
        backbone = build_backbone(cfg)
        sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())

        # Loss parameters:
        deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
        no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT

        # loss weights
        class_weight = cfg.MODEL.MASK_FORMER.CLASS_WEIGHT
        dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT
        mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT

        # building criterion
        matcher = VideoHungarianMatcher(
            cost_class=class_weight,
            cost_mask=mask_weight,
            cost_dice=dice_weight,
            num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
        )

        # the two "loss_bound*" terms reuse the mask weight
        weight_dict = {
            "loss_ce": class_weight,
            "loss_mask": mask_weight,
            "loss_dice": dice_weight,
            "loss_bound": mask_weight,
            "loss_bound_neighbor": mask_weight,
        }

        if deep_supervision:
            # replicate every weight for each auxiliary decoder layer, suffixed with its index
            dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS
            aux_weight_dict = {}
            for i in range(dec_layers - 1):
                aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
            weight_dict.update(aux_weight_dict)

        losses = ["labels", "masks"]

        criterion = VideoSetCriterion(
            sem_seg_head.num_classes,
            matcher=matcher,
            weight_dict=weight_dict,
            eos_coef=no_object_weight,
            losses=losses,
            num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
            oversample_ratio=cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO,
            importance_sample_ratio=cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO,
        )

        return {
            "backbone": backbone,
            "sem_seg_head": sem_seg_head,
            "criterion": criterion,
            "num_queries": cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES,
            "object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD,
            "overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD,
            "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
            "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY,
            "sem_seg_postprocess_before_inference": True,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
            # video
            "num_frames": cfg.INPUT.SAMPLING_FRAME_NUM,
        }

    @property
    def device(self):
        """Device on which the normalization buffers (and hence the model) live."""
        return self.pixel_mean.device

    def _compute_color_similarities(self, images):
        """Compute the colour-similarity maps that are handed to the criterion.

        Args:
            images: list of un-normalized frame tensors, all clips concatenated.

        Returns:
            A pair ``(images_lab_sim, neighbor_sims)``:
            ``images_lab_sim`` has one intra-frame similarity map per frame;
            ``neighbor_sims`` is a list of five lists, where list ``k`` holds, per clip,
            the patch similarity between frame ``k`` and frame ``(k + 1) % 5`` of the clip.
        """
        k_size = 3
        # NOTE: clips are assumed to be exactly 5 frames long; other lengths fail with
        # an IndexError below. The criterion is called with five neighbour lists.
        clip_len = 5

        rs_images = ImageList.from_tensors(images, self.size_divisibility)
        # work at 1/4 resolution
        downsampled_images = F.avg_pool2d(
            rs_images.tensor.float(), kernel_size=4, stride=4, padding=0
        )
        # channel order is reversed before rgb2lab (presumably BGR -> RGB; confirm
        # against the configured input format)
        images_lab = [
            torch.as_tensor(
                color.rgb2lab(ds_image[[2, 1, 0]].byte().permute(1, 2, 0).cpu().numpy()),
                device=ds_image.device,
                dtype=torch.float32,
            ).permute(2, 0, 1)
            for ds_image in downsampled_images
        ]
        images_lab_sim = [
            get_images_color_similarity(img_lab.unsqueeze(0), k_size, 2)
            for img_lab in images_lab
        ]

        clip_starts = range(0, len(images_lab), clip_len)
        # frame k is paired with frame k + 1 inside each clip; the last frame wraps to the first
        neighbor_sims = [
            [
                get_neighbor_images_patch_color_similarity(
                    images_lab[start + offset].unsqueeze(0),
                    images_lab[start + (offset + 1) % clip_len].unsqueeze(0),
                    3,
                    3,
                )
                for start in clip_starts
            ]
            for offset in range(clip_len)
        ]
        return images_lab_sim, neighbor_sims

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list with one dict per video. Each dict contains:
                   * "image": the frames of the clip, each a Tensor
                   * "instances": per-frame ground truth (read only in training)
                   * optionally "height", "width" (int): the output resolution used in
                     inference; defaults to the un-padded input size.
        Returns:
            In training: a dict of losses, each already multiplied by its entry in
            ``criterion.weight_dict``; losses without a weight are dropped.

            In inference: the dict built by :meth:`inference_video` for the FIRST video
            of the batch only, with keys "image_size", "pred_scores", "pred_labels"
            and "pred_masks".
        """
        images = []
        for video in batched_inputs:
            for frame in video["image"]:
                images.append(frame.to(self.device))

        if self.training:
            # must run on the raw frames, before mean/std normalization
            images_lab_sim, neighbor_sims = self._compute_color_similarities(images)

        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, self.size_divisibility)

        features = self.backbone(images.tensor)
        outputs = self.sem_seg_head(features)

        if self.training:
            # mask classification target
            targets = self.prepare_targets(batched_inputs, images)

            # bipartite matching-based loss
            losses = self.criterion(outputs, targets, images_lab_sim, *neighbor_sims)

            for k in list(losses.keys()):
                if k in self.criterion.weight_dict:
                    losses[k] *= self.criterion.weight_dict[k]
                else:
                    # remove this loss if not specified in `weight_dict`
                    losses.pop(k)
            return losses
        else:
            mask_cls_results = outputs["pred_logits"]
            mask_pred_results = outputs["pred_masks"]

            mask_cls_result = mask_cls_results[0]
            # upsample masks
            mask_pred_result = retry_if_cuda_oom(F.interpolate)(
                mask_pred_results[0],
                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
                mode="bilinear",
                align_corners=False,
            )

            del outputs

            input_per_image = batched_inputs[0]
            image_size = images.image_sizes[0]  # image size without padding after data augmentation

            height = input_per_image.get("height", image_size[0])  # raw image size before data augmentation
            width = input_per_image.get("width", image_size[1])

            return retry_if_cuda_oom(self.inference_video)(
                mask_cls_result, mask_pred_result, image_size, height, width
            )

    def prepare_targets(self, targets, images):
        """Build the per-video ground-truth dicts consumed by the criterion.

        Args:
            targets: the batched inputs; ``targets[i]["instances"]`` is read as a list
                with one entry per frame.
            images: the padded ``ImageList``; only its padded height/width are used.

        Returns:
            list[dict]: one dict per video with keys "labels", "ids" and "masks",
            restricted to instances whose id differs from -1 in at least one frame.
        """
        h_pad, w_pad = images.tensor.shape[-2:]
        gt_instances = []
        for targets_per_video in targets:
            _num_instance = len(targets_per_video["instances"][0])
            mask_shape = [_num_instance, self.num_frames, h_pad, w_pad]
            gt_masks_per_video = torch.zeros(mask_shape, dtype=torch.bool, device=self.device)

            gt_ids_per_video = []
            for f_i, targets_per_frame in enumerate(targets_per_video["instances"]):
                targets_per_frame = targets_per_frame.to(self.device)
                h, w = targets_per_frame.image_size

                gt_ids_per_video.append(targets_per_frame.gt_ids[:, None])
                # masks are written into the top-left corner of the padded canvas
                gt_masks_per_video[:, f_i, :h, :w] = targets_per_frame.gt_masks.tensor

            gt_ids_per_video = torch.cat(gt_ids_per_video, dim=1)
            valid_idx = (gt_ids_per_video != -1).any(dim=-1)

            # classes are taken from the last frame visited by the loop above
            gt_classes_per_video = targets_per_frame.gt_classes[valid_idx]  # N,
            gt_ids_per_video = gt_ids_per_video[valid_idx]  # N, num_frames

            gt_instances.append({"labels": gt_classes_per_video, "ids": gt_ids_per_video})
            gt_masks_per_video = gt_masks_per_video[valid_idx].float()  # N, num_frames, H, W
            gt_instances[-1].update({"masks": gt_masks_per_video})

        return gt_instances

    def inference_video(self, pred_cls, pred_masks, img_size, output_height, output_width):
        """Select the highest-scoring (query, class) pairs and resize their masks.

        Args:
            pred_cls: per-query class logits; the last class column is discarded after
                the softmax.
            pred_masks: per-query mask logits, indexed by query on dim 0.
            img_size: (height, width) of the un-padded input; used to crop the masks.
            output_height, output_width: resolution the masks are resized to.

        Returns:
            dict with keys "image_size", "pred_scores", "pred_labels" and "pred_masks".
            At most 10 predictions are returned (fewer when fewer candidates exist).
        """
        if len(pred_cls) > 0:
            scores = F.softmax(pred_cls, dim=-1)[:, :-1]
            labels = (
                torch.arange(self.sem_seg_head.num_classes, device=self.device)
                .unsqueeze(0)
                .repeat(self.num_queries, 1)
                .flatten(0, 1)
            )
            # keep top-10 predictions; clamp k so small query/class counts do not make
            # topk raise
            flat_scores = scores.flatten(0, 1)
            num_keep = min(10, flat_scores.numel())
            scores_per_image, topk_indices = flat_scores.topk(num_keep, sorted=False)
            labels_per_image = labels[topk_indices]
            # flat (query, class) index -> query index
            topk_indices = topk_indices // self.sem_seg_head.num_classes
            pred_masks = pred_masks[topk_indices]

            # drop the padding, then resize to the requested output resolution
            pred_masks = pred_masks[:, :, : img_size[0], : img_size[1]]
            pred_masks = F.interpolate(
                pred_masks, size=(output_height, output_width), mode="bilinear", align_corners=False
            )

            masks = pred_masks > 0.

            out_scores = scores_per_image.tolist()
            out_labels = labels_per_image.tolist()
            out_masks = [m for m in masks.cpu()]
        else:
            out_scores = []
            out_labels = []
            out_masks = []

        video_output = {
            "image_size": (output_height, output_width),
            "pred_scores": out_scores,
            "pred_labels": out_labels,
            "pred_masks": out_masks,
        }

        return video_output
================================================
FILE: mfvis_nococo/scripts/eval_8gpu_mask2former_r101_video.sh
================================================
# Runs train_net_video.py in --eval-only mode on 8 GPUs with the R101 config.
# The current directory is appended to PYTHONPATH so local packages can be imported.
# NOTE(review): --resume is passed together with MODEL.WEIGHTS; confirm which
# checkpoint is actually loaded when the output directory already holds one.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
ID=159  # not referenced by the command below
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py \
    --num-gpus 8 \
    --resume \
    --dist-url tcp://0.0.0.0:12349 \
    --config-file configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml \
    --eval-only \
    MODEL.WEIGHTS ../mfvis_models/model_final_r101_0473.pth
================================================
FILE: mfvis_nococo/scripts/train_8gpu_mask2former_r101_video_coco.sh
================================================
# Launches train_net_video.py on 8 GPUs with the R101 "_coco" config, resuming if possible.
# The current directory is appended to PYTHONPATH so local packages can be imported.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
ID=159  # not referenced by the command below
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py \
    --num-gpus 8 \
    --resume \
    --dist-url tcp://0.0.0.0:12349 \
    --config-file configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep_coco.yaml
================================================
FILE: mfvis_nococo/scripts/train_8gpu_mask2former_r50_video.sh
================================================
# Launches train_net_video.py on 8 GPUs with the R50 config, resuming if possible.
# The current directory is appended to PYTHONPATH so local packages can be imported.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
ID=159  # not referenced by the command below
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py \
    --num-gpus 8 \
    --resume \
    --dist-url tcp://0.0.0.0:12349 \
    --config-file configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml
================================================
FILE: mfvis_nococo/scripts/train_8gpu_mask2former_r50_video_coco.sh
================================================
# Launches train_net_video.py on 8 GPUs with the R50 "_coco" config, resuming if possible.
# The current directory is appended to PYTHONPATH so local packages can be imported.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
ID=159  # not referenced by the command below
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py \
    --num-gpus 8 \
    --resume \
    --dist-url tcp://0.0.0.0:12349 \
    --config-file configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep_coco.yaml
================================================
FILE: mfvis_nococo/scripts/visual_video_r101.sh
================================================
# Runs demo_video/demo.py on GPU 0 with the R101 config and saves the rendered frames.
# The current directory is appended to PYTHONPATH so local packages can be imported.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
CUDA_VISIBLE_DEVICES=0 python3 demo_video/demo.py \
    --config-file configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml \
    --save-frames True \
    --input './datasets/ytvis_2019/valid/JPEGImages/' \
    --output 'box_patch_newknn_r101_vis/' \
    --opts MODEL.WEIGHTS ../mfvis_models/model_final_r101_0473.pth
================================================
FILE: mfvis_nococo/scripts/visual_video_r50.sh
================================================
# Runs demo_video/demo.py on GPU 0 with the R50 config and saves the rendered frames.
# The current directory is appended to PYTHONPATH so local packages can be imported.
# NOTE(review): the weights path starts with ./ here while the sibling R101 scripts
# use ../ -- confirm which location is intended.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
CUDA_VISIBLE_DEVICES=0 python3 demo_video/demo.py \
    --config-file configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml \
    --save-frames True \
    --input './datasets/ytvis_2019/valid/JPEGImages/' \
    --output 'box_patch_newknn_r50_vis/' \
    --opts MODEL.WEIGHTS ./mfvis_models/model_final_r50_0438.pth
================================================
FILE: mfvis_nococo/train_net_video.py
================================================
"""
This script is a simplified version of the training script in detectron2/tools.
"""
# Best-effort silencing of a noisy deprecation warning; never fatal.
try:
    # ignore ShapelyDeprecationWarning from fvcore
    from shapely.errors import ShapelyDeprecationWarning
    import warnings
    warnings.filterwarnings('ignore', category=ShapelyDeprecationWarning)
except ImportError:
    # shapely is not installed, or this shapely version does not define the
    # warning class: there is nothing to silence. Only ImportError is caught so
    # that KeyboardInterrupt / SystemExit are no longer swallowed.
    pass
import copy
import itertools
import logging
import os
from collections import OrderedDict
from typing import Any, Dict, List, Set
import torch
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog
from detectron2.engine import (
DefaultTrainer,
default_argument_parser,
default_setup,
launch,
)
from detectron2.evaluation import (
DatasetEvaluator,
inference_on_dataset,
print_csv_format,
verify_results,
)
from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler
from detectron2.solver.build import maybe_add_gradient_clipping
from detectron2.utils.logger import setup_logger
# MaskFormer
from mask2former import add_maskformer2_config
from mask2former_video import (
YTVISDatasetMapper,
YTVISEvaluator,
add_maskformer2_video_config,
build_detection_train_loader,
build_detection_test_loader,
get_detection_dataset_dicts,
)
class Trainer(DefaultTrainer):
    """
    Extension of the Trainer class adapted to MaskFormer.
    """

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """
        Create the evaluator for a given dataset.

        Always returns a ``YTVISEvaluator``. When ``output_folder`` is None,
        ``<cfg.OUTPUT_DIR>/inference`` is used and created if missing.
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
            os.makedirs(output_folder, exist_ok=True)

        return YTVISEvaluator(dataset_name, cfg, True, output_folder)

    @classmethod
    def build_train_loader(cls, cfg):
        """Build the training loader for the first dataset in ``cfg.DATASETS.TRAIN``."""
        dataset_name = cfg.DATASETS.TRAIN[0]
        mapper = YTVISDatasetMapper(cfg, is_train=True)

        dataset_dict = get_detection_dataset_dicts(
            dataset_name,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
        )

        return build_detection_train_loader(cfg, mapper=mapper, dataset=dataset_dict)

    @classmethod
    def build_test_loader(cls, cfg, dataset_name):
        """
        Build the test loader for ``dataset_name``.

        The requested dataset is honoured, so that :meth:`test` evaluates each entry
        of ``cfg.DATASETS.TEST`` on its own data. ``None`` falls back to the first
        configured test dataset.
        """
        if dataset_name is None:
            dataset_name = cfg.DATASETS.TEST[0]
        mapper = YTVISDatasetMapper(cfg, is_train=False)
        return build_detection_test_loader(cfg, dataset_name, mapper=mapper)

    @classmethod
    def build_lr_scheduler(cls, cfg, optimizer):
        """
        It now calls :func:`detectron2.solver.build_lr_scheduler`.
        Overwrite it if you'd like a different scheduler.
        """
        return build_lr_scheduler(cfg, optimizer)

    @classmethod
    def build_optimizer(cls, cfg, model):
        """
        Build the optimizer with one parameter group per trainable parameter.

        Per-parameter overrides applied on top of ``cfg.SOLVER.BASE_LR`` and
        ``cfg.SOLVER.WEIGHT_DECAY``:
          * modules whose name contains "backbone": lr scaled by ``SOLVER.BACKBONE_MULTIPLIER``
          * relative/absolute position embeddings: weight decay 0
          * normalization layers: ``SOLVER.WEIGHT_DECAY_NORM``
          * ``nn.Embedding``: ``SOLVER.WEIGHT_DECAY_EMBED``

        Raises:
            NotImplementedError: if ``cfg.SOLVER.OPTIMIZER`` is neither "SGD" nor "ADAMW".
        """
        weight_decay_norm = cfg.SOLVER.WEIGHT_DECAY_NORM
        weight_decay_embed = cfg.SOLVER.WEIGHT_DECAY_EMBED

        defaults = {}
        defaults["lr"] = cfg.SOLVER.BASE_LR
        defaults["weight_decay"] = cfg.SOLVER.WEIGHT_DECAY

        norm_module_types = (
            torch.nn.BatchNorm1d,
            torch.nn.BatchNorm2d,
            torch.nn.BatchNorm3d,
            torch.nn.SyncBatchNorm,
            # NaiveSyncBatchNorm inherits from BatchNorm2d
            torch.nn.GroupNorm,
            torch.nn.InstanceNorm1d,
            torch.nn.InstanceNorm2d,
            torch.nn.InstanceNorm3d,
            torch.nn.LayerNorm,
            torch.nn.LocalResponseNorm,
        )

        params: List[Dict[str, Any]] = []
        memo: Set[torch.nn.parameter.Parameter] = set()
        for module_name, module in model.named_modules():
            for module_param_name, value in module.named_parameters(recurse=False):
                if not value.requires_grad:
                    continue
                # Avoid duplicating parameters
                if value in memo:
                    continue
                memo.add(value)

                hyperparams = copy.copy(defaults)
                if "backbone" in module_name:
                    hyperparams["lr"] = hyperparams["lr"] * cfg.SOLVER.BACKBONE_MULTIPLIER
                if (
                    "relative_position_bias_table" in module_param_name
                    or "absolute_pos_embed" in module_param_name
                ):
                    print(module_param_name)
                    hyperparams["weight_decay"] = 0.0
                # the two checks below run last, so they win over the rules above
                if isinstance(module, norm_module_types):
                    hyperparams["weight_decay"] = weight_decay_norm
                if isinstance(module, torch.nn.Embedding):
                    hyperparams["weight_decay"] = weight_decay_embed
                params.append({"params": [value], **hyperparams})

        def maybe_add_full_model_gradient_clipping(optim):
            # detectron2 doesn't have full model gradient clipping now
            clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
            enable = (
                cfg.SOLVER.CLIP_GRADIENTS.ENABLED
                and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
                and clip_norm_val > 0.0
            )

            class FullModelGradientClippingOptimizer(optim):
                def step(self, closure=None):
                    # clip the norm over all parameters at once, then do the normal step
                    all_params = itertools.chain(*[x["params"] for x in self.param_groups])
                    torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
                    super().step(closure=closure)

            return FullModelGradientClippingOptimizer if enable else optim

        optimizer_type = cfg.SOLVER.OPTIMIZER
        if optimizer_type == "SGD":
            optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
                params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
            )
        elif optimizer_type == "ADAMW":
            optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
                params, cfg.SOLVER.BASE_LR
            )
        else:
            raise NotImplementedError(f"no optimizer type {optimizer_type}")
        if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
            optimizer = maybe_add_gradient_clipping(cfg, optimizer)
        return optimizer

    @classmethod
    def test(cls, cfg, model, evaluators=None):
        """
        Evaluate the given model. The given model is expected to already contain
        weights to evaluate.
        Args:
            cfg (CfgNode):
            model (nn.Module):
            evaluators (list[DatasetEvaluator] or None): if None, will call
                :meth:`build_evaluator`. Otherwise, must have the same length as
                ``cfg.DATASETS.TEST``.
        Returns:
            dict: a dict of result metrics. When exactly one dataset was evaluated,
            its metrics dict is returned directly instead of being keyed by name.
        """
        from torch.cuda.amp import autocast
        logger = logging.getLogger(__name__)
        if isinstance(evaluators, DatasetEvaluator):
            evaluators = [evaluators]
        if evaluators is not None:
            assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
                len(cfg.DATASETS.TEST), len(evaluators)
            )

        results = OrderedDict()
        for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
            data_loader = cls.build_test_loader(cfg, dataset_name)
            # When evaluators are passed in as arguments,
            # implicitly assume that evaluators can be created before data_loader.
            if evaluators is not None:
                evaluator = evaluators[idx]
            else:
                try:
                    evaluator = cls.build_evaluator(cfg, dataset_name)
                except NotImplementedError:
                    # Logger.warn is a deprecated alias; use Logger.warning
                    logger.warning(
                        "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
                        "or implement its `build_evaluator` method."
                    )
                    results[dataset_name] = {}
                    continue
            with autocast():
                results_i = inference_on_dataset(model, data_loader, evaluator)
            results[dataset_name] = results_i
            if comm.is_main_process():
                assert isinstance(
                    results_i, dict
                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
                    results_i
                )
                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
                print_csv_format(results_i)

        if len(results) == 1:
            results = list(results.values())[0]
        return results
def setup(args):
    """
    Build the frozen config for this run and initialise logging.

    The default config is extended with the deeplab, maskformer2 and maskformer2-video
    options, then overridden by ``args.config_file`` and ``args.opts`` (in that order).
    """
    cfg = get_cfg()
    config_extensions = (
        add_deeplab_config,  # for poly lr schedule
        add_maskformer2_config,
        add_maskformer2_video_config,
    )
    for extend in config_extensions:
        extend(cfg)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)

    # Loggers for the "mask2former" and "mask2former_video" packages
    setup_logger(name="mask2former")
    setup_logger(
        output=cfg.OUTPUT_DIR,
        distributed_rank=comm.get_rank(),
        name="mask2former_video",
    )
    return cfg
def main(args):
    """
    Entry point executed in every worker process.

    With ``args.eval_only`` the model is built, weights are loaded and the evaluation
    results are returned; otherwise a :class:`Trainer` is created and trained.

    Raises:
        NotImplementedError: in eval-only mode when ``cfg.TEST.AUG.ENABLED`` is set.
    """
    cfg = setup(args)

    if args.eval_only:
        # Fail fast: test-time augmentation is unsupported, so reject it before
        # spending time on model construction and a full evaluation pass.
        if cfg.TEST.AUG.ENABLED:
            raise NotImplementedError
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        res = Trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()
if __name__ == "__main__":
    # Parse the standard detectron2 command line, echo it, and start the workers.
    args = default_argument_parser().parse_args()
    print("Command Line Args:", args)
    launch_options = {
        "num_machines": args.num_machines,
        "machine_rank": args.machine_rank,
        "dist_url": args.dist_url,
        "args": (args,),
    }
    launch(main, args.num_gpus, **launch_options)
================================================
FILE: requirements.txt
================================================
cython
scipy
shapely
timm
h5py
submitit
scikit-image
================================================
FILE: scripts/eval_8gpu_mask2former_r101_video.sh
================================================
# Runs train_net_video.py in --eval-only mode on 8 GPUs with the R101 config.
# The current directory is appended to PYTHONPATH so local packages can be imported.
# NOTE(review): --resume is passed together with MODEL.WEIGHTS; confirm which
# checkpoint is actually loaded when the output directory already holds one.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
ID=159  # not referenced by the command below
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py \
    --num-gpus 8 \
    --resume \
    --dist-url tcp://0.0.0.0:12349 \
    --config-file configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml \
    --eval-only \
    MODEL.WEIGHTS ./mfvis_models/model_final_r101_0491.pth
================================================
FILE: scripts/eval_8gpu_mask2former_r50_video.sh
================================================
# Runs train_net_video.py in --eval-only mode on 8 GPUs with the R50 config.
# The current directory is appended to PYTHONPATH so local packages can be imported.
# NOTE(review): --resume is passed together with MODEL.WEIGHTS; confirm which
# checkpoint is actually loaded when the output directory already holds one.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
ID=159  # not referenced by the command below
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py \
    --num-gpus 8 \
    --resume \
    --dist-url tcp://0.0.0.0:12349 \
    --config-file configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml \
    --eval-only \
    MODEL.WEIGHTS ./mfvis_models/model_final_r50_0466.pth
================================================
FILE: scripts/eval_8gpu_mask2former_swinl_video.sh
================================================
# Runs train_net_video.py in --eval-only mode on 8 GPUs with the Swin-L config.
# The current directory is appended to PYTHONPATH so local packages can be imported.
# NOTE(review): --resume is passed together with MODEL.WEIGHTS; confirm which
# checkpoint is actually loaded when the output directory already holds one.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
ID=159  # not referenced by the command below
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py \
    --num-gpus 8 \
    --resume \
    --dist-url tcp://0.0.0.0:12349 \
    --config-file configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml \
    --eval-only \
    MODEL.WEIGHTS ./mfvis_models/model_final_swinl_0560.pth
================================================
FILE: scripts/train_8gpu_mask2former_r101_video.sh
================================================
# Launches train_net_video.py on 8 GPUs with the R101 config, resuming if possible.
# The current directory is appended to PYTHONPATH so local packages can be imported.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
ID=159  # not referenced by the command below
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py \
    --num-gpus 8 \
    --resume \
    --dist-url tcp://0.0.0.0:12349 \
    --config-file configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml
================================================
FILE: scripts/train_8gpu_mask2former_r50_video.sh
================================================
# Launches train_net_video.py on 8 GPUs with the R50 config, resuming if possible.
# The current directory is appended to PYTHONPATH so local packages can be imported.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
# Uncomment to make CUDA kernel launches synchronous while debugging:
#export CUDA_LAUNCH_BLOCKING=1 # for debug
ID=159  # not referenced by the command below
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py \
    --num-gpus 8 \
    --resume \
    --dist-url tcp://0.0.0.0:12349 \
    --config-file configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml
================================================
FILE: scripts/train_8gpu_mask2former_swinl_video.sh
================================================
# Launches train_net_video.py on 8 GPUs with the Swin-L config, resuming if possible.
# The current directory is appended to PYTHONPATH so local packages can be imported.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
ID=159  # not referenced by the command below
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py \
    --num-gpus 8 \
    --resume \
    --dist-url tcp://0.0.0.0:12349 \
    --config-file configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml
================================================
FILE: scripts/visual_video.sh
================================================
# Runs demo_video/demo.py on GPU 0 with the R101 config and saves the rendered frames.
# The current directory is appended to PYTHONPATH so local packages can be imported.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
CUDA_VISIBLE_DEVICES=0 python3 demo_video/demo.py \
    --config-file configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml \
    --save-frames True \
    --input './datasets/ytvis_2019/valid/JPEGImages/' \
    --output 'r101_vis/' \
    --opts MODEL.WEIGHTS ./mfvis_models/model_final_r101_0491.pth
================================================
FILE: tools/README.md
================================================
This directory contains a few tools for MaskFormer.
* `convert-torchvision-to-d2.py`
Tool to convert torchvision pre-trained weights for D2.
```
wget https://download.pytorch.org/models/resnet101-63fe2227.pth
python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl
```
* `convert-pretrained-swin-model-to-d2.py`
Tool to convert Swin Transformer pre-trained weights for D2.
```
pip install timm
wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl
wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl
wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl
```
* `evaluate_pq_for_semantic_segmentation.py`
Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions.
Usage:
```
python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json
```
where `OUTPUT_DIR` is set in the config file.
* `evaluate_coco_boundary_ap.py`
Tool to evaluate Boundary AP for instance segmentation predictions.
Usage:
```
python tools/evaluate_coco_boundary_ap.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON
```
To install Boundary IoU API, run:
```
pip install git+https://github.com/bowenc0221/boundary-iou-api.git
```
* `analyze_model.py`
Tool to analyze model parameters and flops.
Usage for semantic segmentation (ADE20K only, use with caution!):
```
python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE
```
Note that, for semantic segmentation (ADE20K only), we use a dummy image with a fixed size equal to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`.
Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes!
Usage for panoptic and instance segmentation:
```
python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE
```
Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images.
================================================
FILE: tools/analyze_model.py
================================================
# -*- coding: utf-8 -*-
# Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py
import logging
import numpy as np
from collections import Counter
import tqdm
from fvcore.nn import flop_count_table # can also try flop_count_str
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate
from detectron2.data import build_detection_test_loader
from detectron2.engine import default_argument_parser
from detectron2.modeling import build_model
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.utils.analysis import (
FlopCountAnalysis,
activation_count_operators,
parameter_count_table,
)
from detectron2.utils.logger import setup_logger
# fmt: off
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# fmt: on
from mask2former import add_maskformer2_config
# Shared logger for this tool, registered under the "detectron2" name.
logger = logging.getLogger("detectron2")
def setup(args):
    """Load the config named by ``args.config_file`` and initialise logging.

    Files ending in ".yaml" are loaded as a yacs config (extended with the deeplab and
    maskformer2 options, data-loader workers forced to 0, then frozen). Any other file
    is loaded through ``LazyConfig``. ``args.opts`` overrides are applied in both cases.
    """
    uses_lazy_config = not args.config_file.endswith(".yaml")
    if uses_lazy_config:
        cfg = LazyConfig.load(args.config_file)
        cfg = LazyConfig.apply_overrides(cfg, args.opts)
    else:
        cfg = get_cfg()
        for extend in (add_deeplab_config, add_maskformer2_config):
            extend(cfg)
        cfg.merge_from_file(args.config_file)
        # set after the file merge and before the command-line overrides
        cfg.DATALOADER.NUM_WORKERS = 0
        cfg.merge_from_list(args.opts)
        cfg.freeze()

    setup_logger(name="fvcore")
    setup_logger()
    return cfg
def do_flop(cfg):
    """Count FLOPs over up to ``args.num_inputs`` samples of the test loader and log them.

    Reads the module-level ``args`` (``num_inputs``, ``use_fixed_input_size``).
    """
    is_yacs_cfg = isinstance(cfg, CfgNode)
    if is_yacs_cfg:
        data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
        model = build_model(cfg)
        DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
    else:
        data_loader = instantiate(cfg.dataloader.test)
        model = instantiate(cfg.model)
        model.to(cfg.train.device)
        DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
    model.eval()

    counts = Counter()
    total_flops = []
    for idx, data in zip(tqdm.trange(args.num_inputs), data_loader):  # noqa
        if args.use_fixed_input_size and is_yacs_cfg:  # noqa
            import torch

            # swap the real image for an all-zero square of the configured crop size
            side = cfg.INPUT.CROP.SIZE[0]
            data[0]["image"] = torch.zeros((3, side, side))
        flops = FlopCountAnalysis(model, data)
        if idx > 0:
            # only let the first sample emit unsupported-op / uncalled-module warnings
            flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False)
        counts += flops.by_operator()
        total_flops.append(flops.total())

    # `flops` still refers to the analysis of the last processed sample
    logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops))
    num_samples = len(total_flops)
    per_operator = [(k, v / num_samples / 1e9) for k, v in counts.items()]
    logger.info(
        "Average GFlops for each type of operators:\n"
        + str(per_operator)
    )
    mean_gflops = np.mean(total_flops) / 1e9
    std_gflops = np.std(total_flops) / 1e9
    logger.info(
        "Total GFlops: {:.1f}±{:.1f}".format(mean_gflops, std_gflops)
    )
def do_activation(cfg):
    """Count activations of the model on up to ``args.num_inputs`` test inputs and log them.

    Builds the test data loader and the model (yacs ``CfgNode`` or lazy
    config), loads the configured weights, and runs
    ``activation_count_operators`` on each input. Logs the per-operator
    average and mean±std of the per-input totals.

    Reads the module-level ``args`` (``num_inputs``).

    Args:
        cfg: a ``CfgNode`` or a lazy config object.
    """
    if isinstance(cfg, CfgNode):
        data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
        model = build_model(cfg)
        DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
    else:
        data_loader = instantiate(cfg.dataloader.test)
        model = instantiate(cfg.model)
        model.to(cfg.train.device)
        DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
    model.eval()

    counts = Counter()
    total_activations = []
    for idx, data in zip(tqdm.trange(args.num_inputs), data_loader):  # noqa
        count = activation_count_operators(model, data)
        counts += count
        total_activations.append(sum(count.values()))

    num_samples = len(total_activations)
    if num_samples == 0:
        # No input was processed (empty loader or --num-inputs 0): nothing to report.
        logger.warning("No inputs were processed; skipping activation statistics.")
        return

    # Average over the number of processed samples. Dividing by the last loop
    # index would be off by one and would divide by zero for a single input.
    logger.info(
        "(Million) Activations for Each Type of Operators:\n"
        + str([(k, v / num_samples) for k, v in counts.items()])
    )
    logger.info(
        "Total (Million) Activations: {}±{}".format(
            np.mean(total_activations), np.std(total_activations)
        )
    )
def do_parameter(cfg):
    """Log a parameter-count table (depth 5) for the model described by ``cfg``.

    The model is built with ``build_model`` for a yacs ``CfgNode`` and with
    ``instantiate(cfg.model)`` otherwise; no weights are loaded.
    """
    model = build_model(cfg) if isinstance(cfg, CfgNode) else instantiate(cfg.model)
    table = parameter_count_table(model, max_depth=5)
    logger.info("Parameter Count:\n" + table)
def do_structure(cfg):
    """Log the string representation of the model described by ``cfg``.

    The model is built with ``build_model`` for a yacs ``CfgNode`` and with
    ``instantiate(cfg.model)`` otherwise; no weights are loaded.
    """
    model = build_model(cfg) if isinstance(cfg, CfgNode) else instantiate(cfg.model)
    description = str(model)
    logger.info("Model Structure:\n" + description)
if __name__ == "__main__":
    # Text appended to the end of the ``--help`` output.
    usage_epilog = """
Examples:
To show parameters of a model:
$ ./analyze_model.py --tasks parameter \\
--config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
Flops and activations are data-dependent, therefore inputs and model weights
are needed to count them:
$ ./analyze_model.py --num-inputs 100 --tasks flop \\
--config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\
MODEL.WEIGHTS /path/to/model.pkl
"""
    parser = default_argument_parser(epilog=usage_epilog)
    parser.add_argument(
        "--tasks",
        choices=["flop", "activation", "parameter", "structure"],
        required=True,
        nargs="+",
    )
    parser.add_argument(
        "-n",
        "--num-inputs",
        default=100,
        type=int,
        help="number of inputs used to compute statistics for flops/activations, "
        "both are data dependent.",
    )
    parser.add_argument(
        "--use-fixed-input-size",
        action="store_true",
        help="use fixed input size when calculating flops",
    )

    # ``args`` stays a module-level name: do_flop/do_activation read it.
    args = parser.parse_args()
    assert not args.eval_only
    assert args.num_gpus == 1

    cfg = setup(args)

    # Run the requested analyses in the order given on the command line.
    task_handlers = {
        "flop": do_flop,
        "activation": do_activation,
        "parameter": do_parameter,
        "structure": do_structure,
    }
    for task in args.tasks:
        task_handlers[task](cfg)
================================================
FILE: tools/convert-pretrained-swin-model-to-d2.py
================================================
#!/usr/bin/env python
import pickle as pkl
import sys
import torch
"""
Usage:
# download pretrained swin model:
wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
# run the conversion
./convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
# Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config:
MODEL:
WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl"
INPUT:
FORMAT: "RGB"
"""
if __name__ == "__main__":
    # Require the source checkpoint and the output path; fail with a usage
    # message instead of a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit("Usage: {} INPUT.pth OUTPUT.pkl".format(sys.argv[0]))
    src_path, dst_path = sys.argv[1], sys.argv[2]

    # NOTE: torch.load unpickles the file; only convert checkpoints you trust.
    obj = torch.load(src_path, map_location="cpu")["model"]

    # Wrap the state dict in the pickle layout written by this script
    # ("model" / "__author__" / "matching_heuristics").
    res = {"model": obj, "__author__": "third_party", "matching_heuristics": True}
    with open(dst_path, "wb") as f:
        pkl.dump(res, f)
================================================
FILE: tools/convert-torchvision-to-d2.py
================================================
#!/usr/bin/env python
import pickle as pkl
import sys
import torch
"""
Usage:
# download one of the ResNet{18,34,50,101,152} models from torchvision:
wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth
# run the conversion
./convert-torchvision-to-d2.py r50.pth r50.pkl
# Then, use r50.pkl with the following changes in config:
MODEL:
WEIGHTS: "/path/to/r50.pkl"
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
RESNETS:
DEPTH: 50
STRIDE_IN_1X1: False
INPUT:
FORMAT: "RGB"
"""
if __name__ == "__main__":
    # Require the source checkpoint and the output path; fail with a usage
    # message instead of a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit("Usage: {} INPUT.pth OUTPUT.pkl".format(sys.argv[0]))
    src_path, dst_path = sys.argv[1], sys.argv[2]

    # NOTE: torch.load unpickles the file; only convert checkpoints you trust.
    obj = torch.load(src_path, map_location="cpu")

    # Rename every torchvision key to the detectron2 naming scheme and move
    # the tensor (as a numpy array) into the new dict.
    newmodel = {}
    for old_k in list(obj.keys()):
        k = old_k
        if "layer" not in k:
            # Keys outside the residual stages are placed under "stem.".
            k = "stem." + k
        for t in [1, 2, 3, 4]:
            k = k.replace("layer{}".format(t), "res{}".format(t + 1))
        for t in [1, 2, 3]:
            k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
        k = k.replace("downsample.0", "shortcut")
        k = k.replace("downsample.1", "shortcut.norm")
        print(old_k, "->", k)
        newmodel[k] = obj.pop(old_k).detach().numpy()

    res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True}
    with open(dst_path, "wb") as f:
        pkl.dump(res, f)

    # Every key is popped above, so anything left here was not converted.
    if obj:
        print("Unconverted keys:", obj.keys())
================================================
FILE: tools/evaluate_coco_boundary_ap.py
================================================
#!/usr/bin/env python
# Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py
"""
Evaluation for COCO val2017:
python ./tools/evaluate_coco_boundary_ap.py \
--gt-json-file COCO_GT_JSON \
--dt-json-file COCO_DT_JSON
"""
import argparse
import json
from boundary_iou.coco_instance_api.coco import COCO
from boundary_iou.coco_instance_api.cocoeval import COCOeval
def main():
    """Evaluate COCO-format instance predictions with the boundary-IoU COCO API.

    Command line options:
        --gt-json-file: path to the ground-truth annotation JSON.
        --dt-json-file: path to the detection results JSON (a list of dicts).
        --iou-type: IoU type passed to ``COCOeval``; ground-truth boundaries
            are computed only when it is "boundary".
        --dilation-ratio: dilation ratio forwarded to ``COCO`` and ``COCOeval``.

    Box entries are removed from the detections before they are loaded, then
    the usual evaluate / accumulate / summarize sequence is run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--gt-json-file", default="")
    parser.add_argument("--dt-json-file", default="")
    parser.add_argument("--iou-type", default="boundary")
    parser.add_argument("--dilation-ratio", default=0.020, type=float)
    args = parser.parse_args()
    print(args)

    dilation_ratio = args.dilation_ratio
    get_boundary = args.iou_type == "boundary"
    cocoGt = COCO(args.gt_json_file, get_boundary=get_boundary, dilation_ratio=dilation_ratio)

    # Read the detections with a context manager so the file handle is closed.
    with open(args.dt_json_file) as f:
        detections = json.load(f)

    # remove box predictions
    for det in detections:
        det.pop("bbox", None)

    cocoDt = cocoGt.loadRes(detections)
    cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio)
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()
# Run the evaluation only when executed as a script.
if __name__ == "__main__":
    main()
================================================
FILE: tools/evaluate_pq_for_semantic_segmentation.py
================================================
#!/usr/bin/env python
import argparse
import json
import os
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import torch
from detectron2.data import MetadataCatalog
from detectron2.data.detection_utils import read_image
from detectron2.utils.file_io import PathManager
from pycocotools import mask as maskUtils
from panopticapi.evaluation import PQStat
def default_argument_parser():
    """
    Build the command line parser for the PQ evaluation tool.

    Options:
        --dataset-name: one of the supported semantic segmentation datasets.
        --json-file: path to the predictions JSON file.

    Returns:
        argparse.ArgumentParser:
    """
    # NOTE: currently does not support Cityscapes, you need to convert
    # Cityscapes prediction format to Detectron2 prediction format.
    supported_datasets = [
        "ade20k_sem_seg_val",
        "coco_2017_test_stuff_10k_sem_seg",
        "ade20k_full_sem_seg_val",
    ]
    cli = argparse.ArgumentParser(description="Evaluate PQ metric for semantic segmentation.")
    cli.add_argument(
        "--dataset-name",
        default=supported_datasets[0],
        choices=supported_datasets,
        help="dataset name you want to evaluate",
    )
    cli.add_argument("--json-file", default="", help="path to detection json file")
    return cli
# Modified from the official panoptic api: https://github.com/cocodataset/panopticapi/blob/master/panopticapi/evaluation.py
def pq_compute_single_image(segm_gt, segm_dt, categories, ignore_label, image_id=None):
    """Compute panoptic-quality statistics for one image of semantic predictions.

    Every category id present in a map is treated as one segment whose
    segment id equals its category id.

    Args:
        segm_gt: integer array of ground-truth category ids; pixels equal to
            ``ignore_label`` are VOID and produce no ground-truth segment.
        segm_dt: integer array of predicted category ids, combined
            element-wise with ``segm_gt`` (same shape assumed).
        categories: container of valid category ids (only membership is tested).
        ignore_label: label value marking VOID pixels.
        image_id: optional identifier, used only in error messages.

    Returns:
        PQStat: per-category tp / fp / fn / iou accumulators for this image.

    Raises:
        KeyError: if a predicted label is not in ``categories``, or the
            predicted label set is inconsistent with the prediction map.
    """
    pq_stat = PQStat()
    VOID = ignore_label
    OFFSET = 256 * 256 * 256
    pan_gt = segm_gt
    pan_pred = segm_dt

    # The messages used to format an undefined ``image_id`` name, which turned
    # the intended KeyError into a NameError.
    if image_id is None:
        image_desc = "the image"
    else:
        image_desc = "the image with ID {}".format(image_id)

    # One ground-truth segment per non-VOID category, with its pixel area.
    gt_ann = {'segments_info': []}
    labels, labels_cnt = np.unique(segm_gt, return_counts=True)
    for cat_id, cnt in zip(labels, labels_cnt):
        if cat_id == VOID:
            continue
        gt_ann['segments_info'].append(
            {"id": cat_id, "category_id": cat_id, "area": cnt, "iscrowd": 0}
        )

    # One predicted segment per category present in the prediction.
    pred_ann = {'segments_info': []}
    for cat_id in np.unique(segm_dt):
        pred_ann['segments_info'].append({"id": cat_id, "category_id": cat_id})

    gt_segms = {el['id']: el for el in gt_ann['segments_info']}
    pred_segms = {el['id']: el for el in pred_ann['segments_info']}

    # predicted segments area calculation + prediction sanity checks
    pred_labels_set = set(el['id'] for el in pred_ann['segments_info'])
    labels, labels_cnt = np.unique(pan_pred, return_counts=True)
    for label, label_cnt in zip(labels, labels_cnt):
        if label not in pred_segms:
            if label == VOID:
                continue
            raise KeyError(
                'In {} segment with ID {} is presented in PNG and not presented in JSON.'.format(
                    image_desc, label
                )
            )
        pred_segms[label]['area'] = label_cnt
        pred_labels_set.remove(label)
        if pred_segms[label]['category_id'] not in categories:
            raise KeyError(
                'In {} segment with ID {} has unknown category_id {}.'.format(
                    image_desc, label, pred_segms[label]['category_id']
                )
            )
    if len(pred_labels_set) != 0:
        raise KeyError(
            'In {} the following segment IDs {} are presented in JSON and not presented in PNG.'.format(
                image_desc, list(pred_labels_set)
            )
        )

    # confusion matrix calculation: encode each (gt, pred) pair as one integer
    pan_gt_pred = pan_gt.astype(np.uint64) * OFFSET + pan_pred.astype(np.uint64)
    gt_pred_map = {}
    labels, labels_cnt = np.unique(pan_gt_pred, return_counts=True)
    for label, intersection in zip(labels, labels_cnt):
        gt_id = label // OFFSET
        pred_id = label % OFFSET
        gt_pred_map[(gt_id, pred_id)] = intersection

    # count all matched pairs
    gt_matched = set()
    pred_matched = set()
    for label_tuple, intersection in gt_pred_map.items():
        gt_label, pred_label = label_tuple
        if gt_label not in gt_segms:
            continue
        if pred_label not in pred_segms:
            continue
        if gt_segms[gt_label]['iscrowd'] == 1:
            continue
        if gt_segms[gt_label]['category_id'] != pred_segms[pred_label]['category_id']:
            continue

        # Pixels of the prediction that fall on VOID are excluded from the union.
        union = (
            pred_segms[pred_label]['area']
            + gt_segms[gt_label]['area']
            - intersection
            - gt_pred_map.get((VOID, pred_label), 0)
        )
        iou = intersection / union
        if iou > 0.5:
            pq_stat[gt_segms[gt_label]['category_id']].tp += 1
            pq_stat[gt_segms[gt_label]['category_id']].iou += iou
            gt_matched.add(gt_label)
            pred_matched.add(pred_label)

    # count false negatives
    crowd_labels_dict = {}
    for gt_label, gt_info in gt_segms.items():
        if gt_label in gt_matched:
            continue
        # crowd segments are ignored
        if gt_info['iscrowd'] == 1:
            crowd_labels_dict[gt_info['category_id']] = gt_label
            continue
        pq_stat[gt_info['category_id']].fn += 1

    # count false positives
    for pred_label, pred_info in pred_segms.items():
        if pred_label in pred_matched:
            continue
        # intersection of the segment with VOID
        intersection = gt_pred_map.get((VOID, pred_label), 0)
        # plus intersection with corresponding CROWD region if it exists
        if pred_info['category_id'] in crowd_labels_dict:
            intersection += gt_pred_map.get((crowd_labels_dict[pred_info['category_id']], pred_label), 0)
        # predicted segment is ignored if more than half of the segment correspond to VOID and CROWD regions
        if intersection / pred_info['area'] > 0.5:
            continue
        pq_stat[pred_info['category_id']].fp += 1

    return pq_stat
def main():
    """Evaluate PQ and mIoU of semantic segmentation predictions stored in a JSON file.

    The JSON file is a list of dicts with "file_name", "category_id" and
    "segmentation" (decoded with pycocotools) entries. Predictions are grouped
    per image, rasterised into a label map, and compared with the ground-truth
    label image of the selected dataset found under ``$DETECTRON2_DATASETS``
    (default "datasets"). Prints a PQ/SQ/RQ table and the mIoU.
    """
    parser = default_argument_parser()
    args = parser.parse_args()
    _root = os.getenv("DETECTRON2_DATASETS", "datasets")

    # Ground-truth directory and file extension depend only on the dataset,
    # so resolve them once instead of once per image.
    if args.dataset_name == "ade20k_sem_seg_val":
        gt_dir = os.path.join(_root, "ADEChallengeData2016", "annotations_detectron2", "validation")
        gt_ext = ".png"
    elif args.dataset_name == "coco_2017_test_stuff_10k_sem_seg":
        gt_dir = os.path.join(_root, "coco", "coco_stuff_10k", "annotations_detectron2", "test")
        gt_ext = ".png"
    elif args.dataset_name == "ade20k_full_sem_seg_val":
        gt_dir = os.path.join(_root, "ADE20K_2021_17_01", "annotations_detectron2", "validation")
        gt_ext = ".tif"
    else:
        raise ValueError(f"Unsupported dataset {args.dataset_name}")

    json_file = args.json_file
    with open(json_file) as f:
        predictions = json.load(f)

    # Group predictions by image id (file base name up to the first dot).
    imgToAnns = defaultdict(list)
    for pred in predictions:
        image_id = os.path.basename(pred["file_name"]).split(".")[0]
        imgToAnns[image_id].append(
            {"category_id": pred["category_id"], "segmentation": pred["segmentation"]}
        )
    image_ids = list(imgToAnns.keys())

    meta = MetadataCatalog.get(args.dataset_name)
    class_names = meta.stuff_classes
    num_classes = len(meta.stuff_classes)
    ignore_label = meta.ignore_label

    # Dataset-id -> contiguous-id mapping, when the dataset defines one.
    if hasattr(meta, "stuff_dataset_id_to_contiguous_id"):
        dataset_id_to_contiguous_id = meta.stuff_dataset_id_to_contiguous_id
    else:
        dataset_id_to_contiguous_id = {}

    # Extra row/column collects the ignored pixels.
    conf_matrix = np.zeros((num_classes + 1, num_classes + 1), dtype=np.int64)
    categories = {}
    for i in range(num_classes):
        categories[i] = {"id": i, "name": class_names[i], "isthing": 0}

    pq_stat = PQStat()
    for image_id in tqdm(image_ids):
        segm_gt = read_image(os.path.join(gt_dir, image_id + gt_ext)).copy().astype(np.int64)

        # get predictions
        segm_dt = np.zeros_like(segm_gt)
        anns = imgToAnns[image_id]
        for ann in anns:
            # map back category_id; an id missing from the mapping is used
            # as-is, so the value is never unbound or carried over from a
            # previous annotation.
            category_id = dataset_id_to_contiguous_id.get(ann["category_id"], ann["category_id"])
            mask = maskUtils.decode(ann["segmentation"])
            segm_dt[mask > 0] = category_id

        # miou
        gt = segm_gt.copy()
        pred = segm_dt.copy()
        gt[gt == ignore_label] = num_classes
        conf_matrix += np.bincount(
            (num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
            minlength=conf_matrix.size,
        ).reshape(conf_matrix.shape)

        # pq
        pq_stat_single = pq_compute_single_image(segm_gt, segm_dt, categories, meta.ignore_label)
        pq_stat += pq_stat_single

    metrics = [("All", None), ("Stuff", False)]
    results = {}
    for name, isthing in metrics:
        results[name], per_class_results = pq_stat.pq_average(categories, isthing=isthing)
        if name == 'All':
            results['per_class'] = per_class_results

    print("{:10s}| {:>5s} {:>5s} {:>5s} {:>5s}".format("", "PQ", "SQ", "RQ", "N"))
    print("-" * (10 + 7 * 4))
    for name, _isthing in metrics:
        print("{:10s}| {:5.1f} {:5.1f} {:5.1f} {:5d}".format(
            name,
            100 * results[name]['pq'],
            100 * results[name]['sq'],
            100 * results[name]['rq'],
            results[name]['n'])
        )

    # calculate miou (rows of conf_matrix index predictions, columns ground truth)
    acc = np.full(num_classes, np.nan, dtype=np.float64)
    iou = np.full(num_classes, np.nan, dtype=np.float64)
    tp = conf_matrix.diagonal()[:-1].astype(np.float64)
    pos_gt = np.sum(conf_matrix[:-1, :-1], axis=0).astype(np.float64)
    pos_pred = np.sum(conf_matrix[:-1, :-1], axis=1).astype(np.float64)
    acc_valid = pos_gt > 0
    acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
    iou_valid = (pos_gt + pos_pred) > 0
    union = pos_gt + pos_pred - tp
    iou[acc_valid] = tp[acc_valid] / union[acc_valid]
    miou = np.sum(iou[acc_valid]) / np.sum(iou_valid)
    print("")
    print(f"mIoU: {miou}")
# Run the evaluation only when executed as a script.
if __name__ == "__main__":
    main()
================================================
FILE: train_net.py
================================================
"""
MaskFormer Training Script.
This script is a simplified version of the training script in detectron2/tools.
"""
try:
# ignore ShapelyDeprecationWarning from fvcore
from shapely.errors import ShapelyDeprecationWarning
import warnings
warnings.filterwarnings('ignore', category=ShapelyDeprecationWarning)
except:
pass
import copy
import itertools
import logging
import os
from collections import OrderedDict
from typing import Any, Dict, List, Set
import torch
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog, build_detection_train_loader
from detectron2.engine import (
DefaultTrainer,
default_argument_parser,
default_setup,
launch,
)
from detectron2.evaluation import (
CityscapesInstanceEvaluator,
CityscapesSemSegEvaluator,
COCOEvaluator,
COCOPanopticEvaluator,
DatasetEvaluators,
LVISEvaluator,
SemSegEvaluator,
verify_results,
)
from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler
from detectron2.solver.build import maybe_add_gradient_clipping
from detectron2.utils.logger import setup_logger
# MaskFormer
from mask2former import (
COCOInstanceNewBaselineDatasetMapper,
COCOPanopticNewBaselineDatasetMapper,
InstanceSegEvaluator,
MaskFormerInstanceDatasetMapper,
MaskFormerPanopticDatasetMapper,
MaskFormerSemanticDatasetMapper,
SemanticSegmentorWithTTA,
add_maskformer2_config,
)
class Trainer(DefaultTrainer):
    """
    DefaultTrainer subclass that plugs in the MaskFormer evaluators, dataset
    mappers, LR scheduler and optimizer.
    """

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """
        Create evaluator(s) for a given dataset.

        The choice is driven by the "evaluator_type" metadata of the dataset
        and by the ``cfg.MODEL.MASK_FORMER.TEST`` task switches. Returns a
        single evaluator when only one applies, otherwise a
        ``DatasetEvaluators`` wrapping all of them.

        Raises:
            NotImplementedError: if no evaluator matches the dataset type.
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        evaluators = []
        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type

        def task_on(flag):
            # Looked up lazily, only for dataset types that need the switch.
            return getattr(cfg.MODEL.MASK_FORMER.TEST, flag)

        def check_single_machine():
            assert (
                torch.cuda.device_count() > comm.get_rank()
            ), "CityscapesEvaluator currently do not work with multiple machines."

        # semantic segmentation
        if evaluator_type in ("sem_seg", "ade20k_panoptic_seg"):
            evaluators.append(
                SemSegEvaluator(dataset_name, distributed=True, output_dir=output_folder)
            )

        # instance segmentation
        if evaluator_type == "coco":
            evaluators.append(COCOEvaluator(dataset_name, output_dir=output_folder))

        # panoptic segmentation
        panoptic_types = (
            "coco_panoptic_seg",
            "ade20k_panoptic_seg",
            "cityscapes_panoptic_seg",
            "mapillary_vistas_panoptic_seg",
        )
        if evaluator_type in panoptic_types and task_on("PANOPTIC_ON"):
            evaluators.append(COCOPanopticEvaluator(dataset_name, output_folder))

        # COCO
        if evaluator_type == "coco_panoptic_seg":
            if task_on("INSTANCE_ON"):
                evaluators.append(COCOEvaluator(dataset_name, output_dir=output_folder))
            if task_on("SEMANTIC_ON"):
                evaluators.append(
                    SemSegEvaluator(dataset_name, distributed=True, output_dir=output_folder)
                )

        # Mapillary Vistas
        if evaluator_type == "mapillary_vistas_panoptic_seg":
            if task_on("INSTANCE_ON"):
                evaluators.append(InstanceSegEvaluator(dataset_name, output_dir=output_folder))
            if task_on("SEMANTIC_ON"):
                evaluators.append(
                    SemSegEvaluator(dataset_name, distributed=True, output_dir=output_folder)
                )

        # Cityscapes
        if evaluator_type == "cityscapes_instance":
            check_single_machine()
            return CityscapesInstanceEvaluator(dataset_name)
        if evaluator_type == "cityscapes_sem_seg":
            check_single_machine()
            return CityscapesSemSegEvaluator(dataset_name)
        if evaluator_type == "cityscapes_panoptic_seg":
            if task_on("SEMANTIC_ON"):
                check_single_machine()
                evaluators.append(CityscapesSemSegEvaluator(dataset_name))
            if task_on("INSTANCE_ON"):
                check_single_machine()
                evaluators.append(CityscapesInstanceEvaluator(dataset_name))

        # ADE20K
        if evaluator_type == "ade20k_panoptic_seg" and task_on("INSTANCE_ON"):
            evaluators.append(InstanceSegEvaluator(dataset_name, output_dir=output_folder))

        # LVIS
        if evaluator_type == "lvis":
            return LVISEvaluator(dataset_name, output_dir=output_folder)

        if not evaluators:
            raise NotImplementedError(
                "no Evaluator for the dataset {} with the type {}".format(
                    dataset_name, evaluator_type
                )
            )
        if len(evaluators) == 1:
            return evaluators[0]
        return DatasetEvaluators(evaluators)

    @classmethod
    def build_train_loader(cls, cfg):
        """
        Build the training data loader, using the dataset mapper selected by
        ``cfg.INPUT.DATASET_MAPPER_NAME``; an unknown name passes
        ``mapper=None`` to ``build_detection_train_loader``.
        """
        mapper_classes = {
            # Semantic segmentation dataset mapper
            "mask_former_semantic": MaskFormerSemanticDatasetMapper,
            # Panoptic segmentation dataset mapper
            "mask_former_panoptic": MaskFormerPanopticDatasetMapper,
            # Instance segmentation dataset mapper
            "mask_former_instance": MaskFormerInstanceDatasetMapper,
            # coco instance segmentation lsj new baseline
            "coco_instance_lsj": COCOInstanceNewBaselineDatasetMapper,
            # coco panoptic segmentation lsj new baseline
            "coco_panoptic_lsj": COCOPanopticNewBaselineDatasetMapper,
        }
        mapper_cls = mapper_classes.get(cfg.INPUT.DATASET_MAPPER_NAME)
        mapper = mapper_cls(cfg, True) if mapper_cls is not None else None
        return build_detection_train_loader(cfg, mapper=mapper)

    @classmethod
    def build_lr_scheduler(cls, cfg, optimizer):
        """
        Delegate to the ``build_lr_scheduler`` imported from
        ``detectron2.projects.deeplab``.
        Overwrite it if you'd like a different scheduler.
        """
        return build_lr_scheduler(cfg, optimizer)

    @classmethod
    def build_optimizer(cls, cfg, model):
        """
        Build an SGD or AdamW optimizer with one parameter group per trainable
        parameter.

        Per-parameter settings: backbone modules get the base LR scaled by
        ``SOLVER.BACKBONE_MULTIPLIER``; position-bias / position-embedding
        parameters get zero weight decay; normalization layers and embeddings
        get ``WEIGHT_DECAY_NORM`` / ``WEIGHT_DECAY_EMBED``. Full-model gradient
        clipping is applied when configured, otherwise detectron2's
        ``maybe_add_gradient_clipping`` is used.

        Raises:
            NotImplementedError: for an unsupported ``SOLVER.OPTIMIZER``.
        """
        weight_decay_norm = cfg.SOLVER.WEIGHT_DECAY_NORM
        weight_decay_embed = cfg.SOLVER.WEIGHT_DECAY_EMBED
        defaults = {
            "lr": cfg.SOLVER.BASE_LR,
            "weight_decay": cfg.SOLVER.WEIGHT_DECAY,
        }

        norm_module_types = (
            torch.nn.BatchNorm1d,
            torch.nn.BatchNorm2d,
            torch.nn.BatchNorm3d,
            torch.nn.SyncBatchNorm,
            # NaiveSyncBatchNorm inherits from BatchNorm2d
            torch.nn.GroupNorm,
            torch.nn.InstanceNorm1d,
            torch.nn.InstanceNorm2d,
            torch.nn.InstanceNorm3d,
            torch.nn.LayerNorm,
            torch.nn.LocalResponseNorm,
        )
        no_decay_tags = ("relative_position_bias_table", "absolute_pos_embed")

        params: List[Dict[str, Any]] = []
        memo: Set[torch.nn.parameter.Parameter] = set()
        for module_name, module in model.named_modules():
            for module_param_name, value in module.named_parameters(recurse=False):
                # Skip frozen parameters and parameters already seen
                # through another module (shared weights).
                if not value.requires_grad or value in memo:
                    continue
                memo.add(value)

                hyperparams = dict(defaults)
                if "backbone" in module_name:
                    hyperparams["lr"] *= cfg.SOLVER.BACKBONE_MULTIPLIER
                if any(tag in module_param_name for tag in no_decay_tags):
                    print(module_param_name)
                    hyperparams["weight_decay"] = 0.0
                if isinstance(module, norm_module_types):
                    hyperparams["weight_decay"] = weight_decay_norm
                if isinstance(module, torch.nn.Embedding):
                    hyperparams["weight_decay"] = weight_decay_embed
                params.append({"params": [value], **hyperparams})

        def maybe_add_full_model_gradient_clipping(optim):
            # detectron2 doesn't have full model gradient clipping now
            clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
            enable = (
                cfg.SOLVER.CLIP_GRADIENTS.ENABLED
                and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
                and clip_norm_val > 0.0
            )

            class FullModelGradientClippingOptimizer(optim):
                def step(self, closure=None):
                    # Clip the norm over all parameters of all groups at once.
                    all_params = itertools.chain.from_iterable(
                        group["params"] for group in self.param_groups
                    )
                    torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
                    super().step(closure=closure)

            return FullModelGradientClippingOptimizer if enable else optim

        optimizer_type = cfg.SOLVER.OPTIMIZER
        if optimizer_type == "SGD":
            sgd_cls = maybe_add_full_model_gradient_clipping(torch.optim.SGD)
            optimizer = sgd_cls(params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)
        elif optimizer_type == "ADAMW":
            adamw_cls = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)
            optimizer = adamw_cls(params, cfg.SOLVER.BASE_LR)
        else:
            raise NotImplementedError(f"no optimizer type {optimizer_type}")

        if cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE != "full_model":
            optimizer = maybe_add_gradient_clipping(cfg, optimizer)
        return optimizer

    @classmethod
    def test_with_TTA(cls, cfg, model):
        """
        Evaluate ``model`` wrapped in ``SemanticSegmentorWithTTA`` on every
        dataset in ``cfg.DATASETS.TEST``; result keys get a "_TTA" suffix.
        """
        logger = logging.getLogger("detectron2.trainer")
        # In the end of training, run an evaluation with TTA.
        logger.info("Running inference with test-time augmentation ...")
        model = SemanticSegmentorWithTTA(cfg, model)
        tta_folder = os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
        evaluators = []
        for name in cfg.DATASETS.TEST:
            evaluators.append(cls.build_evaluator(cfg, name, output_folder=tta_folder))
        res = cls.test(cfg, model, evaluators)
        return OrderedDict((k + "_TTA", v) for k, v in res.items())
def setup(args):
    """
    Build the frozen config from ``args.config_file`` and ``args.opts``, run
    detectron2's ``default_setup`` and configure the "mask2former" logger.
    """
    cfg = get_cfg()
    # add_deeplab_config is needed for the poly lr schedule
    for extend_config in (add_deeplab_config, add_maskformer2_config):
        extend_config(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)

    # Setup logger for the "mask2former" module
    setup_logger(
        output=cfg.OUTPUT_DIR,
        distributed_rank=comm.get_rank(),
        name="mask2former",
    )
    return cfg
def main(args):
    """
    Entry point run in each worker process.

    With ``args.eval_only`` the model is built, weights are loaded, and the
    evaluation results (plus TTA results when ``cfg.TEST.AUG.ENABLED``) are
    returned. Otherwise a ``Trainer`` is created and the result of
    ``train()`` is returned.
    """
    cfg = setup(args)

    if not args.eval_only:
        trainer = Trainer(cfg)
        trainer.resume_or_load(resume=args.resume)
        return trainer.train()

    model = Trainer.build_model(cfg)
    checkpointer = DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR)
    checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume)
    res = Trainer.test(cfg, model)
    if cfg.TEST.AUG.ENABLED:
        res.update(Trainer.test_with_TTA(cfg, model))
    if comm.is_main_process():
        verify_results(cfg, res)
    return res
if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    print("Command Line Args:", args)
    # Launch ``main(args)`` through detectron2's launcher with the
    # GPU/machine settings taken from the command line.
    launch_options = {
        "num_machines": args.num_machines,
        "machine_rank": args.machine_rank,
        "dist_url": args.dist_url,
        "args": (args,),
    }
    launch(main, args.num_gpus, **launch_options)
================================================
FILE: train_net_video.py
================================================
"""
This script is a simplified version of the training script in detectron2/tools.
"""
try:
# ignore ShapelyDeprecationWarning from fvcore
from shapely.errors import ShapelyDeprecationWarning
import warnings
warnings.filterwarnings('ignore', category=ShapelyDeprecationWarning)
except:
pass
import copy
import itertools
import logging
import os
from collections import OrderedDict
from typing import Any, Dict, List, Set
import torch
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog, build_detection_train_loader
from detectron2.engine import (
DefaultTrainer,
default_argument_parser,
default_setup,
launch,
)
from detectron2.evaluation import (
DatasetEvaluator,
inference_on_dataset,
print_csv_format,
verify_results,
)
from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler
from detectron2.solver.build import maybe_add_gradient_clipping
from detectron2.utils.logger import setup_logger
# MaskFormer
from mask2former import add_maskformer2_config
from mask2former_video import (
YTVISDatasetMapper,
CocoClipDatasetMapper,
build_combined_loader,
YTVISEvaluator,
add_maskformer2_video_config,
build_detection_train_loader,
build_detection_test_loader,
get_detection_dataset_dicts,
)
from torch.utils.data import Dataset, ConcatDataset
class Trainer(DefaultTrainer):
    """
    Extension of the Trainer class adapted to video MaskFormer training:
    YTVIS evaluation, per-dataset clip mappers combined into one loader, and
    evaluation under autocast.
    """

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """
        Create a ``YTVISEvaluator`` for ``dataset_name``.

        ``output_folder`` defaults to ``<cfg.OUTPUT_DIR>/inference``; the
        folder is created if it does not exist.
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        os.makedirs(output_folder, exist_ok=True)
        return YTVISEvaluator(dataset_name, cfg, True, output_folder)

    @classmethod
    def build_train_loader(cls, cfg):
        """
        Build one loader per dataset in ``cfg.DATASETS.TRAIN`` and combine them.

        Names starting with "coco" use ``CocoClipDatasetMapper``; names
        starting with "ytvis" or "ovis" use ``YTVISDatasetMapper``. The last
        configured dataset is flagged with ``is_tgt=True``. Datasets with any
        other prefix are skipped with a warning.
        """
        logger = logging.getLogger(__name__)
        train_names = cfg.DATASETS.TRAIN
        last_index = len(train_names) - 1

        # Keep each mapper paired with its own dataset name. Zipping a shorter
        # mapper list against the full name list would silently pair mappers
        # with the wrong datasets whenever a name is skipped.
        mapper_and_name = []
        for d_i, dataset_name in enumerate(train_names):
            if dataset_name.startswith("coco"):
                mapper_cls = CocoClipDatasetMapper
            elif dataset_name.startswith(("ytvis", "ovis")):
                mapper_cls = YTVISDatasetMapper
            else:
                logger.warning(
                    "No dataset mapper for training dataset %s; skipping it.", dataset_name
                )
                continue
            mapper = mapper_cls(
                cfg,
                is_train=True,
                is_tgt=(d_i == last_index),
                src_dataset_name=dataset_name,
            )
            mapper_and_name.append((mapper, dataset_name))

        loaders = [
            build_detection_train_loader(cfg, mapper=mapper, dataset_name=dataset_name)
            for mapper, dataset_name in mapper_and_name
        ]
        # Ratios handed to build_combined_loader together with the loaders.
        DATASET_RATIO = [1.0, 0.75]
        combined_data_loader = build_combined_loader(cfg, loaders, DATASET_RATIO)
        return combined_data_loader

    @classmethod
    def build_test_loader(cls, cfg, dataset_name):
        """
        Build the test loader for ``dataset_name`` with an inference-mode
        ``YTVISDatasetMapper``.
        """
        # Honour the requested dataset instead of always using
        # cfg.DATASETS.TEST[0]; otherwise ``test`` would evaluate every
        # configured dataset on the first dataset's data.
        mapper = YTVISDatasetMapper(cfg, is_train=False)
        return build_detection_test_loader(cfg, dataset_name, mapper=mapper)

    @classmethod
    def build_lr_scheduler(cls, cfg, optimizer):
        """
        Delegate to the ``build_lr_scheduler`` imported from
        ``detectron2.projects.deeplab``.
        Overwrite it if you'd like a different scheduler.
        """
        return build_lr_scheduler(cfg, optimizer)

    @classmethod
    def build_optimizer(cls, cfg, model):
        """
        Build an SGD or AdamW optimizer with one parameter group per trainable
        parameter.

        Per-parameter settings: backbone modules get the base LR scaled by
        ``SOLVER.BACKBONE_MULTIPLIER``; position-bias / position-embedding
        parameters get zero weight decay; normalization layers and embeddings
        get ``WEIGHT_DECAY_NORM`` / ``WEIGHT_DECAY_EMBED``. Full-model gradient
        clipping is applied when configured, otherwise detectron2's
        ``maybe_add_gradient_clipping`` is used.

        Raises:
            NotImplementedError: for an unsupported ``SOLVER.OPTIMIZER``.
        """
        weight_decay_norm = cfg.SOLVER.WEIGHT_DECAY_NORM
        weight_decay_embed = cfg.SOLVER.WEIGHT_DECAY_EMBED

        defaults = {}
        defaults["lr"] = cfg.SOLVER.BASE_LR
        defaults["weight_decay"] = cfg.SOLVER.WEIGHT_DECAY

        norm_module_types = (
            torch.nn.BatchNorm1d,
            torch.nn.BatchNorm2d,
            torch.nn.BatchNorm3d,
            torch.nn.SyncBatchNorm,
            # NaiveSyncBatchNorm inherits from BatchNorm2d
            torch.nn.GroupNorm,
            torch.nn.InstanceNorm1d,
            torch.nn.InstanceNorm2d,
            torch.nn.InstanceNorm3d,
            torch.nn.LayerNorm,
            torch.nn.LocalResponseNorm,
        )

        params: List[Dict[str, Any]] = []
        memo: Set[torch.nn.parameter.Parameter] = set()
        for module_name, module in model.named_modules():
            for module_param_name, value in module.named_parameters(recurse=False):
                if not value.requires_grad:
                    continue
                # Avoid duplicating parameters
                if value in memo:
                    continue
                memo.add(value)

                hyperparams = copy.copy(defaults)
                if "backbone" in module_name:
                    hyperparams["lr"] = hyperparams["lr"] * cfg.SOLVER.BACKBONE_MULTIPLIER
                if (
                    "relative_position_bias_table" in module_param_name
                    or "absolute_pos_embed" in module_param_name
                ):
                    print(module_param_name)
                    hyperparams["weight_decay"] = 0.0
                if isinstance(module, norm_module_types):
                    hyperparams["weight_decay"] = weight_decay_norm
                if isinstance(module, torch.nn.Embedding):
                    hyperparams["weight_decay"] = weight_decay_embed
                params.append({"params": [value], **hyperparams})

        def maybe_add_full_model_gradient_clipping(optim):
            # detectron2 doesn't have full model gradient clipping now
            clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
            enable = (
                cfg.SOLVER.CLIP_GRADIENTS.ENABLED
                and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
                and clip_norm_val > 0.0
            )

            class FullModelGradientClippingOptimizer(optim):
                def step(self, closure=None):
                    # Clip the norm over all parameters of all groups at once.
                    all_params = itertools.chain(*[x["params"] for x in self.param_groups])
                    torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
                    super().step(closure=closure)

            return FullModelGradientClippingOptimizer if enable else optim

        optimizer_type = cfg.SOLVER.OPTIMIZER
        if optimizer_type == "SGD":
            optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
                params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
            )
        elif optimizer_type == "ADAMW":
            optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
                params, cfg.SOLVER.BASE_LR
            )
        else:
            raise NotImplementedError(f"no optimizer type {optimizer_type}")
        if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
            optimizer = maybe_add_gradient_clipping(cfg, optimizer)
        return optimizer

    @classmethod
    def test(cls, cfg, model, evaluators=None):
        """
        Evaluate the given model. The given model is expected to already contain
        weights to evaluate. Inference runs inside ``torch.cuda.amp.autocast``.

        Args:
            cfg (CfgNode):
            model (nn.Module):
            evaluators (list[DatasetEvaluator] or None): if None, will call
                :meth:`build_evaluator`. Otherwise, must have the same length as
                ``cfg.DATASETS.TEST``.

        Returns:
            dict: a dict of result metrics; when exactly one dataset was
            evaluated, that dataset's results are returned directly.
        """
        from torch.cuda.amp import autocast

        logger = logging.getLogger(__name__)
        if isinstance(evaluators, DatasetEvaluator):
            evaluators = [evaluators]
        if evaluators is not None:
            assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
                len(cfg.DATASETS.TEST), len(evaluators)
            )

        results = OrderedDict()
        for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
            data_loader = cls.build_test_loader(cfg, dataset_name)
            # When evaluators are passed in as arguments,
            # implicitly assume that evaluators can be created before data_loader.
            if evaluators is not None:
                evaluator = evaluators[idx]
            else:
                try:
                    evaluator = cls.build_evaluator(cfg, dataset_name)
                except NotImplementedError:
                    # ``Logger.warn`` is a deprecated alias of ``warning``.
                    logger.warning(
                        "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
                        "or implement its `build_evaluator` method."
                    )
                    results[dataset_name] = {}
                    continue
            with autocast():
                results_i = inference_on_dataset(model, data_loader, evaluator)
            results[dataset_name] = results_i
            if comm.is_main_process():
                assert isinstance(
                    results_i, dict
                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
                    results_i
                )
                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
                print_csv_format(results_i)

        if len(results) == 1:
            results = list(results.values())[0]
        return results
def setup(args):
    """
    Build the frozen config from the config file and command-line overrides,
    run the default setup, and configure the project loggers.
    """
    config = get_cfg()
    # Register the extra config keys before merging; the deeplab config is
    # needed for the poly lr schedule.
    for register_keys in (
        add_deeplab_config,
        add_maskformer2_config,
        add_maskformer2_video_config,
    ):
        register_keys(config)
    config.merge_from_file(args.config_file)
    config.merge_from_list(args.opts)
    config.freeze()
    default_setup(config, args)
    # One logger for the "mask2former" module, one (writing under OUTPUT_DIR)
    # for "mask2former_video".
    setup_logger(name="mask2former")
    setup_logger(
        output=config.OUTPUT_DIR,
        distributed_rank=comm.get_rank(),
        name="mask2former_video",
    )
    return config
def main(args):
    """
    Entry point run in each process: train, or only evaluate when
    ``args.eval_only`` is set.

    Returns the evaluation results in eval-only mode, otherwise whatever
    ``trainer.train()`` returns.
    """
    cfg = setup(args)

    if not args.eval_only:
        trainer = Trainer(cfg)
        trainer.resume_or_load(resume=args.resume)
        return trainer.train()

    # Evaluation only: build the model, load the weights, run the test loop.
    model = Trainer.build_model(cfg)
    checkpointer = DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR)
    checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume)
    metrics = Trainer.test(cfg, model)
    if cfg.TEST.AUG.ENABLED:
        # Test-time augmentation is not implemented here.
        raise NotImplementedError
    if comm.is_main_process():
        verify_results(cfg, metrics)
    return metrics
if __name__ == "__main__":
    # Parse the command line, echo it, then hand `main` to `launch` together
    # with the GPU / machine layout requested on the command line.
    parser = default_argument_parser()
    args = parser.parse_args()
    print("Command Line Args:", args)
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )
================================================
FILE: util/__init__.py
================================================
# ------------------------------------------------------------------------
# SeqFormer
# ------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# ------------------------------------------------------------------------
================================================
FILE: util/box_ops.py
================================================
# ------------------------------------------------------------------------
# SeqFormer
# ------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# ------------------------------------------------------------------------
"""
Utilities for bounding box manipulation and GIoU.
"""
import torch
from torchvision.ops.boxes import box_area
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to
    (x0, y0, x1, y1); the four components live on the last dimension.
    """
    center_x, center_y, width, height = x.unbind(-1)
    half_w = 0.5 * width
    half_h = 0.5 * height
    corners = (
        center_x - half_w,
        center_y - half_h,
        center_x + half_w,
        center_y + half_h,
    )
    return torch.stack(corners, dim=-1)
def box_xyxy_to_cxcywh(x):
    """Convert boxes from (x0, y0, x1, y1) to
    (center_x, center_y, width, height); components are on the last dimension.
    """
    left, top, right, bottom = x.unbind(-1)
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top
    return torch.stack((center_x, center_y, width, height), dim=-1)
# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
    """Pairwise IoU between two sets of xyxy boxes.

    Indexing implies ``boxes1`` is [N, 4] and ``boxes2`` is [M, 4].

    Returns:
        (iou, union): both [N, M]. The IoU denominator carries a 1e-7 epsilon.
    """
    areas_a = box_area(boxes1)
    areas_b = box_area(boxes2)

    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # Non-overlapping pairs get a zero-sized intersection.
    overlap = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    inter = overlap[:, :, 0] * overlap[:, :, 1]  # [N,M]

    union = areas_a[:, None] + areas_b - inter
    return inter / (union + 1e-7), union
def multi_box_iou(boxes1, boxes2):
    """Pairwise IoU computed independently along a leading dimension.

    Indexing implies ``boxes1`` is [nf, N, 4] and ``boxes2`` is [nf, M, 4],
    both in xyxy format.

    Returns:
        (iou, union): both [nf, N, M]. The IoU denominator carries a 1e-7 epsilon.
    """
    # box_area is applied to the flattened boxes, then reshaped back.
    areas_a = box_area(boxes1.flatten(0, 1)).reshape(boxes1.shape[0], boxes1.shape[1])
    areas_b = box_area(boxes2.flatten(0, 1)).reshape(boxes2.shape[0], boxes2.shape[1])

    top_left = torch.max(boxes1[:, :, None, :2], boxes2[:, None, :, :2])  # [nf,N,M,2]
    bottom_right = torch.min(boxes1[:, :, None, 2:], boxes2[:, None, :, 2:])  # [nf,N,M,2]

    overlap = (bottom_right - top_left).clamp(min=0)  # [nf,N,M,2]
    inter = overlap[:, :, :, 0] * overlap[:, :, :, 1]  # [nf,N,M]

    union = areas_a[:, :, None] + areas_b[:, None, :] - inter
    return inter / (union + 1e-7), union
def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/

    The boxes should be in [x0, y0, x1, y1] format

    Returns a [N, M] pairwise matrix, where N = len(boxes1)
    and M = len(boxes2)
    """
    # degenerate boxes gives inf / nan results
    # so do an early check
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    iou, union = box_iou(boxes1, boxes2)

    # Smallest box enclosing each (boxes1[i], boxes2[j]) pair.
    enclose_tl = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    enclose_br = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
    enclose_wh = (enclose_br - enclose_tl).clamp(min=0)  # [N,M,2]
    enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1]

    # The epsilon keeps the division finite when the enclosing area is zero.
    penalty = (enclose_area - union) / (enclose_area + 1e-7)
    return iou - penalty
def generalized_multi_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/

    The boxes should be in [x0, y0, x1, y1] format
    boxes1.shape = [nf, N, 4]
    boxes2.shape = [nf, M, 4]

    Returns a [nf, N, M] pairwise matrix, where N = boxes1.shape[1]
    and M = boxes2.shape[1]
    """
    # degenerate boxes gives inf / nan results
    # so do an early check
    assert (boxes1[:, :, 2:] >= boxes1[:, :, :2]).all()
    assert (boxes2[:, :, 2:] >= boxes2[:, :, :2]).all()
    iou, union = multi_box_iou(boxes1, boxes2)

    # Smallest box enclosing each pair, computed per leading index.
    enclose_tl = torch.min(boxes1[:, :, None, :2], boxes2[:, None, :, :2])
    enclose_br = torch.max(boxes1[:, :, None, 2:], boxes2[:, None, :, 2:])
    enclose_wh = (enclose_br - enclose_tl).clamp(min=0)  # [nf,N,M,2]
    enclose_area = enclose_wh[:, :, :, 0] * enclose_wh[:, :, :, 1]

    # The epsilon keeps the division finite when the enclosing area is zero.
    penalty = (enclose_area - union) / (enclose_area + 1e-7)
    return iou - penalty
def masks_to_boxes(masks):
    """Compute the bounding boxes around the provided masks

    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.

    Returns a [N, 4] tensors, with the boxes in xyxy format. An empty input
    (``masks.numel() == 0``) yields a [0, 4] tensor. For a mask with no
    foreground pixel the minima come out as the 1e8 fill value.
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device)

    h, w = masks.shape[-2:]

    # Coordinate ramps shaped for broadcasting against [N, H, W]. This gives the
    # same products as the former torch.meshgrid(y, x) grids, without relying on
    # meshgrid's default indexing (which warns on newer torch releases).
    ys = torch.arange(0, h, dtype=torch.float, device=masks.device).view(1, h, 1)
    xs = torch.arange(0, w, dtype=torch.float, device=masks.device).view(1, 1, w)

    background = ~(masks.bool())

    x_mask = masks * xs
    x_max = x_mask.flatten(1).max(-1)[0]
    # Background pixels are pushed to a huge value so they never win the min.
    x_min = x_mask.masked_fill(background, 1e8).flatten(1).min(-1)[0]

    y_mask = masks * ys
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(background, 1e8).flatten(1).min(-1)[0]

    return torch.stack([x_min, y_min, x_max, y_max], 1)
================================================
FILE: util/misc.py
================================================
# ------------------------------------------------------------------------
# SeqFormer
# ------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# ------------------------------------------------------------------------
"""
Misc functions, including distributed helpers.
Mostly copy-paste from torchvision references.
"""
import os
import subprocess
import time
from collections import defaultdict, deque
import datetime
import pickle
from typing import Optional, List
import torch
import torch.nn as nn
import torch.distributed as dist
from torch import Tensor
# needed due to empty tensor bug in pytorch and torchvision 0.5
import torchvision
# Compare (major, minor) as integers. Slicing the first three characters of the
# version string and converting to float turns e.g. "0.10.0" into 0.1, which
# wrongly selects the legacy branch below and fails on its imports.
_TORCHVISION_VERSION = tuple(
    int(part) for part in torchvision.__version__.split(".")[:2]
)

if _TORCHVISION_VERSION < (0, 5):
    import math
    from torchvision.ops.misc import _NewEmptyTensorOp

    def _check_size_scale_factor(dim, size, scale_factor):
        # type: (int, Optional[List[int]], Optional[float]) -> None
        """Validate that exactly one of ``size`` / ``scale_factor`` is given.

        Raises:
            ValueError: if both or neither are given, or if a sequence-valued
                ``scale_factor`` does not have ``dim`` entries.
        """
        if size is None and scale_factor is None:
            raise ValueError("either size or scale_factor should be defined")
        if size is not None and scale_factor is not None:
            raise ValueError("only one of size or scale_factor should be defined")
        # Only a sequence has a length to validate. The previous condition was
        # inverted and raised (or crashed on len()) for every valid call.
        if isinstance(scale_factor, (list, tuple)) and len(scale_factor) != dim:
            raise ValueError(
                "scale_factor shape must match input shape. "
                "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor))
            )

    def _output_size(dim, input, size, scale_factor):
        # type: (int, Tensor, Optional[List[int]], Optional[float]) -> List[int]
        """Return the spatial output size for a 2D resize of ``input``.

        ``size`` is returned unchanged when given; otherwise each of the last
        two input dimensions is scaled by the scalar ``scale_factor`` and floored.
        """
        assert dim == 2
        _check_size_scale_factor(dim, size, scale_factor)
        if size is not None:
            return size
        # if dim is not 2 or scale_factor is iterable use _ntuple instead of concat
        assert scale_factor is not None and isinstance(scale_factor, (int, float))
        scale_factors = [scale_factor, scale_factor]
        # math.floor might return float in py2.7
        return [
            int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim)
        ]
elif _TORCHVISION_VERSION < (0, 7):
    from torchvision.ops import _new_empty_tensor
    from torchvision.ops.misc import _output_size
class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20, fmt=None):
        # The deque only keeps the most recent `window_size` values;
        # `total` / `count` accumulate over the whole series.
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = "{median:.4f} ({global_avg:.4f})" if fmt is None else fmt

    def update(self, value, n=1):
        """Record `value`, weighted by `n` in the global statistics."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        stats = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(stats)
        reduced_count, reduced_total = stats.tolist()
        self.count = int(reduced_count)
        self.total = reduced_total

    @property
    def median(self):
        window = torch.tensor(list(self.deque))
        return window.median().item()

    @property
    def avg(self):
        window = torch.tensor(list(self.deque), dtype=torch.float32)
        return window.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        stats = {
            "median": self.median,
            "avg": self.avg,
            "global_avg": self.global_avg,
            "max": self.max,
            "value": self.value,
        }
        return self.fmt.format(**stats)
def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)

    Args:
        data: any picklable object

    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # Pickle the object and wrap the bytes in a CUDA byte tensor.
    payload = pickle.dumps(data)
    tensor = torch.ByteTensor(torch.ByteStorage.from_buffer(payload)).to("cuda")

    # Exchange payload sizes so every rank knows how much to read back.
    local_size = torch.tensor([tensor.numel()], device="cuda")
    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # torch all_gather does not support gathering tensors of different
    # shapes, so every payload is padded up to the largest one.
    tensor_list = [
        torch.empty((max_size,), dtype=torch.uint8, device="cuda") for _ in size_list
    ]
    if local_size != max_size:
        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    # Drop the padding of each received buffer before unpickling it.
    return [
        pickle.loads(received.cpu().numpy().tobytes()[:size])
        for size, received in zip(size_list, tensor_list)
    ]
def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum

    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction. With fewer than two processes the input dict
    is returned as is.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict

    with torch.no_grad():
        # sort the keys so that they are consistent across processes
        names = sorted(input_dict.keys())
        values = torch.stack([input_dict[name] for name in names], dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = dict(zip(names, values))
    return reduced_dict
class MetricLogger(object):
    """Collect named ``SmoothedValue`` meters and print periodic progress lines.

    Meters are created on first use and are also reachable as attributes
    (``logger.loss`` returns ``logger.meters["loss"]``).
    """

    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        """Feed one value per keyword into the meter of the same name.

        Tensor values are converted with ``.item()``; every value must end up
        as a float or an int.
        """
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        # Only called when normal lookup fails. Read `meters` through __dict__:
        # going through `self.meters` would re-enter __getattr__ forever when
        # the instance has no `meters` yet (e.g. created without __init__).
        meters = self.__dict__.get("meters")
        if meters is not None and attr in meters:
            return meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        loss_str = [
            "{}: {}".format(name, str(meter)) for name, meter in self.meters.items()
        ]
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        """Yield the items of ``iterable``, printing progress every ``print_freq`` items.

        ``iterable`` must support ``len()``. A progress line is printed for
        every ``print_freq``-th item and for the last one, and a total-time
        summary is printed once the iterable is exhausted.
        """
        if not header:
            header = ''
        total = len(iterable)
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        space_fmt = ':' + str(len(str(total))) + 'd'
        cuda_available = torch.cuda.is_available()

        # The memory column is only part of the line when CUDA is available.
        parts = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}',
        ]
        if cuda_available:
            parts.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(parts)

        MB = 1024.0 * 1024.0
        for i, obj in enumerate(iterable):
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == total - 1:
                eta_seconds = iter_time.global_avg * (total - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                fields = dict(
                    eta=eta_string,
                    meters=str(self),
                    time=str(iter_time),
                    data=str(data_time),
                )
                if cuda_available:
                    fields['memory'] = torch.cuda.max_memory_allocated() / MB
                print(log_msg.format(i, total, **fields))
            end = time.time()

        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        # max(total, 1) avoids a ZeroDivisionError when the iterable is empty.
        print('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / max(total, 1)))
def get_sha():
    """Describe the git state of the directory containing this file.

    Returns a string of the form ``sha: ..., status: ..., branch: ...``. Any
    failure while querying git is swallowed and leaves the fields not yet
    filled in at their defaults ('N/A' / "clean").
    """
    cwd = os.path.dirname(os.path.abspath(__file__))

    def _git(*git_args):
        raw = subprocess.check_output(['git', *git_args], cwd=cwd)
        return raw.decode('ascii').strip()

    sha, diff, branch = 'N/A', "clean", 'N/A'
    try:
        sha = _git('rev-parse', 'HEAD')
        # The output of `git diff` is discarded; only diff-index is inspected.
        subprocess.check_output(['git', 'diff'], cwd=cwd)
        changed = _git('diff-index', 'HEAD')
        diff = "has uncommited changes" if changed else "clean"
        branch = _git('rev-parse', '--abbrev-ref', 'HEAD')
    except Exception:
        pass
    return f"sha: {sha}, status: {diff}, branch: {branch}"
def collate_fn(batch):
    """Transpose a batch of per-sample tuples into a tuple of per-field groups.

    The first field is packed with ``nested_tensor_from_tensor_list`` using a
    size divisibility of 32; the remaining fields are left as tuples.
    """
    fields = list(zip(*batch))
    fields[0] = nested_tensor_from_tensor_list(fields[0], size_divisibility=32)
    return tuple(fields)
def _max_by_axis(the_list):
# type: (List[List[int]]) -> List[int]
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
def nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisibility=1, split=True):
    """Pad a list of 3-dim tensors to a common shape and pack them into a NestedTensor.

    With ``split=True`` every input tensor is first cut into chunks of 3 along
    dim 0 and all chunks are padded as individual entries. The padded batch is
    zero-filled; the boolean mask is False where an entry has data and True on
    padding. With ``size_divisibility > 1`` the last two dims of the padded
    shape are rounded up to a multiple of it.

    Raises:
        ValueError: if the first tensor is not 3-dimensional.
    """
    if split:
        tensor_list = [
            chunk for tensor in tensor_list for chunk in tensor.split(3, dim=0)
        ]

    # TODO make this more general
    if tensor_list[0].ndim != 3:
        raise ValueError('not supported')

    # TODO make it support different-sized images
    max_size = _max_by_axis([list(img.shape) for img in tensor_list])
    if size_divisibility > 1:
        stride = size_divisibility
        # the last two dims are H,W, both subject to divisibility requirement
        for axis in (-2, -1):
            max_size[axis] = (max_size[axis] + (stride - 1)) // stride * stride

    batch_shape = [len(tensor_list)] + max_size
    b, c, h, w = batch_shape
    reference = tensor_list[0]
    tensor = torch.zeros(batch_shape, dtype=reference.dtype, device=reference.device)
    mask = torch.ones((b, h, w), dtype=torch.bool, device=reference.device)
    for img, pad_img, m in zip(tensor_list, tensor, mask):
        # Copy the data into the top-left corner and unmask that region.
        pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
        m[: img.shape[1], :img.shape[2]] = False
    return NestedTensor(tensor, mask)
class NestedTensor(object):
    """Pair of a tensor batch and an optional mask that travel together."""

    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors
        self.mask = mask

    def to(self, device, non_blocking=False):
        # type: (Device) -> NestedTensor # noqa
        """Return a new NestedTensor with tensors (and mask, if any) moved to `device`."""
        moved_tensors = self.tensors.to(device, non_blocking=non_blocking)
        if self.mask is None:
            moved_mask = None
        else:
            moved_mask = self.mask.to(device, non_blocking=non_blocking)
        return NestedTensor(moved_tensors, moved_mask)

    def record_stream(self, *args, **kwargs):
        """Forward record_stream to the tensors and, if present, the mask."""
        self.tensors.record_stream(*args, **kwargs)
        if self.mask is not None:
            self.mask.record_stream(*args, **kwargs)

    def decompose(self):
        """Return the (tensors, mask) pair."""
        return self.tensors, self.mask

    def __repr__(self):
        return str(self.tensors)
def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process

    The builtin ``print`` is replaced process-wide; a call can still be forced
    through on any process by passing ``force=True``.
    """
    import builtins as __builtin__

    original_print = __builtin__.print

    def print(*args, force=False, **kwargs):
        if not (is_master or force):
            return
        original_print(*args, **kwargs)

    __builtin__.print = print
def is_dist_avail_and_initialized():
    """Return True only if torch.distributed is available and initialized."""
    return dist.is_available() and dist.is_initialized()
def get_world_size():
    """Return the distributed world size, or 1 when not running distributed."""
    return dist.get_world_size() if is_dist_avail_and_initialized() else 1
def get_rank():
    """Return this process's distributed rank, or 0 when not running distributed."""
    return dist.get_rank() if is_dist_avail_and_initialized() else 0
def get_local_size():
    """Return LOCAL_SIZE from the environment, or 1 when not running distributed."""
    if is_dist_avail_and_initialized():
        return int(os.environ['LOCAL_SIZE'])
    return 1
def get_local_rank():
    """Return LOCAL_RANK from the environment, or 0 when not running distributed."""
    if is_dist_avail_and_initialized():
        return int(os.environ['LOCAL_RANK'])
    return 0
def is_main_process():
    """Return True when this process has rank 0."""
    rank = get_rank()
    return rank == 0
def save_on_master(*args, **kwargs):
    """Forward the arguments to torch.save, but only on the main process."""
    if not is_main_process():
        return
    torch.save(*args, **kwargs)
def init_distributed_mode(args):
    """Configure torch.distributed from the environment and record the result on `args`.

    Two launch styles are recognised: RANK / WORLD_SIZE already present in the
    environment, or a SLURM job (SLURM_PROCID present). Otherwise
    ``args.distributed`` is set to False and nothing else happens. In the
    distributed cases the CUDA device is selected, the NCCL process group is
    initialised and printing is restricted to rank 0.
    """
    env = os.environ
    if 'RANK' in env and 'WORLD_SIZE' in env:
        args.rank = int(env["RANK"])
        args.world_size = int(env['WORLD_SIZE'])
        args.gpu = int(env['LOCAL_RANK'])
        args.dist_url = 'env://'
        env['LOCAL_SIZE'] = str(torch.cuda.device_count())
    elif 'SLURM_PROCID' in env:
        proc_id = int(env['SLURM_PROCID'])
        ntasks = int(env['SLURM_NTASKS'])
        node_list = env['SLURM_NODELIST']
        num_gpus = torch.cuda.device_count()
        # The first hostname reported by scontrol becomes the master address.
        addr = subprocess.getoutput(
            'scontrol show hostname {} | head -n1'.format(node_list))
        local_rank = proc_id % num_gpus
        # Export the variables that the env:// init method reads.
        env['MASTER_PORT'] = env.get('MASTER_PORT', '29500')
        env['MASTER_ADDR'] = addr
        env['WORLD_SIZE'] = str(ntasks)
        env['RANK'] = str(proc_id)
        env['LOCAL_RANK'] = str(local_rank)
        env['LOCAL_SIZE'] = str(num_gpus)
        args.dist_url = 'env://'
        args.world_size = ntasks
        args.rank = proc_id
        args.gpu = local_rank
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)
@torch.no_grad()
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k

    Args:
        output: per-sample scores; top-k is taken along dim 1.
        target: the correct class index for each sample.
        topk: the k values to evaluate.

    Returns:
        A list with one 0-dim tensor per k, holding the percentage of samples
        whose target is among the k highest-scoring classes. For an empty
        ``target`` a single-element list containing a zero tensor is returned.
    """
    if target.numel() == 0:
        return [torch.zeros([], device=output.device)]
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # `correct` derives from a transposed tensor, so a multi-row slice may
        # be non-contiguous; reshape copes with that where view(-1) raises.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
    """
    Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
    This will eventually be supported natively by PyTorch, and this
    function can go away.
    """
    # Compare (major, minor) as integers: float(__version__[:3]) reads "0.10.0"
    # as 0.1 and would send recent torchvision releases down the legacy path.
    tv_version = tuple(int(part) for part in torchvision.__version__.split(".")[:2])

    if tv_version < (0, 7):
        if input.numel() > 0:
            return torch.nn.functional.interpolate(
                input, size, scale_factor, mode, align_corners
            )

        # Empty input: compute the output shape by hand and build an empty tensor.
        output_shape = _output_size(2, input, size, scale_factor)
        output_shape = list(input.shape[:-2]) + list(output_shape)
        if tv_version < (0, 5):
            return _NewEmptyTensorOp.apply(input, output_shape)
        return _new_empty_tensor(input, output_shape)

    # Use torchvision's interpolate when this build exposes it; otherwise fall
    # back to torch's own implementation.
    resize = getattr(torchvision.ops.misc, "interpolate", torch.nn.functional.interpolate)
    return resize(input, size, scale_factor, mode, align_corners)
def get_total_grad_norm(parameters, norm_type=2):
    """Return the `norm_type`-norm over the gradients of all given parameters.

    Parameters without a gradient are ignored. The per-parameter norms are
    moved to the device of the first gradient before being combined.
    """
    with_grad = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    device = with_grad[0].grad.device
    per_param_norms = [
        torch.norm(p.grad.detach(), norm_type).to(device) for p in with_grad
    ]
    return torch.norm(torch.stack(per_param_norms), norm_type)
def inverse_sigmoid(x, eps=1e-5):
    """Return log(x / (1 - x)) with x clamped to [0, 1].

    Numerator and denominator are each clamped to at least `eps`, so the
    result stays finite at the ends of the interval.
    """
    clipped = x.clamp(min=0, max=1)
    numerator = clipped.clamp(min=eps)
    denominator = (1 - clipped).clamp(min=eps)
    return torch.log(numerator / denominator)
================================================
FILE: util/plot_utils.py
================================================
# ------------------------------------------------------------------------
# SeqFormer
# ------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# ------------------------------------------------------------------------
"""
Plotting utilities to visualize training logs.
"""
import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path, PurePath
def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'):
    '''
    Function to plot specific fields from training log(s). Plots both training and test results.

    :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file
              - fields = which results to plot from each log file - plots both training and test for each field.
              - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots
              - log_name = optional, name of log file if different than default 'log.txt'.

    :: Outputs - matplotlib plots of results in fields, color coded for each log file.
               - solid lines are training results, dashed lines are test results.

    Raises ValueError when `logs` is neither a Path nor a list of Paths, or
    when one of the directories does not exist.
    '''
    # `pd.np` is no longer available in current pandas releases, so numpy is
    # imported directly.
    import numpy as np

    func_name = "plot_utils.py::plot_logs"

    # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path,
    # convert single Path to list to avoid 'not iterable' error
    if not isinstance(logs, list):
        if isinstance(logs, PurePath):
            logs = [logs]
            print(f"{func_name} info: logs param expects a list argument, converted to list[Path].")
        else:
            raise ValueError(
                f"{func_name} - invalid argument for logs parameter.\n "
                f"Expect list[Path] or single Path obj, received {type(logs)}"
            )

    # verify valid dir(s) and that every item in list is Path object
    for log_dir in logs:
        if not isinstance(log_dir, PurePath):
            raise ValueError(f"{func_name} - non-Path object in logs argument of {type(log_dir)}: \n{log_dir}")
        if not log_dir.exists():
            raise ValueError(f"{func_name} - invalid directory in logs argument:\n{log_dir}")

    # load log file(s) and plot
    dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs]

    # squeeze=False always yields a 2D array of axes, so indexing by column
    # also works when only a single field is requested.
    fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5), squeeze=False)
    axs = axs[0]

    for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))):
        for j, field in enumerate(fields):
            if field == 'mAP':
                # Entry 1 of every logged test_coco_eval row is plotted as mAP.
                coco_eval = pd.DataFrame(
                    np.stack(df.test_coco_eval.dropna().values)[:, 1]
                ).ewm(com=ewm_col).mean()
                axs[j].plot(coco_eval, c=color)
            else:
                df.interpolate().ewm(com=ewm_col).mean().plot(
                    y=[f'train_{field}', f'test_{field}'],
                    ax=axs[j],
                    color=[color] * 2,
                    style=['-', '--']
                )
    for ax, field in zip(axs, fields):
        ax.legend([Path(p).name for p in logs])
        ax.set_title(field)
def plot_precision_recall(files, naming_scheme='iter'):
    """Plot precision/recall and score/recall curves for saved evaluation files.

    Each file is read with ``torch.load`` and must provide 'precision',
    'params' (with ``recThrs``), 'scores' and 'recall'. Curves are labelled by
    the file stem ('iter') or by the third-from-last path component ('exp_id').

    Returns:
        (fig, axs): the matplotlib figure and its two axes.

    Raises:
        ValueError: for any other ``naming_scheme``.
    """
    if naming_scheme not in ('exp_id', 'iter'):
        raise ValueError(f'not supported {naming_scheme}')
    # 'exp_id': name becomes exp_id; 'iter': name is the file stem
    names = [f.parts[-3] if naming_scheme == 'exp_id' else f.stem for f in files]

    fig, axs = plt.subplots(ncols=2, figsize=(16, 5))
    palette = sns.color_palette("Blues", n_colors=len(files))
    for f, color, name in zip(files, palette, names):
        data = torch.load(f)
        # precision is n_iou, n_points, n_cat, n_area, max_det
        precision = data['precision']
        recall = data['params'].recThrs
        scores = data['scores']
        # take precision for all classes, all areas and 100 detections
        precision = precision[0, :, :, 0, -1].mean(1)
        scores = scores[0, :, :, 0, -1].mean(1)
        prec = precision.mean()
        rec = data['recall'][0, :, 0, -1].mean()
        summary = (
            f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, '
            f'score={scores.mean():0.3f}, '
            f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}'
        )
        print(summary)
        axs[0].plot(recall, precision, c=color)
        axs[1].plot(recall, scores, c=color)

    for ax, title in zip(axs[:2], ('Precision / Recall', 'Scores / Recall')):
        ax.set_title(title)
        ax.legend(names)
    return fig, axs