Full Code of SysCV/MaskFreeVIS for AI

main 0e7018b7fe61 cached
215 files
1.3 MB
370.7k tokens
1000 symbols
1 requests
Download .txt
Showing preview only (1,442K chars total). Download the full file or copy to clipboard to get everything.
Repository: SysCV/MaskFreeVIS
Branch: main
Commit: 0e7018b7fe61
Files: 215
Total size: 1.3 MB

Directory structure:
gitextract_tlc1nw96/

├── DATASET_prepare.md
├── LICENSE
├── README.md
├── configs/
│   ├── coco/
│   │   └── instance-segmentation/
│   │       ├── Base-COCO-InstanceSegmentation.yaml
│   │       └── maskformer2_R50_bs16_50ep.yaml
│   └── youtubevis_2019/
│       ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml
│       ├── Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml
│       ├── Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml
│       ├── swin/
│       │   └── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml
│       ├── video_maskformer2_R101_bs16_8ep.yaml
│       ├── video_maskformer2_R50_bs16_8ep.yaml
│       └── video_maskformer2_R50_bs16_8ep_swin.yaml
├── demo/
│   ├── README.md
│   ├── demo.py
│   └── predictor.py
├── demo_video/
│   ├── README.md
│   ├── demo.py
│   ├── predictor.py
│   └── visualizer.py
├── mask2former/
│   ├── __init__.py
│   ├── config.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── dataset_mappers/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   ├── coco_instance_new_baseline_dataset_mapper.py
│   │   │   ├── coco_panoptic_new_baseline_dataset_mapper.py
│   │   │   ├── mask_former_instance_dataset_mapper.py
│   │   │   ├── mask_former_panoptic_dataset_mapper.py
│   │   │   └── mask_former_semantic_dataset_mapper.py
│   │   └── datasets/
│   │       ├── __init__.py
│   │       ├── register_ade20k_full.py
│   │       ├── register_ade20k_instance.py
│   │       ├── register_ade20k_panoptic.py
│   │       ├── register_coco_panoptic_annos_semseg.py
│   │       ├── register_coco_stuff_10k.py
│   │       ├── register_mapillary_vistas.py
│   │       └── register_mapillary_vistas_panoptic.py
│   ├── evaluation/
│   │   ├── __init__.py
│   │   ├── __init__.py.new
│   │   └── instance_evaluation.py
│   ├── maskformer_model.py
│   ├── modeling/
│   │   ├── __init__.py
│   │   ├── backbone/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   └── swin.py
│   │   ├── criterion.py
│   │   ├── matcher.py
│   │   ├── meta_arch/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   ├── mask_former_head.py
│   │   │   └── per_pixel_baseline.py
│   │   ├── pixel_decoder/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   ├── fpn.py
│   │   │   ├── msdeformattn.py
│   │   │   └── ops/
│   │   │       ├── functions/
│   │   │       │   ├── __init__.py
│   │   │       │   └── ms_deform_attn_func.py
│   │   │       ├── make.sh
│   │   │       ├── modules/
│   │   │       │   ├── __init__.py
│   │   │       │   └── ms_deform_attn.py
│   │   │       ├── setup.py
│   │   │       ├── src/
│   │   │       │   ├── cpu/
│   │   │       │   │   ├── ms_deform_attn_cpu.cpp
│   │   │       │   │   └── ms_deform_attn_cpu.h
│   │   │       │   ├── cuda/
│   │   │       │   │   ├── ms_deform_attn_cuda.cu
│   │   │       │   │   ├── ms_deform_attn_cuda.h
│   │   │       │   │   └── ms_deform_im2col_cuda.cuh
│   │   │       │   ├── ms_deform_attn.h
│   │   │       │   └── vision.cpp
│   │   │       └── test.py
│   │   └── transformer_decoder/
│   │       ├── __init__.py
│   │       ├── mask2former_transformer_decoder.py
│   │       ├── maskformer_transformer_decoder.py
│   │       ├── position_encoding.py
│   │       └── transformer.py
│   ├── test_time_augmentation.py
│   └── utils/
│       ├── __init__.py
│       ├── __init__.py.new
│       └── misc.py
├── mask2former_video/
│   ├── __init__.py
│   ├── config.py
│   ├── data_video/
│   │   ├── __init__.py
│   │   ├── augmentation.py
│   │   ├── build.py
│   │   ├── combined_loader.py
│   │   ├── dataset_mapper.py
│   │   ├── datasets/
│   │   │   ├── __init__.py
│   │   │   ├── builtin.py
│   │   │   ├── ytvis.py
│   │   │   └── ytvis_api/
│   │   │       ├── __init__.py
│   │   │       ├── ytvos.py
│   │   │       └── ytvoseval.py
│   │   └── ytvis_eval.py
│   ├── modeling/
│   │   ├── __init__.py
│   │   ├── criterion.py
│   │   ├── matcher.py
│   │   └── transformer_decoder/
│   │       ├── __init__.py
│   │       ├── position_encoding.py
│   │       └── video_mask2former_transformer_decoder.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── __init__.py.new
│   │   └── memory.py
│   └── video_maskformer_model.py
├── mfvis_nococo/
│   ├── __init__.py
│   ├── configs/
│   │   └── youtubevis_2019/
│   │       ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml
│   │       ├── video_maskformer2_R101_bs16_8ep_coco.yaml
│   │       ├── video_maskformer2_R50_bs16_8ep.yaml
│   │       └── video_maskformer2_R50_bs16_8ep_coco.yaml
│   ├── mask2former/
│   │   ├── __init__.py
│   │   ├── config.py
│   │   ├── data/
│   │   │   ├── __init__.py
│   │   │   ├── dataset_mappers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   ├── coco_instance_new_baseline_dataset_mapper.py
│   │   │   │   ├── coco_panoptic_new_baseline_dataset_mapper.py
│   │   │   │   ├── mask_former_instance_dataset_mapper.py
│   │   │   │   ├── mask_former_panoptic_dataset_mapper.py
│   │   │   │   └── mask_former_semantic_dataset_mapper.py
│   │   │   └── datasets/
│   │   │       ├── __init__.py
│   │   │       ├── register_ade20k_full.py
│   │   │       ├── register_ade20k_instance.py
│   │   │       ├── register_ade20k_panoptic.py
│   │   │       ├── register_coco_panoptic_annos_semseg.py
│   │   │       ├── register_coco_stuff_10k.py
│   │   │       ├── register_mapillary_vistas.py
│   │   │       └── register_mapillary_vistas_panoptic.py
│   │   ├── evaluation/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   └── instance_evaluation.py
│   │   ├── maskformer_model.py
│   │   ├── modeling/
│   │   │   ├── __init__.py
│   │   │   ├── backbone/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   └── swin.py
│   │   │   ├── criterion.py
│   │   │   ├── matcher.py
│   │   │   ├── meta_arch/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   ├── mask_former_head.py
│   │   │   │   └── per_pixel_baseline.py
│   │   │   ├── pixel_decoder/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   ├── fpn.py
│   │   │   │   ├── msdeformattn.py
│   │   │   │   └── ops/
│   │   │   │       ├── functions/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   └── ms_deform_attn_func.py
│   │   │   │       ├── make.sh
│   │   │   │       ├── modules/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   └── ms_deform_attn.py
│   │   │   │       ├── setup.py
│   │   │   │       ├── src/
│   │   │   │       │   ├── cpu/
│   │   │   │       │   │   ├── ms_deform_attn_cpu.cpp
│   │   │   │       │   │   └── ms_deform_attn_cpu.h
│   │   │   │       │   ├── cuda/
│   │   │   │       │   │   ├── ms_deform_attn_cuda.cu
│   │   │   │       │   │   ├── ms_deform_attn_cuda.h
│   │   │   │       │   │   └── ms_deform_im2col_cuda.cuh
│   │   │   │       │   ├── ms_deform_attn.h
│   │   │   │       │   └── vision.cpp
│   │   │   │       └── test.py
│   │   │   └── transformer_decoder/
│   │   │       ├── __init__.py
│   │   │       ├── mask2former_transformer_decoder.py
│   │   │       ├── maskformer_transformer_decoder.py
│   │   │       ├── position_encoding.py
│   │   │       └── transformer.py
│   │   ├── test_time_augmentation.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── __init__.py.new
│   │       └── misc.py
│   ├── mask2former_video/
│   │   ├── __init__.py
│   │   ├── config.py
│   │   ├── data_video/
│   │   │   ├── __init__.py
│   │   │   ├── augmentation.py
│   │   │   ├── build.py
│   │   │   ├── dataset_mapper.py
│   │   │   ├── datasets/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── builtin.py
│   │   │   │   ├── ytvis.py
│   │   │   │   └── ytvis_api/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── ytvos.py
│   │   │   │       └── ytvoseval.py
│   │   │   └── ytvis_eval.py
│   │   ├── modeling/
│   │   │   ├── __init__.py
│   │   │   ├── criterion.py
│   │   │   ├── matcher.py
│   │   │   └── transformer_decoder/
│   │   │       ├── __init__.py
│   │   │       ├── position_encoding.py
│   │   │       └── video_mask2former_transformer_decoder.py
│   │   ├── utils/
│   │   │   ├── __init__.py
│   │   │   └── memory.py
│   │   └── video_maskformer_model.py
│   ├── scripts/
│   │   ├── eval_8gpu_mask2former_r101_video.sh
│   │   ├── train_8gpu_mask2former_r101_video_coco.sh
│   │   ├── train_8gpu_mask2former_r50_video.sh
│   │   ├── train_8gpu_mask2former_r50_video_coco.sh
│   │   ├── visual_video_r101.sh
│   │   └── visual_video_r50.sh
│   └── train_net_video.py
├── requirements.txt
├── scripts/
│   ├── eval_8gpu_mask2former_r101_video.sh
│   ├── eval_8gpu_mask2former_r50_video.sh
│   ├── eval_8gpu_mask2former_swinl_video.sh
│   ├── train_8gpu_mask2former_r101_video.sh
│   ├── train_8gpu_mask2former_r50_video.sh
│   ├── train_8gpu_mask2former_swinl_video.sh
│   └── visual_video.sh
├── tools/
│   ├── README.md
│   ├── analyze_model.py
│   ├── convert-pretrained-swin-model-to-d2.py
│   ├── convert-torchvision-to-d2.py
│   ├── evaluate_coco_boundary_ap.py
│   └── evaluate_pq_for_semantic_segmentation.py
├── train_net.py
├── train_net_video.py
└── util/
    ├── __init__.py
    ├── box_ops.py
    ├── misc.py
    └── plot_utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: DATASET_prepare.md
================================================
# Prepare Datasets for MaskFreeVIS

A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog)
for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc).
This document explains how to setup the builtin datasets so they can be used by the above APIs.
[Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`,
and how to add new datasets to them.

MaskFreeVIS has builtin support for a few datasets.
The datasets are assumed to exist in a directory specified by the environment variable
`DETECTRON2_DATASETS`.

You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
If left unset, the default is `./datasets` relative to your current working directory.

The model zoo contains configs and models that use these builtin datasets. We convert each object mask to a bounding box after reading the corresponding instance annotation.

## Expected dataset structure for [COCO](https://cocodataset.org/#download):

```
coco/
  annotations/
    instances_{train,val}2017.json
    panoptic_{train,val}2017.json
  {train,val}2017/
    # image files that are mentioned in the corresponding json
  panoptic_{train,val}2017/  # png annotations
  panoptic_semseg_{train,val}2017/  # generated by the script mentioned below
```

Install panopticapi by:
```
pip install git+https://github.com/cocodataset/panopticapi.git
```
Then, run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py`, to extract semantic annotations from panoptic annotations (only used for evaluation).


## Expected dataset structure for [YouTubeVIS 2019](https://competitions.codalab.org/competitions/20128):

```
ytvis_2019/
  {train,valid,test}.json
  {train,valid,test}/
    Annotations/
    JPEGImages/
```

## Expected dataset structure for [YouTubeVIS 2021](https://competitions.codalab.org/competitions/28988):

```
ytvis_2021/
  {train,valid,test}.json
  {train,valid,test}/
    Annotations/
    JPEGImages/
```


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# MaskFreeVIS

Mask-Free Video Instance Segmentation [CVPR 2023].

This is the official pytorch implementation of [MaskFreeVIS](https://github.com/SysCV/MaskFreeVis/) built on the open-source detectron2. We aim to **remove the necessity for expensive video masks and even image masks** for training VIS models. Our project website contains more information, including the visual video comparison: [vis.xyz/pub/maskfreevis](https://www.vis.xyz/pub/maskfreevis/).


> [**Mask-Free Video Instance Segmentation**](https://arxiv.org/abs/2303.15904)           
> Lei Ke, Martin Danelljan, Henghui Ding, Yu-Wing Tai, Chi-Keung Tang, Fisher Yu \
> CVPR 2023

Highlights
-----------------
- **High-performing** video instance segmentation **without using any video mask or even image mask** labels. Using SwinL and built on Mask2Former, MaskFreeVIS achieves 56.0 AP on YTVIS without using any video mask labels. Using ResNet-101, MaskFreeVIS achieves 49.1 AP without using video masks, and 47.3 AP when only using a COCO-mask-initialized model.
- **Novelty:** a new **parameter-free** Temporal KNN-patch Loss (TK-Loss), which leverages temporal mask consistency using unsupervised one-to-k patch correspondence.
- **Simple:** TK-Loss can be flexibly integrated into state-of-the-art transformer-based VIS models, and has no trainable parameters.

Visualization results of MaskFreeVIS
-----------------

<table>
  <tr>
    <td><img src="vis_demos/example1.gif" width="350"></td>
    <td><img src="vis_demos/example2.gif" width="350"></td>
  </tr>
  <tr>
    <td><img src="vis_demos/example3.gif" width="350"></td>
    <td><img src="vis_demos/example4.gif" width="350"></td>
  </tr>
</table>

Introduction
-----------------
The recent advancement in Video Instance Segmentation (VIS) has largely been driven by the use of deeper and increasingly data-hungry transformer-based models. However, video masks are tedious and expensive to annotate, limiting the scale and diversity of existing VIS datasets. In this work, we aim to remove the mask-annotation requirement. We propose MaskFreeVIS, achieving highly competitive VIS performance, while only using bounding box annotations for the object state. We leverage the rich temporal mask consistency constraints in videos by introducing the Temporal KNN-patch Loss (TK-Loss), providing strong mask supervision without any labels. Our TK-Loss finds one-to-many matches across frames, through an efficient patch-matching step followed by a K-nearest neighbor selection. A consistency loss is then enforced on the found matches. Our mask-free objective is simple to implement, has no trainable parameters, is computationally efficient, yet outperforms baselines employing, e.g., state-of-the-art optical flow to enforce temporal mask consistency. We validate MaskFreeVIS on the YouTube-VIS 2019/2021, OVIS and BDD100K MOTS benchmarks. The results clearly demonstrate the efficacy of our method by drastically narrowing the gap between fully and weakly-supervised VIS performance.


Methods
-----------------
<img width="1096" alt="image" src="https://user-images.githubusercontent.com/17427852/228353991-ff09784f-9afd-4ac2-bddf-c5b2763d25e6.png">

### **Installation**
Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage.

### Requirements
- Linux or macOS with Python 3.6
- PyTorch 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
  Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check
  that your PyTorch version matches the one required by Detectron2.
- Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html).
- OpenCV is optional but needed by demo and visualization
- `pip install -r requirements.txt`

### CUDA kernel for MSDeformAttn
After preparing the required environment, run the following command to compile CUDA kernel for MSDeformAttn:

`CUDA_HOME` must be defined and must point to the directory of the installed CUDA toolkit.

```bash
cd mask2former/modeling/pixel_decoder/ops
sh make.sh
```

#### Building on another system
To build on a system that does not have a GPU device but provides the drivers:
```bash
TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install
```

### Example conda environment setup
```bash
conda create --name maskfreevis python=3.8 -y
conda activate maskfreevis
conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia
pip install -U opencv-python

# under your working directory
git clone git@github.com:facebookresearch/detectron2.git
cd detectron2
pip install -e .

cd ..
git clone https://github.com/SysCV/MaskFreeVIS.git
cd MaskFreeVIS
pip install -r requirements.txt
cd mask2former/modeling/pixel_decoder/ops
sh make.sh
```

### **Dataset preparation**
Please see the document [here](DATASET_prepare.md).


### **Model Zoo**

## Video Instance Segmentation (YouTubeVIS) 

Using COCO image masks **without YTVIS video masks** during training:
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Config Name</th>
<th valign="bottom">Backbone</th>
<th valign="bottom">AP</th>
<th valign="bottom">download</th>
<th valign="bottom">Training Script</th>
<th valign="bottom">COCO Init Weight</th>
<!-- TABLE BODY -->
<!-- ROW: maskformer2_R50_bs16_50ep -->
 <tr><td align="left"><a href="configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml">MaskFreeVIS</a></td>
<td align="center">R50</td>
<td align="center">46.6</td>
<td align="center"><a href="https://drive.google.com/file/d/1Jjq-YgHqwixs2AdJ3kSNp4d2DjjV5qEA/view?usp=share_link">model</a></td>
<td align="center"><a href="scripts/train_8gpu_mask2former_r50_video.sh">script</a></td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/maskformer/mask2former/coco/instance/maskformer2_R50_bs16_50ep/model_final_3c8ec9.pkl">Init</a></td>
</tr>

<!-- ROW: maskformer2_R101_bs16_50ep -->
 <tr><td align="left"><a href="configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml">MaskFreeVIS</a></td>
<td align="center">R101</td>
<td align="center">49.1</td>
<td align="center"><a href="https://drive.google.com/file/d/1eo05Rdl5cgTEB0mxB2HLwQGhEu6vEwDu/view?usp=share_link">model</a></td>
<td align="center"><a href="scripts/train_8gpu_mask2former_r101_video.sh">script</a></td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/maskformer/mask2former/coco/instance/maskformer2_R101_bs16_50ep/model_final_eba159.pkl">Init</a></td>
</tr>

<!-- ROW: maskformer2_swin_base_IN21k_384_bs16_50ep -->
 <tr><td align="left"><a href="configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml">MaskFreeVIS</a></td>
<td align="center">Swin-L</td>
<td align="center">56.0</td>
<td align="center"><a href="https://drive.google.com/file/d/1kvckNoaDftN5R16CRJ-izfHeKTl_rskt/view?usp=share_link">model</a></td>
<td align="center"><a href="scripts/train_8gpu_mask2former_swinl_video.sh">script</a></td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/maskformer/mask2former/coco/instance/maskformer2_swin_large_IN21k_384_bs16_100ep/model_final_e5f453.pkl">Init</a></td>
</tr>
</tbody></table>

**For below two training settings without using pseudo COCO images masks** for joint video training, please change the folder to:
```
cd mfvis_nococo
```

1) Only using **COCO mask initialized model without YTVIS video masks** during training:
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Config Name</th>
<th valign="bottom">Backbone</th>
<th valign="bottom">AP</th>
<th valign="bottom">download</th>
<th valign="bottom">Training Script</th>
<th valign="bottom">COCO Init Weight</th>
<!-- TABLE BODY -->
<!-- ROW: maskformer2_R50_bs16_50ep -->
 <tr><td align="left"><a href="mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep_coco.yaml">MaskFreeVIS</a></td>
<td align="center">R50</td>
<td align="center">43.8</td>
<td align="center"><a href="https://drive.google.com/file/d/1hAfGtRk5uxYj9BPX3PGPjufyiF5l0IsW/view?usp=share_link">model</a></td>
<td align="center"><a href="mfvis_nococo/scripts/train_8gpu_mask2former_r50_video_coco.sh">script</a></td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/maskformer/mask2former/coco/instance/maskformer2_R50_bs16_50ep/model_final_3c8ec9.pkl">Init</a></td>
</tr>
<!-- ROW: maskformer2_R101_bs16_50ep -->
 <tr><td align="left"><a href="mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep_coco.yaml">MaskFreeVIS</a></td>
<td align="center">R101</td>
<td align="center">47.3</td>
<td align="center"><a href="https://drive.google.com/file/d/1imHH-m9Q9YkJBzEe2MD0ewypjJdfdMZZ/view?usp=share_link">model</a></td>
<td align="center"><a href="mfvis_nococo/scripts/train_8gpu_mask2former_r101_video_coco.sh">script</a></td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/maskformer/mask2former/coco/instance/maskformer2_R101_bs16_50ep/model_final_eba159.pkl">Init</a></td>
</tr>
<!-- ROW: maskformer2_swin_base_IN21k_384_bs16_50ep -->
</tbody></table>

2) Only using **COCO box initialized model without YTVIS video masks** during training:
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Config Name</th>
<th valign="bottom">Backbone</th>
<th valign="bottom">AP</th>
<th valign="bottom">download</th>
<th valign="bottom">Training Script</th>
<th valign="bottom">COCO Box Init Weight</th>
<!-- TABLE BODY -->
<!-- ROW: maskformer2_R50_bs16_50ep -->
 <tr><td align="left"><a href="mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml">MaskFreeVIS</a></td>
<td align="center">R50</td>
<td align="center">42.5</td>
<td align="center"><a href="https://drive.google.com/file/d/1F5VZPxR4637JmFu3t4WaKgvWs4WSxPPl/view?usp=share_link">model</a></td>
<td align="center"><a href="mfvis_nococo/scripts/train_8gpu_mask2former_r50_video.sh">script</a></td>
<td align="center"><a href="https://drive.google.com/file/d/1qiFBqFK0VEgdj0ulylEqNKGExSguGc8V/view?usp=share_link">Init</a></td>
</tr>
</tbody></table>


Please see our script folder. 

## Inference & Evaluation

First download the trained models provided in our model zoo table and put them into the `mfvis_models` folder.

```
mkdir mfvis_models
```

Refer to our [scripts folder](./scripts) for more commands:

Example evaluation scripts:
```
bash scripts/eval_8gpu_mask2former_r50_video.sh
bash scripts/eval_8gpu_mask2former_r101_video.sh
bash scripts/eval_8gpu_mask2former_swinl_video.sh
```

## Results Visualization

Example visualization script:
```
bash scripts/visual_video.sh
```


Citation
---------------
If you find MaskFreeVIS useful in your research or refer to the provided baseline results, please star :star: this repository and consider citing :pencil::
```
@inproceedings{maskfreevis,
    author={Ke, Lei and Danelljan, Martin and Ding, Henghui and Tai, Yu-Wing and Tang, Chi-Keung and Yu, Fisher},
    title={Mask-Free Video Instance Segmentation},
    booktitle = {CVPR},
    year = {2023}
}  
```

## Acknowledgments
- Thanks [BoxInst](https://github.com/aim-uofa/AdelaiDet/blob/master/configs/BoxInst/README.md) for its image-based instance segmentation losses.
- Thanks [Mask2Former](https://github.com/facebookresearch/Mask2Former) and [VMT](https://github.com/SysCV/vmt) for providing useful inference and evaluation toolkits.


================================================
FILE: configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml
================================================
MODEL:
  BACKBONE:
    FREEZE_AT: 0
    NAME: "build_resnet_backbone"
  WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  RESNETS:
    DEPTH: 50
    STEM_TYPE: "basic"  # not used
    STEM_OUT_CHANNELS: 64
    STRIDE_IN_1X1: False
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    # NORM: "SyncBN"
    RES5_MULTI_GRID: [1, 1, 1]  # not used
DATASETS:
  TRAIN: ("coco_2017_train",)
  TEST: ("coco_2017_val",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.0001
  STEPS: (327778, 355092)
  MAX_ITER: 368750
  WARMUP_FACTOR: 1.0
  WARMUP_ITERS: 10
  WEIGHT_DECAY: 0.05
  OPTIMIZER: "ADAMW"
  BACKBONE_MULTIPLIER: 0.1
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 0.01
    NORM_TYPE: 2.0
  AMP:
    ENABLED: True
INPUT:
  IMAGE_SIZE: 1024
  MIN_SCALE: 0.1
  MAX_SCALE: 2.0
  FORMAT: "RGB"
  DATASET_MAPPER_NAME: "coco_instance_lsj"
TEST:
  EVAL_PERIOD: 5000
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: True
  NUM_WORKERS: 4
VERSION: 2


================================================
FILE: configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml
================================================
_BASE_: Base-COCO-InstanceSegmentation.yaml
OUTPUT_DIR: './output/'
MODEL:
  META_ARCHITECTURE: "MaskFormer"
  SEM_SEG_HEAD:
    NAME: "MaskFormerHead"
    IGNORE_VALUE: 255
    NUM_CLASSES: 80
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"
    # pixel decoder
    PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6
  MASK_FORMER:
    TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
    DEEP_SUPERVISION: True
    NO_OBJECT_WEIGHT: 0.1
    CLASS_WEIGHT: 2.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 100
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    PRE_NORM: False
    ENFORCE_INPUT_PROJ: False
    SIZE_DIVISIBILITY: 32
    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    TEST:
      SEMANTIC_ON: False
      INSTANCE_ON: True
      PANOPTIC_ON: False
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.8


================================================
FILE: configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml
================================================
MODEL:
  BACKBONE:
    FREEZE_AT: 0
    NAME: "build_resnet_backbone"
  WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  MASK_ON: True
  RESNETS:
    DEPTH: 50
    STEM_TYPE: "basic"  # not used
    STEM_OUT_CHANNELS: 64
    STRIDE_IN_1X1: False
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    # NORM: "SyncBN"
    RES5_MULTI_GRID: [1, 1, 1]  # not used
DATASETS:
  TRAIN: ("ytvis_2019_train", "coco_2017_train_fake",)
  TEST: ("ytvis_2019_val",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.0001
  STEPS: (4000,)
  MAX_ITER: 6000
  WARMUP_FACTOR: 1.0
  WARMUP_ITERS: 10
  WEIGHT_DECAY: 0.05
  OPTIMIZER: "ADAMW"
  BACKBONE_MULTIPLIER: 0.1
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 0.01
    NORM_TYPE: 2.0
  AMP:
    ENABLED: True
INPUT:
  MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
  RANDOM_FLIP: "flip_by_clip"
  AUGMENTATIONS: []
  MIN_SIZE_TRAIN: (360, 480)
  MIN_SIZE_TEST: 360
  CROP:
    ENABLED: False
    TYPE: "absolute_range"
    SIZE: (600, 720)
  FORMAT: "RGB"
TEST:
  EVAL_PERIOD: 0
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: False
  NUM_WORKERS: 4
VERSION: 2


================================================
FILE: configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml
================================================
MODEL:
  BACKBONE:
    FREEZE_AT: 0
    NAME: "build_resnet_backbone"
  WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  MASK_ON: True
  RESNETS:
    DEPTH: 50
    STEM_TYPE: "basic"  # not used
    STEM_OUT_CHANNELS: 64
    STRIDE_IN_1X1: False
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    # NORM: "SyncBN"
    RES5_MULTI_GRID: [1, 1, 1]  # not used
DATASETS:
  TRAIN: ("coco_2017_train_fake", "ytvis_2019_train",)
  TEST: ("ytvis_2019_val",)
SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.00005
  STEPS: (75000,)
  MAX_ITER: 140000
  WARMUP_FACTOR: 1.0
  WARMUP_ITERS: 10
  WEIGHT_DECAY: 0.05
  OPTIMIZER: "ADAMW"
  BACKBONE_MULTIPLIER: 0.1
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 0.01
    NORM_TYPE: 2.0
  AMP:
    ENABLED: True
INPUT:
  MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
  RANDOM_FLIP: "flip_by_clip"
  AUGMENTATIONS: []
  MIN_SIZE_TRAIN: (360, 480)
  MIN_SIZE_TEST: 360
  CROP:
    ENABLED: False
    TYPE: "absolute_range"
    SIZE: (600, 720)
  FORMAT: "RGB"
TEST:
  EVAL_PERIOD: 0
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: False
  NUM_WORKERS: 4
VERSION: 2


================================================
FILE: configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml
================================================
MODEL:
  BACKBONE:
    FREEZE_AT: 0
    NAME: "build_resnet_backbone"
  WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  MASK_ON: True
  RESNETS:
    DEPTH: 50
    STEM_TYPE: "basic"  # not used
    STEM_OUT_CHANNELS: 64
    STRIDE_IN_1X1: False
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    # NORM: "SyncBN"
    RES5_MULTI_GRID: [1, 1, 1]  # not used
DATASETS:
  TRAIN: ("coco_2017_train_fake", "ytvis_2019_train",)
  TEST: ("ytvis_2019_val",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.0001
  STEPS: (37500,)
  MAX_ITER: 70000
  WARMUP_FACTOR: 1.0
  WARMUP_ITERS: 10
  WEIGHT_DECAY: 0.05
  OPTIMIZER: "ADAMW"
  BACKBONE_MULTIPLIER: 0.1
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 0.01
    NORM_TYPE: 2.0
  AMP:
    ENABLED: True
INPUT:
  MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
  RANDOM_FLIP: "flip_by_clip"
  AUGMENTATIONS: []
  MIN_SIZE_TRAIN: (360, 480)
  MIN_SIZE_TEST: 360
  CROP:
    ENABLED: False
    TYPE: "absolute_range"
    SIZE: (600, 720)
  FORMAT: "RGB"
TEST:
  EVAL_PERIOD: 0
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: False
  NUM_WORKERS: 4
VERSION: 2


================================================
FILE: configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml
================================================
_BASE_: ../video_maskformer2_R50_bs16_8ep_swin.yaml
OUTPUT_DIR: 'swinl_joint_withcoco'
MODEL:
  WEIGHTS: "./pretrained_model/model_final_e5f453.pkl"
  BACKBONE:
    NAME: "D2SwinTransformer"
  SWIN:
    EMBED_DIM: 192
    DEPTHS: [2, 2, 18, 2]
    NUM_HEADS: [6, 12, 24, 48]
    WINDOW_SIZE: 12
    APE: False
    DROP_PATH_RATE: 0.3
    PATCH_NORM: True
    PRETRAIN_IMG_SIZE: 384
  #WEIGHTS: "model_final_e5f453.pkl"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  MASK_FORMER:
    NUM_OBJECT_QUERIES: 200
INPUT:
  MIN_SIZE_TEST: 480


================================================
FILE: configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml
================================================
_BASE_: video_maskformer2_R50_bs16_8ep.yaml
OUTPUT_DIR: './r101_coco_joint/'
MODEL:
  WEIGHTS: "pretrained_model/model_final_eba159.pkl"
  RESNETS:
    DEPTH: 101
    STEM_TYPE: "basic"  # not used
    STEM_OUT_CHANNELS: 64
    STRIDE_IN_1X1: False
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    # NORM: "SyncBN"
    RES5_MULTI_GRID: [1, 1, 1]  # not used


================================================
FILE: configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml
================================================
_BASE_: Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml
OUTPUT_DIR: './r50_coco_joint/'
SEED: 29118357
MODEL:
  WEIGHTS: "./pretrained_model/model_final_3c8ec9.pkl"
  META_ARCHITECTURE: "VideoMaskFormer"
  SEM_SEG_HEAD:
    NAME: "MaskFormerHead"
    IGNORE_VALUE: 255
    NUM_CLASSES: 40
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"
    # pixel decoder
    PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6
  MASK_FORMER:
    TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder"
    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
    DEEP_SUPERVISION: True
    NO_OBJECT_WEIGHT: 0.1
    CLASS_WEIGHT: 2.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 100
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    PRE_NORM: False
    ENFORCE_INPUT_PROJ: False
    SIZE_DIVISIBILITY: 32
    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
    TRAIN_NUM_POINTS: 20000 #20000 #12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    TEST:
      SEMANTIC_ON: False
      INSTANCE_ON: True
      PANOPTIC_ON: False
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.8

INPUT:
  MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
  PSEUDO:
    SAMPLING_FRAME_NUM: 4
    SAMPLING_FRAME_RANGE: 20
    AUGMENTATIONS: ['rotation']
    MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
    MAX_SIZE_TRAIN: 768
    CROP:
      ENABLED: True
      TYPE: "absolute_range"
      SIZE: (384, 600)
  LSJ_AUG:
    ENABLED: False
    IMAGE_SIZE: 768
    MIN_SCALE: 0.1
    MAX_SCALE: 2.0
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: True
  # NUM_WORKERS: 8


================================================
FILE: configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep_swin.yaml
================================================
_BASE_: Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml
OUTPUT_DIR: './swinl_joint_withcoco/'
SEED: 29118357
MODEL:
  WEIGHTS: "./pretrained_model/model_final_3c8ec9.pkl"
  META_ARCHITECTURE: "VideoMaskFormer"
  SEM_SEG_HEAD:
    NAME: "MaskFormerHead"
    IGNORE_VALUE: 255
    NUM_CLASSES: 40
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"
    # pixel decoder
    PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6
  MASK_FORMER:
    TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder"
    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
    DEEP_SUPERVISION: True
    NO_OBJECT_WEIGHT: 0.1
    CLASS_WEIGHT: 2.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 100
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    PRE_NORM: False
    ENFORCE_INPUT_PROJ: False
    SIZE_DIVISIBILITY: 32
    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
    TRAIN_NUM_POINTS: 20000 #20000 #12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    TEST:
      SEMANTIC_ON: False
      INSTANCE_ON: True
      PANOPTIC_ON: False
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.8

INPUT:
  MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
  PSEUDO:
    SAMPLING_FRAME_NUM: 4
    SAMPLING_FRAME_RANGE: 20
    AUGMENTATIONS: ['rotation']
    MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
    MAX_SIZE_TRAIN: 768
    CROP:
      ENABLED: True
      TYPE: "absolute_range"
      SIZE: (384, 600)
  LSJ_AUG:
    ENABLED: False
    IMAGE_SIZE: 768
    MIN_SCALE: 0.1
    MAX_SCALE: 2.0
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: True
  # NUM_WORKERS: 8


================================================
FILE: demo/README.md
================================================
## Mask2Former Demo

We provide a command line tool to run a simple demo of builtin configs.
The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md).


================================================
FILE: demo/demo.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py
import argparse
import glob
import multiprocessing as mp
import os
# fmt: off
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# fmt: on
import tempfile
import time
import warnings
import cv2
import numpy as np
import tqdm
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.utils.logger import setup_logger
from mask2former import add_maskformer2_config
from predictor import VisualizationDemo

# constants
WINDOW_NAME = "mask2former demo"
def setup_cfg(args):
    """Build a frozen detectron2 config: defaults, plugin keys, the config
    file named by ``args.config_file``, then KEY VALUE overrides from ``args.opts``."""
    config = get_cfg()
    # Register extra config keys before merging the file so it can set them.
    for register_extras in (add_deeplab_config, add_maskformer2_config):
        register_extras(config)
    config.merge_from_file(args.config_file)
    config.merge_from_list(args.opts)
    config.freeze()
    return config
    
def get_parser():
    """Construct the command-line interface for the image demo."""
    ap = argparse.ArgumentParser(description="maskformer2 demo for builtin configs")
    ap.add_argument(
        "--config-file",
        metavar="FILE",
        default="configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml",
        help="path to config file",
    )
    ap.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
    ap.add_argument("--video-input", help="Path to video file.")
    ap.add_argument(
        "--input",
        nargs="+",
        help=(
            "A list of space separated input images; "
            "or a single glob pattern such as 'directory/*.jpg'"
        ),
    )
    ap.add_argument(
        "--output",
        help=(
            "A file or directory to save output visualizations. "
            "If not given, will show output in an OpenCV window."
        ),
    )
    ap.add_argument(
        "--confidence-threshold",
        type=float,
        default=0.5,
        help="Minimum score for instance predictions to be shown",
    )
    # Everything after --opts is forwarded verbatim to cfg.merge_from_list.
    ap.add_argument(
        "--opts",
        default=[],
        nargs=argparse.REMAINDER,
        help="Modify config options using the command-line 'KEY VALUE' pairs",
    )
    return ap

def test_opencv_video_format(codec, file_ext):
    """Return True if the local OpenCV build can write video with this codec.

    Writes a tiny 30-frame clip (fourcc ``codec``, extension ``file_ext``) to a
    temporary directory and checks whether the output file was actually
    created; unsupported codecs fail silently, leaving no file behind.
    """
    with tempfile.TemporaryDirectory(prefix="video_format_test") as tmp_dir:
        filename = os.path.join(tmp_dir, "test_file" + file_ext)
        writer = cv2.VideoWriter(
            filename=filename,
            fourcc=cv2.VideoWriter_fourcc(*codec),
            fps=float(30),
            frameSize=(10, 10),
            isColor=True,
        )
        try:
            # Plain loop (not a list comprehension) — we only want the side effect.
            for _ in range(30):
                writer.write(np.zeros((10, 10, 3), np.uint8))
        finally:
            # Always release the writer, even if write() raises.
            writer.release()
        return os.path.isfile(filename)

if __name__ == "__main__":
    # Spawn (not fork) so CUDA state is not shared with child processes.
    mp.set_start_method("spawn", force=True)
    args = get_parser().parse_args()
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)
    demo = VisualizationDemo(cfg)
    if args.input:
        # A single argument may be a glob pattern; expand it into a file list.
        if len(args.input) == 1:
            args.input = glob.glob(os.path.expanduser(args.input[0]))
            assert args.input, "The input path(s) was not found"
        for path in tqdm.tqdm(args.input, disable=not args.output):
            # use PIL, to be consistent with evaluation
            img = read_image(path, format="BGR")
            start_time = time.time()
            predictions, visualized_output = demo.run_on_image(img, args.confidence_threshold)
            logger.info(
                "{}: {} in {:.2f}s".format(
                    path,
                    "detected {} instances".format(len(predictions["instances"]))
                    if "instances" in predictions
                    else "finished",
                    time.time() - start_time,
                )
            )
            if args.output:
                # Treat --output as a directory and write one image per input.
                # exist_ok=True avoids the FileExistsError the previous
                # unconditional makedirs() could raise, and replaces the
                # redundant isdir/assert dance.
                os.makedirs(args.output, exist_ok=True)
                out_filename = os.path.join(args.output, os.path.basename(path))
                visualized_output.save(out_filename)
            else:
                cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
                cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
                if cv2.waitKey(0) == 27:
                    break  # esc to quit
    elif args.webcam:
        assert args.input is None, "Cannot have both --input and --webcam!"
        assert args.output is None, "output not yet supported with --webcam!"
        cam = cv2.VideoCapture(0)
        for vis in tqdm.tqdm(demo.run_on_video(cam)):
            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
            cv2.imshow(WINDOW_NAME, vis)
            if cv2.waitKey(1) == 27:
                break  # esc to quit
        cam.release()
        cv2.destroyAllWindows()
    elif args.video_input:
        video = cv2.VideoCapture(args.video_input)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames_per_second = video.get(cv2.CAP_PROP_FPS)
        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        basename = os.path.basename(args.video_input)
        # Prefer x264/.mkv when the local OpenCV supports it, else mp4v/.mp4.
        codec, file_ext = (
            ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
        )
        # Fixed: compare against "mp4v" (the actual fallback value) — the old
        # check against ".mp4v" could never match, so the warning never fired.
        if codec == "mp4v":
            warnings.warn("x264 codec not available, switching to mp4v")
        if args.output:
            if os.path.isdir(args.output):
                output_fname = os.path.join(args.output, basename)
                output_fname = os.path.splitext(output_fname)[0] + file_ext
            else:
                output_fname = args.output
            assert not os.path.isfile(output_fname), output_fname
            output_file = cv2.VideoWriter(
                filename=output_fname,
                # some installation of opencv may not support x264 (due to its license),
                # you can try other format (e.g. MPEG)
                fourcc=cv2.VideoWriter_fourcc(*codec),
                fps=float(frames_per_second),
                frameSize=(width, height),
                isColor=True,
            )
        assert os.path.isfile(args.video_input)
        for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
            if args.output:
                output_file.write(vis_frame)
            else:
                cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
                cv2.imshow(basename, vis_frame)
                if cv2.waitKey(1) == 27:
                    break  # esc to quit
        video.release()
        if args.output:
            output_file.release()
        else:
            cv2.destroyAllWindows()


================================================
FILE: demo/predictor.py
================================================
# Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
import atexit
import bisect
import multiprocessing as mp
from collections import deque
import cv2
import torch
import numpy as np
from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer
import matplotlib.pyplot as plt

class VisualizationDemo(object):
    """Wraps a detectron2 predictor and renders its outputs on single images
    or on the frames of a video, optionally running inference in worker
    processes (see AsyncPredictor) so visualization does not block the GPU."""

    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        """
        Args:
            cfg (CfgNode):
            instance_mode (ColorMode):
            parallel (bool): whether to run the model in different processes from visualization.
                Useful since the visualization logic can be slow.
        """
        # Dataset metadata (class names/colors) from the first TEST dataset,
        # or a dummy entry when no TEST dataset is configured.
        self.metadata = MetadataCatalog.get(
            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
        )
        self.cpu_device = torch.device("cpu")
        self.instance_mode = instance_mode
        self.parallel = parallel
        # Keep the config accessible to callers that need it for visualization.
        self.cfg_vis = cfg
        if parallel:
            # One worker process per visible GPU.
            num_gpu = torch.cuda.device_count()
            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
        else:
            self.predictor = DefaultPredictor(cfg)

    def run_on_image(self, image, conf_thre):
        """
        Args:
            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
                This is the format used by OpenCV.
            conf_thre (float): score threshold; instance predictions with
                ``scores < conf_thre`` are dropped before drawing.
        Returns:
            predictions (dict): the output of the model.
            vis_output (VisImage): the visualized image output.
        """
        vis_output = None
        predictions = self.predictor(image)
        # Convert image from OpenCV BGR format to Matplotlib RGB format.
        image = image[:, :, ::-1]
        visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
        if "panoptic_seg" in predictions:
            panoptic_seg, segments_info = predictions["panoptic_seg"]
            vis_output = visualizer.draw_panoptic_seg_predictions(
                panoptic_seg.to(self.cpu_device), segments_info
            )
        else:
            if "sem_seg" in predictions:
                vis_output = visualizer.draw_sem_seg(
                    predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
                )
            if "instances" in predictions:
                # Move to CPU and filter out low-scoring instances.
                instances = predictions["instances"].to(self.cpu_device)
                instances = instances[instances.scores >= conf_thre]
                # The two string blocks below are disabled debug code (mask
                # heatmap dumps and prediction-shape printing) left by the
                # authors; they have no runtime effect.
                '''
                mask = instances.pred_masks.squeeze(1).data.cpu().numpy()
                for i_m in range(len(mask)):
                    print('mask shape:', mask.shape)
                    print('mask max:', mask.max())
                    #heatmapshow = cv2.normalize(mask[i], heatmapshow, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)
                    heatmapshow = cv2.applyColorMap((mask[i_m] * 255).astype(np.uint8), cv2.COLORMAP_JET) 
                    cv2.imwrite(str(i_m)+"_heatmap_n.jpg", heatmapshow)
                '''
                '''
                print('instances scores:', instances.scores.shape)
                print('instances scores:', instances.scores)
                print('instances class:', instances.pred_classes.shape)
                print('instances boxes:', instances.pred_boxes)
                print('instances masks:', instances.pred_masks.shape)
                instances.pred_boxes = None
                '''
                vis_output = visualizer.draw_instance_predictions(predictions=instances)
        return predictions, vis_output

    def _frame_from_video(self, video):
        # Generator: yields frames until the capture fails or is closed.
        while video.isOpened():
            success, frame = video.read()
            if success:
                yield frame
            else:
                break

    def run_on_video(self, video):
        """
        Visualizes predictions on frames of the input video.
        Args:
            video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
                either a webcam or a video file.
        Yields:
            ndarray: BGR visualizations of each video frame.
        """
        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
        def process_predictions(frame, predictions):
            # Draw one frame's predictions; input frame is BGR from OpenCV.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if "panoptic_seg" in predictions:
                panoptic_seg, segments_info = predictions["panoptic_seg"]
                vis_frame = video_visualizer.draw_panoptic_seg_predictions(
                    frame, panoptic_seg.to(self.cpu_device), segments_info
                )
            elif "instances" in predictions:
                predictions = predictions["instances"].to(self.cpu_device)
                vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
            elif "sem_seg" in predictions:
                vis_frame = video_visualizer.draw_sem_seg(
                    frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
                )
            # Converts Matplotlib RGB format to OpenCV BGR format
            vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
            return vis_frame
        frame_gen = self._frame_from_video(video)
        if self.parallel:
            # Pipeline: keep up to buffer_size frames in flight so the GPU
            # workers stay busy while earlier frames are being visualized.
            buffer_size = self.predictor.default_buffer_size
            frame_data = deque()
            for cnt, frame in enumerate(frame_gen):
                frame_data.append(frame)
                self.predictor.put(frame)
                if cnt >= buffer_size:
                    frame = frame_data.popleft()
                    predictions = self.predictor.get()
                    yield process_predictions(frame, predictions)
            # Drain the remaining in-flight frames.
            while len(frame_data):
                frame = frame_data.popleft()
                predictions = self.predictor.get()
                yield process_predictions(frame, predictions)
        else:
            for frame in frame_gen:
                yield process_predictions(frame, self.predictor(frame))

class AsyncPredictor:
    """
    A predictor that runs the model asynchronously, possibly on >1 GPUs.
    Because rendering the visualization takes considerably amount of time,
    this helps improve throughput a little bit when rendering videos.
    """
    class _StopToken:
        # Sentinel placed on the task queue to tell a worker process to exit.
        pass
        
    class _PredictWorker(mp.Process):
        def __init__(self, cfg, task_queue, result_queue):
            self.cfg = cfg
            self.task_queue = task_queue
            self.result_queue = result_queue
            super().__init__()
        def run(self):
            # Each worker builds its own predictor (its own model instance)
            # inside the spawned process, then serves tasks until it receives
            # a _StopToken.
            predictor = DefaultPredictor(self.cfg)
            while True:
                task = self.task_queue.get()
                if isinstance(task, AsyncPredictor._StopToken):
                    break
                idx, data = task
                result = predictor(data)
                self.result_queue.put((idx, result))

    def __init__(self, cfg, num_gpus: int = 1):
        """
        Args:
            cfg (CfgNode):
            num_gpus (int): if 0, will run on CPU
        """
        # At least one worker even with no GPU (CPU fallback).
        num_workers = max(num_gpus, 1)
        # Bound both queues to 3 tasks per worker to limit memory in flight.
        self.task_queue = mp.Queue(maxsize=num_workers * 3)
        self.result_queue = mp.Queue(maxsize=num_workers * 3)
        self.procs = []
        for gpuid in range(max(num_gpus, 1)):
            # Clone the config per worker so each one can pin its own device.
            cfg = cfg.clone()
            cfg.defrost()
            cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
            self.procs.append(
                AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
            )
        # put_idx/get_idx are monotonically increasing task ids used to return
        # results in submission order even when workers finish out of order.
        self.put_idx = 0
        self.get_idx = 0
        self.result_rank = []
        self.result_data = []
        for p in self.procs:
            p.start()
        atexit.register(self.shutdown)

    def put(self, image):
        # Submit one image; blocks if the task queue is full.
        self.put_idx += 1
        self.task_queue.put((self.put_idx, image))

    def get(self):
        self.get_idx += 1  # the index needed for this request
        # Fast path: the next result in order is already buffered.
        if len(self.result_rank) and self.result_rank[0] == self.get_idx:
            res = self.result_data[0]
            del self.result_data[0], self.result_rank[0]
            return res
        while True:
            # make sure the results are returned in the correct order
            idx, res = self.result_queue.get()
            if idx == self.get_idx:
                return res
            # Out-of-order result: buffer it, keeping the buffers sorted by idx.
            insert = bisect.bisect(self.result_rank, idx)
            self.result_rank.insert(insert, idx)
            self.result_data.insert(insert, res)

    def __len__(self):
        # Number of submitted tasks whose results have not been consumed yet.
        return self.put_idx - self.get_idx

    def __call__(self, image):
        # Synchronous convenience wrapper: submit and wait for the result.
        self.put(image)
        return self.get()

    def shutdown(self):
        # NOTE(review): workers are signalled to stop but never join()ed;
        # clean interpreter exit relies on atexit/process teardown ordering.
        for _ in self.procs:
            self.task_queue.put(AsyncPredictor._StopToken())

    @property
    def default_buffer_size(self):
        # Heuristic number of frames to keep in flight per worker.
        return len(self.procs) * 5


================================================
FILE: demo_video/README.md
================================================
## Video Mask2Former Demo

We provide a command line tool to run a simple demo of builtin configs.
The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md).


================================================
FILE: demo_video/demo.py
================================================
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py
import argparse
import glob
import multiprocessing as mp
import os
# fmt: off
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# fmt: on
import tempfile
import time
import warnings
import cv2
import numpy as np
import tqdm
from torch.cuda.amp import autocast
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.utils.logger import setup_logger
from mask2former import add_maskformer2_config
from mask2former_video import add_maskformer2_video_config
from predictor import VisualizationDemo
import imageio

# constants
WINDOW_NAME = "mask2former video demo"
def setup_cfg(args):
    """Build a frozen detectron2 config for the video demo: defaults, plugin
    keys, the config file from ``args.config_file``, then ``args.opts`` overrides."""
    config = get_cfg()
    # Register all extra config keys before merging the file so it can set them.
    for register_extras in (
        add_deeplab_config,
        add_maskformer2_config,
        add_maskformer2_video_config,
    ):
        register_extras(config)
    config.merge_from_file(args.config_file)
    config.merge_from_list(args.opts)
    config.freeze()
    return config
def get_parser():
    """Construct the command-line interface for the video demo."""
    ap = argparse.ArgumentParser(description="maskformer2 demo for builtin configs")
    ap.add_argument(
        "--config-file",
        metavar="FILE",
        default="configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml",
        help="path to config file",
    )
    ap.add_argument("--video-input", help="Path to video file.")
    ap.add_argument(
        "--input",
        nargs="+",
        help=(
            "A list of space separated input images; "
            "or a single glob pattern such as 'directory/*.jpg'"
            "this will be treated as frames of a video"
        ),
    )
    ap.add_argument(
        "--output",
        help=(
            "A file or directory to save output visualizations. "
            "If not given, will show output in an OpenCV window."
        ),
    )
    # NOTE: no action/type is declared, so a value given on the command line
    # arrives as a string; any non-empty string is truthy. Kept as-is for
    # compatibility with existing invocation scripts.
    ap.add_argument(
        "--save-frames",
        default=False,
        help="Save frame level image outputs.",
    )
    ap.add_argument(
        "--confidence-threshold",
        type=float,
        default=0.5,
        help="Minimum score for instance predictions to be shown",
    )
    # Everything after --opts is forwarded verbatim to cfg.merge_from_list.
    ap.add_argument(
        "--opts",
        default=[],
        nargs=argparse.REMAINDER,
        help="Modify config options using the command-line 'KEY VALUE' pairs",
    )
    return ap

def test_opencv_video_format(codec, file_ext):
    """Return True if the local OpenCV build can write a video with this format.

    Writes a short dummy clip into a temporary directory and checks that the
    output file was actually created (OpenCV fails silently otherwise).

    Args:
        codec (str): fourcc code, e.g. "mp4v" or "x264".
        file_ext (str): container extension including the dot, e.g. ".mp4".

    Returns:
        bool: whether the codec/container combination is usable.
    """
    # Renamed `dir` -> `tmp_dir`: the original shadowed the `dir` builtin.
    with tempfile.TemporaryDirectory(prefix="video_format_test") as tmp_dir:
        filename = os.path.join(tmp_dir, "test_file" + file_ext)
        writer = cv2.VideoWriter(
            filename=filename,
            fourcc=cv2.VideoWriter_fourcc(*codec),
            fps=float(30),
            frameSize=(10, 10),
            isColor=True,
        )
        # Plain loop instead of a list comprehension used only for side effects.
        for _ in range(30):
            writer.write(np.zeros((10, 10, 3), np.uint8))
        writer.release()
        # The file only exists if the writer actually encoded frames.
        return os.path.isfile(filename)

if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    args = get_parser().parse_args()
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)
    demo = VisualizationDemo(cfg)

    if args.output:
        os.makedirs(args.output, exist_ok=True)

    if args.input:
        # `--input` points at a directory containing one sub-directory of frame
        # images per video: <input>/<video_name>/<frame>.jpg
        root = args.input[0]
        for video_name in os.listdir(root):
            video_dir = os.path.join(root, video_name)
            # BUGFIX: paths were previously built by raw string concatenation
            # (`args.input + file_name + '/' + f`), which produced broken paths
            # unless --input ended with a trailing separator.
            input_path_list = sorted(
                os.path.join(video_dir, f) for f in os.listdir(video_dir)
            )
            if len(input_path_list) == 0:
                continue

            # Load every frame in BGR order (OpenCV convention).
            vid_frames = [read_image(path, format="BGR") for path in input_path_list]

            start_time = time.time()
            with autocast():
                predictions, visualized_output = demo.run_on_video(
                    vid_frames, args.confidence_threshold
                )
            logger.info(
                "detected {} instances per frame in {:.2f}s".format(
                    len(predictions["pred_scores"]), time.time() - start_time
                )
            )

            if args.output:
                if args.save_frames:
                    os.makedirs(os.path.join(args.output, video_name), exist_ok=True)
                    for path, _vis_output in zip(input_path_list, visualized_output):
                        out_filename = os.path.join(
                            args.output, video_name, os.path.basename(path)
                        )
                        _vis_output.save(out_filename)
                # Assemble all visualized frames into one animated GIF per video.
                images = [_vis_output.get_image() for _vis_output in visualized_output]
                imageio.mimsave(os.path.join(args.output, video_name + ".gif"), images, fps=5)
    elif args.video_input:
        video = cv2.VideoCapture(args.video_input)
        vid_frames = []
        while video.isOpened():
            success, frame = video.read()
            if success:
                vid_frames.append(frame)
            else:
                break
        # BUGFIX: the capture handle was never released.
        video.release()

        start_time = time.time()
        with autocast():
            # BUGFIX: run_on_video requires a confidence threshold; the original
            # call omitted it and raised a TypeError on this code path.
            predictions, visualized_output = demo.run_on_video(
                vid_frames, args.confidence_threshold
            )
        logger.info(
            "detected {} instances per frame in {:.2f}s".format(
                len(predictions["pred_scores"]), time.time() - start_time
            )
        )

        if args.output:
            if args.save_frames:
                for idx, _vis_output in enumerate(visualized_output):
                    _vis_output.save(os.path.join(args.output, f"{idx}.jpg"))
            H, W = visualized_output[0].height, visualized_output[0].width
            # BUGFIX: removed the stray `cv2.VideoCapture(-1)` that needlessly
            # tried to open a camera device just to release it again.
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            out = cv2.VideoWriter(
                os.path.join(args.output, "visualization.mp4"), fourcc, 10.0, (W, H), True
            )
            for _vis_output in visualized_output:
                out.write(_vis_output.get_image()[:, :, ::-1])
            out.release()


================================================
FILE: demo_video/predictor.py
================================================
# reference: https://github.com/sukjunhwang/IFC/blob/master/projects/IFC/demo/predictor.py
import atexit
import bisect
import multiprocessing as mp
from collections import deque
import cv2
import torch
from visualizer import TrackVisualizer
from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.structures import Instances
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode

class VisualizationDemo(object):
    """Runs the video model on a clip and renders per-frame visualizations."""

    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        """
        Args:
            cfg (CfgNode): fully-specified model/inference config.
            instance_mode (ColorMode): drawing mode passed to the visualizer.
            parallel (bool): whether to run the model in different processes from visualization.
                Useful since the visualization logic can be slow.
        """
        self.metadata = MetadataCatalog.get(
            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
        )
        self.cpu_device = torch.device("cpu")
        self.instance_mode = instance_mode
        self.parallel = parallel
        if parallel:
            num_gpu = torch.cuda.device_count()
            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
        else:
            self.predictor = VideoPredictor(cfg)

    def run_on_video(self, frames, conf_thre):
        """
        Args:
            frames (List[np.ndarray]): a list of images of shape (H, W, C) (in BGR order).
                This is the format used by OpenCV.
            conf_thre (float): minimum clip-level score an instance must reach
                to be kept for visualization.
        Returns:
            predictions (dict): the raw model output for the whole clip.
            total_vis_output (List[VisImage]): one visualized image per frame.
        """
        predictions = self.predictor(frames)
        image_size = predictions["image_size"]
        pred_scores = predictions["pred_scores"]
        pred_labels = predictions["pred_labels"]
        pred_masks = predictions["pred_masks"]

        # Keep only instances whose score clears the threshold.
        keep = [i for i in range(len(pred_scores)) if pred_scores[i] >= conf_thre]
        pred_scores = [pred_scores[i] for i in keep]
        pred_labels = [pred_labels[i] for i in keep]
        pred_masks = [pred_masks[i] for i in keep]

        # Transpose instance-major masks into frame-major order:
        # frame_masks[f] holds one mask per surviving instance for frame f.
        frame_masks = list(zip(*pred_masks))

        total_vis_output = []
        for frame_idx in range(len(frames)):
            frame = frames[frame_idx][:, :, ::-1]  # BGR -> RGB for the visualizer
            visualizer = TrackVisualizer(frame, self.metadata, instance_mode=self.instance_mode)
            ins = Instances(image_size)
            if len(pred_scores) > 0:
                # BUGFIX: removed a leftover debug print of the scores here.
                ins.scores = pred_scores
                ins.pred_classes = pred_labels
                ins.pred_masks = torch.stack(frame_masks[frame_idx], dim=0)
            total_vis_output.append(visualizer.draw_instance_predictions(predictions=ins))
        return predictions, total_vis_output

class VideoPredictor(DefaultPredictor):
    """
    Create a simple end-to-end predictor with the given config that runs on
    single device for a whole clip of frames.
    Compared to using the model directly, this class does the following additions:
    1. Load checkpoint from `cfg.MODEL.WEIGHTS`.
    2. Always take BGR frames as the input and apply conversion defined by `cfg.INPUT.FORMAT`.
    3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
    4. Take one clip (list of frames) and produce a single output.
    If you'd like to do anything more fancy, please refer to its source code
    as examples to build and use the model manually.
    Attributes:
        metadata (Metadata): the metadata of the underlying dataset, obtained from
            cfg.DATASETS.TEST.
    Examples:
    ::
        pred = VideoPredictor(cfg)
        frames = [cv2.imread("frame0.jpg"), cv2.imread("frame1.jpg")]
        outputs = pred(frames)
    """
    def __call__(self, frames):
        """
        Args:
            frames (List[np.ndarray]): clip frames, each of shape (H, W, C) in BGR order.
        Returns:
            predictions (dict):
                the output of the model for the whole clip.
                See :doc:`/tutorials/models` for details about the format.
        Raises:
            ValueError: if `frames` is empty.
        """
        # BUGFIX: an empty clip previously crashed below with a NameError on
        # `height`; fail early with a clear message instead.
        if len(frames) == 0:
            raise ValueError("VideoPredictor expects at least one frame")
        with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
            input_frames = []
            for original_image in frames:
                # Apply pre-processing to image.
                if self.input_format == "RGB":
                    # whether the model expects BGR inputs or RGB
                    original_image = original_image[:, :, ::-1]
                height, width = original_image.shape[:2]
                image = self.aug.get_transform(original_image).apply_image(original_image)
                image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
                input_frames.append(image)
            # NOTE(review): `height`/`width` come from the *last* frame; this
            # assumes every frame in a clip shares the same size — confirm upstream.
            inputs = {"image": input_frames, "height": height, "width": width}
            predictions = self.model([inputs])
            return predictions

class AsyncPredictor:
    """
    A predictor that runs the model asynchronously, possibly on >1 GPUs.
    Because rendering the visualization takes considerably amount of time,
    this helps improve throughput when rendering videos.

    Tasks are tagged with a monotonically increasing index when submitted;
    `get()` re-orders results so callers always receive them in submission order.
    """
    class _StopToken:
        # Sentinel placed on the task queue to tell a worker process to exit.
        pass
    class _PredictWorker(mp.Process):
        # One worker process per device; builds its own VideoPredictor in run().
        def __init__(self, cfg, task_queue, result_queue):
            self.cfg = cfg
            self.task_queue = task_queue
            self.result_queue = result_queue
            super().__init__()
        def run(self):
            # The predictor (and the model it loads) must be created inside the
            # child process so it lives on this worker's assigned device.
            predictor = VideoPredictor(self.cfg)
            while True:
                task = self.task_queue.get()
                if isinstance(task, AsyncPredictor._StopToken):
                    break
                # task is an (index, data) pair; echo the index back with the result.
                idx, data = task
                result = predictor(data)
                self.result_queue.put((idx, result))

    def __init__(self, cfg, num_gpus: int = 1):
        """
        Args:
            cfg (CfgNode):
            num_gpus (int): if 0, will run on CPU
        """
        num_workers = max(num_gpus, 1)
        # Bounded queues: submitters block instead of buffering without limit.
        self.task_queue = mp.Queue(maxsize=num_workers * 3)
        self.result_queue = mp.Queue(maxsize=num_workers * 3)
        self.procs = []
        for gpuid in range(max(num_gpus, 1)):
            # Clone per worker so each process can pin its own device.
            cfg = cfg.clone()
            cfg.defrost()
            cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
            self.procs.append(
                AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
            )
        self.put_idx = 0   # index of the most recently submitted task
        self.get_idx = 0   # index of the next result the caller expects
        self.result_rank = []  # sorted indices of out-of-order results received so far
        self.result_data = []  # results paired 1:1 with result_rank
        for p in self.procs:
            p.start()
        # Ask workers to stop when the interpreter exits.
        atexit.register(self.shutdown)

    def put(self, image):
        # Submit a task; blocks if the task queue is full.
        self.put_idx += 1
        self.task_queue.put((self.put_idx, image))

    def get(self):
        # Return the result for the next submission index, buffering any
        # results that arrive out of order.
        self.get_idx += 1  # the index needed for this request
        if len(self.result_rank) and self.result_rank[0] == self.get_idx:
            res = self.result_data[0]
            del self.result_data[0], self.result_rank[0]
            return res
        while True:
            # make sure the results are returned in the correct order
            idx, res = self.result_queue.get()
            if idx == self.get_idx:
                return res
            insert = bisect.bisect(self.result_rank, idx)
            self.result_rank.insert(insert, idx)
            self.result_data.insert(insert, res)

    def __len__(self):
        # Number of submitted tasks whose results have not been consumed yet.
        return self.put_idx - self.get_idx

    def __call__(self, image):
        # Synchronous convenience wrapper: submit one task and wait for it.
        self.put(image)
        return self.get()

    def shutdown(self):
        # Send one stop sentinel per worker; workers exit after finishing
        # whatever task they are currently running.
        for _ in self.procs:
            self.task_queue.put(AsyncPredictor._StopToken())

    @property
    def default_buffer_size(self):
        # Suggested number of in-flight tasks to keep the workers busy.
        return len(self.procs) * 5


================================================
FILE: demo_video/visualizer.py
================================================
# reference: https://github.com/sukjunhwang/IFC/blob/master/projects/IFC/demo/visualizer.py
import torch
import numpy as np
import matplotlib.colors as mplc
from detectron2.utils.colormap import random_color
from detectron2.utils.visualizer import ColorMode, GenericMask, Visualizer, _create_text_labels
_ID_JITTERS = [[0.9047944201469568, 0.3241718265806123, 0.33443746665210006], [0.4590171386127151, 0.9095038146383864, 0.3143840671974788], [0.4769356899795538, 0.5044406738441948, 0.5354530846360839], [0.00820945625670777, 0.24099210193126785, 0.15471834055332978], [0.6195684374237388, 0.4020380013509799, 0.26100266066404676], [0.08281237756545068, 0.05900744492710419, 0.06106221202154216], [0.2264886829978755, 0.04925271007292076, 0.10214429345996079], [0.1888247470009874, 0.11275000298612425, 0.46112894830685514], [0.37415767691880975, 0.844284596118331, 0.950471611180866], [0.3817344218157631, 0.3483259270707101, 0.6572989333690541], [0.2403115731054466, 0.03078280287279167, 0.5385975692534737], [0.7035076951650824, 0.12352084932325424, 0.12873080308790197], [0.12607434914489934, 0.111244793010015, 0.09333334699716023], [0.6551607300342269, 0.7003064103554443, 0.4131794512286162], [0.13592107365596595, 0.5390702818232149, 0.004540643174930525], [0.38286244894454347, 0.709142545393449, 0.529074791609835], [0.4279376583651734, 0.5634708596431771, 0.8505569717104301], [0.3460488523902999, 0.464769595519293, 0.6676839675477276], [0.8544063246675081, 0.5041190233407755, 0.9081217697141578], [0.9207009090747208, 0.2403865944739051, 0.05375410999863772], [0.6515786136947107, 0.6299918449948327, 0.45292029442034387], [0.986174217295693, 0.2424849846977214, 0.3981993323108266], [0.22101915872994693, 0.3408589198278038, 0.006381420347677524], [0.3159785813515982, 0.1145748921741011, 0.595754317197274], [0.10263421488052715, 0.5864139253490858, 0.23908000741142432], [0.8272999391532938, 0.6123527260897751, 0.3365197327803193], [0.5269583712937912, 0.25668929554516506, 0.7888411215078127], [0.2433880265410031, 0.7240751234287827, 0.8483215810528648], [0.7254601709704898, 0.8316525547295984, 0.9325253855921963], [0.5574483824856672, 0.2935331727879944, 0.6594839453793155], [0.6209642371433579, 0.054030693198821256, 0.5080873988178534], [0.9055507077365624, 
0.12865888619203514, 0.9309191861440005], [0.9914469722960537, 0.3074114506206205, 0.8762107657323488], [0.4812682518247371, 0.15055826298548158, 0.9656340505308308], [0.6459219454316445, 0.9144794010251625, 0.751338812155106], [0.860840174209798, 0.8844626353077639, 0.3604624506769899], [0.8194991672032272, 0.926399617787601, 0.8059222327343247], [0.6540413175393658, 0.04579445254618297, 0.26891917826531275], [0.37778835833987046, 0.36247927666109536, 0.7989799305827889], [0.22738304978177726, 0.9038018263773739, 0.6970838854138303], [0.6362015495896184, 0.527680794236961, 0.5570915425178721], [0.6436401915860954, 0.6316925317144524, 0.9137151236993912], [0.04161828388587163, 0.3832413349082706, 0.6880829921949752], [0.7768167825719299, 0.8933821497682587, 0.7221278391266809], [0.8632760876301346, 0.3278628094906323, 0.8421587587114462], [0.8556499133262127, 0.6497385872901932, 0.5436895688477963], [0.9861940318610894, 0.03562313777386272, 0.9183454677106616], [0.8042586091176366, 0.6167222703170994, 0.24181981557207644], [0.9504247117633057, 0.3454233714011461, 0.6883727005547743], [0.9611909135491202, 0.46384154263898114, 0.32700443315058914], [0.523542176970206, 0.446222414615845, 0.9067402987747814], [0.7536954008682911, 0.6675512338797588, 0.22538238957839196], [0.1554052265688285, 0.05746097492966129, 0.8580358872587424], [0.8540838640971405, 0.9165504335482566, 0.6806982829158964], [0.7065090319405029, 0.8683059983962002, 0.05167128320624026], [0.39134812961899124, 0.8910075505622979, 0.7639815712623922], [0.1578117311479783, 0.20047326898284668, 0.9220177338840568], [0.2017488993096358, 0.6949259970936679, 0.8729196864798128], [0.5591089340651949, 0.15576770423813258, 0.1469857469387812], [0.14510398622626974, 0.24451497734532168, 0.46574271993578786], [0.13286397822351492, 0.4178244533944635, 0.03728728952131943], [0.556463206310225, 0.14027595183361663, 0.2731537988657907], [0.4093837966398032, 0.8015225687789814, 0.8033567296903834], [0.527442563956637, 
0.902232617214431, 0.7066626674362227], [0.9058355503297827, 0.34983989180213004, 0.8353262183839384], [0.7108382186953104, 0.08591307895133471, 0.21434688012521974], [0.22757345065207668, 0.7943075496583976, 0.2992305547627421], [0.20454109788173636, 0.8251670332103687, 0.012981987094547232], [0.7672562637297392, 0.005429019973062554, 0.022163616037108702], [0.37487345910117564, 0.5086240194440863, 0.9061216063654387], [0.9878004014101087, 0.006345852772772331, 0.17499753379350858], [0.030061528704491303, 0.1409704315546606, 0.3337131835834506], [0.5022506782611504, 0.5448435505388706, 0.40584238936140726], [0.39560774627423445, 0.8905943695833262, 0.5850815030921116], [0.058615671926786406, 0.5365713844300387, 0.1620457551256279], [0.41843842882069693, 0.1536005983609976, 0.3127878501592438], [0.05947621790155899, 0.5412421167331932, 0.2611322146455659], [0.5196159938235607, 0.7066461551682705, 0.970261497412556], [0.30443031606149007, 0.45158581060034975, 0.4331841153149706], [0.8848298403933996, 0.7241791700943656, 0.8917110054596072], [0.5720260591898779, 0.3072801598203052, 0.8891066705989902], [0.13964015336177327, 0.2531778096760302, 0.5703756837403124], [0.2156307542329836, 0.4139947500641685, 0.87051676884144], [0.10800455881891169, 0.05554646035458266, 0.2947027428551443], [0.35198009410633857, 0.365849666213808, 0.06525787683513773], [0.5223264108118847, 0.9032195574351178, 0.28579084943315025], [0.7607724246546966, 0.3087194381828555, 0.6253235528354899], [0.5060485442077824, 0.19173600467625274, 0.9931175692203702], [0.5131805830323746, 0.07719515392040577, 0.923212006754969], [0.3629762141280106, 0.02429179642710888, 0.6963754952399983], [0.7542592485456767, 0.6478893299494212, 0.3424965345400731], [0.49944574453364454, 0.6775665366832825, 0.33758796076989583], [0.010621818120767679, 0.8221571611173205, 0.5186257457566332], [0.5857910304290109, 0.7178133992025467, 0.9729243483606071], [0.16987399482717613, 0.9942570210657463, 0.18120758122552927], 
[0.016362572521240848, 0.17582788603087263, 0.7255176922640298], [0.10981764283706419, 0.9078582203470377, 0.7638063718334003], [0.9252097840441119, 0.3330197086990039, 0.27888705301420136], [0.12769972651171546, 0.11121470804891687, 0.12710743734391716], [0.5753520518360334, 0.2763862879599456, 0.6115636613363361]]
_OFF_WHITE = (1.0, 1.0, 240.0 / 255)

class TrackVisualizer(Visualizer):
    """Visualizer for video instance tracks.

    Extends detectron2's ``Visualizer`` with deterministic per-track color
    jittering so the same instance index keeps a recognizable color across
    frames. Only masks (and keypoints, if given) are drawn; boxes and labels
    are validated but intentionally not rendered.
    """

    def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
        """
        Args:
            img_rgb (ndarray): an image of shape (H, W, C) in RGB order.
            metadata (Metadata): dataset metadata (class names / colors).
            scale (float): scale factor of the visualization output.
            instance_mode (ColorMode): color policy for drawing instances.
        """
        super().__init__(
            img_rgb, metadata=metadata, scale=scale, instance_mode=instance_mode
        )
        self.cpu_device = torch.device("cpu")

    def _jitter(self, color, id):
        """
        Deterministically modifies the given color to produce a slightly different color.
        Args:
            color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
                picked. The values in the list are in the [0.0, 1.0] range.
            id (int): instance index selecting the jitter direction, so the same
                index always maps to the same jittered color.
        Returns:
            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
                color after being jittered. The values in the list are in the [0.0, 1.0] range.
        """
        color = mplc.to_rgb(color)
        # BUGFIX: wrap the index so more instances than len(_ID_JITTERS)
        # no longer raise an IndexError.
        vec = _ID_JITTERS[id % len(_ID_JITTERS)]
        # better to do it in another color space
        vec = vec / np.linalg.norm(vec) * 0.5
        res = np.clip(vec + color, 0, 1)
        return tuple(res)

    def overlay_instances(
        self,
        *,
        boxes=None,
        labels=None,
        masks=None,
        keypoints=None,
        assigned_colors=None,
        alpha=0.5
    ):
        """
        Overlay instance masks and keypoints on the image.

        Note: boxes and labels are accepted for counting/ordering consistency
        but are NOT drawn (matching this demo's original behavior).

        Args:
            boxes (Boxes, RotatedBoxes or ndarray): Nx4 XYXY_ABS boxes, or Nx5
                (x_center, y_center, width, height, angle_degrees) rotated boxes.
            labels (list[str]): per-instance text; validated but not rendered.
            masks (list[ndarray]): one binary mask of shape (H, W) per instance.
                Unlike the base class, no polygon/RLE conversion is performed.
            keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
                where N is the number of instances and K the number of keypoints.
            assigned_colors (list[matplotlib.colors]): one color per instance.
            alpha (float): mask fill opacity.
        Returns:
            output (VisImage): image object with visualizations.
        """
        num_instances = 0
        if boxes is not None:
            boxes = self._convert_boxes(boxes)
            num_instances = len(boxes)
        if masks is not None:
            # Masks arrive as binary (H, W) arrays already; no conversion here.
            if num_instances:
                assert len(masks) == num_instances
            else:
                num_instances = len(masks)
        if keypoints is not None:
            if num_instances:
                assert len(keypoints) == num_instances
            else:
                num_instances = len(keypoints)
            keypoints = self._convert_keypoints(keypoints)
        if labels is not None:
            assert len(labels) == num_instances
        if assigned_colors is None:
            # BUGFIX: the old call `random_color(ii, rgb=True, maximum=1)` passed
            # an extra positional argument and raised a TypeError against
            # detectron2's `random_color(rgb=..., maximum=...)` signature;
            # `random_color` was also never imported in this module.
            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
        if num_instances == 0:
            return self.output
        if boxes is not None and boxes.shape[1] == 5:
            return self.overlay_rotated_instances(
                boxes=boxes, labels=labels, assigned_colors=assigned_colors
            )

        # Display in largest-to-smallest order to reduce occlusion.
        areas = None
        if boxes is not None:
            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
        elif masks is not None:
            areas = np.asarray([x.sum() for x in masks])
        if areas is not None:
            sorted_idxs = np.argsort(-areas).tolist()
            # Re-order overlapped instances in descending order of area.
            boxes = boxes[sorted_idxs] if boxes is not None else None
            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
            masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
            keypoints = keypoints[sorted_idxs] if keypoints is not None else None

        # Draw only the masks; an unreachable `if False:` label/box-drawing
        # branch (which referenced undefined names) has been removed.
        for i in range(num_instances):
            if masks is not None:
                self.draw_binary_mask(
                    masks[i].astype(np.uint8),
                    color=assigned_colors[i],
                    edge_color=None,
                    alpha=alpha,
                )

        # draw keypoints
        if keypoints is not None:
            for keypoints_per_instance in keypoints:
                self.draw_and_connect_keypoints(keypoints_per_instance)
        return self.output

    def draw_instance_predictions(self, predictions):
        """
        Draw instance-level prediction results on an image.
        Args:
            predictions (Instances): the output of an instance detection/segmentation
                model. Following fields will be used to draw:
                "pred_boxes", "pred_classes", "scores", "pred_masks".
        Returns:
            output (VisImage): image object with visualizations.
        """
        preds = predictions.to(self.cpu_device)
        boxes = preds.pred_boxes if preds.has("pred_boxes") else None
        scores = preds.scores if preds.has("scores") else None
        classes = preds.pred_classes if preds.has("pred_classes") else None
        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
        if labels is not None:
            # Prefix each label with its track index, e.g. "[3] person 97%".
            labels = ["[{}] ".format(_id) + l for _id, l in enumerate(labels)]
        # BUGFIX: removed a leftover debug print from this branch.
        masks = np.asarray(preds.pred_masks) if preds.has("pred_masks") else None
        if classes is None:
            return self.output
        # Per-track colors: the metadata class color jittered by track index.
        colors = [
            self._jitter([x / 255 for x in self.metadata.thing_colors[c]], idx)
            for idx, c in enumerate(classes)
        ]
        alpha = 0.5
        if self._instance_mode == ColorMode.IMAGE_BW:
            # Grayscale everything outside the union of predicted masks.
            self.output.img = self._create_grayscale_image(
                (preds.pred_masks.any(dim=0) > 0).numpy()
                if preds.has("pred_masks")
                else None
            )
            alpha = 0.3
        self.overlay_instances(
            masks=masks,
            boxes=boxes,
            labels=labels,
            assigned_colors=colors,
            alpha=alpha,
        )
        return self.output


================================================
FILE: mask2former/__init__.py
================================================
from . import data  # register all new datasets
from . import modeling

# config
from .config import add_maskformer2_config

# dataset loading
from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
from .data.dataset_mappers.mask_former_instance_dataset_mapper import (
    MaskFormerInstanceDatasetMapper,
)
from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
    MaskFormerPanopticDatasetMapper,
)
from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
    MaskFormerSemanticDatasetMapper,
)

# models
from .maskformer_model import MaskFormer
from .test_time_augmentation import SemanticSegmentorWithTTA

# evaluation
from .evaluation.instance_evaluation import InstanceSegEvaluator


================================================
FILE: mask2former/config.py
================================================
# -*- coding: utf-8 -*-
from detectron2.config import CfgNode as CN


def add_maskformer2_config(cfg):
    """
    Add config for MASK_FORMER.

    Registers, in place, every config key used by MaskFormer/Mask2Former on a
    detectron2 ``CfgNode``: input pipeline, solver, transformer decoder,
    inference switches, pixel decoder, Swin backbone, and the point-sampling
    loss options. Values set here are defaults; YAML configs override them.

    Args:
        cfg: a detectron2 ``CfgNode``; mutated in place, nothing is returned.
    """
    # NOTE: configs from original maskformer
    # data config
    # select the dataset mapper
    cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
    # Color augmentation
    cfg.INPUT.COLOR_AUG_SSD = False
    # We retry random cropping until no single category in semantic segmentation GT occupies more
    # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
    cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
    # Pad image and segmentation GT in dataset mapper (-1 disables padding here).
    cfg.INPUT.SIZE_DIVISIBILITY = -1

    # solver config
    # weight decay on embedding
    cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
    # optimizer
    cfg.SOLVER.OPTIMIZER = "ADAMW"
    # learning-rate multiplier applied to backbone parameters
    cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1

    # mask_former model config
    cfg.MODEL.MASK_FORMER = CN()

    # loss: deep supervision flag and weights for the no-object class,
    # classification, dice, and mask (BCE) terms
    cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True
    cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1
    cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0
    cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0
    cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0

    # transformer config
    cfg.MODEL.MASK_FORMER.NHEADS = 8
    cfg.MODEL.MASK_FORMER.DROPOUT = 0.1
    cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048
    cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0
    cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6
    cfg.MODEL.MASK_FORMER.PRE_NORM = False

    cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256
    cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100

    cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5"
    cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False

    # mask_former inference config
    cfg.MODEL.MASK_FORMER.TEST = CN()
    cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True
    cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False
    cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False
    cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0
    cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0
    cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False

    # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
    # you can use this config to override
    cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32

    # pixel decoder config
    cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
    # adding transformer in pixel decoder
    cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
    # pixel decoder
    cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"

    # swin transformer backbone
    cfg.MODEL.SWIN = CN()
    cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
    cfg.MODEL.SWIN.PATCH_SIZE = 4
    cfg.MODEL.SWIN.EMBED_DIM = 96
    cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
    cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
    cfg.MODEL.SWIN.WINDOW_SIZE = 7
    cfg.MODEL.SWIN.MLP_RATIO = 4.0
    cfg.MODEL.SWIN.QKV_BIAS = True
    cfg.MODEL.SWIN.QK_SCALE = None
    cfg.MODEL.SWIN.DROP_RATE = 0.0
    cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
    cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
    cfg.MODEL.SWIN.APE = False
    cfg.MODEL.SWIN.PATCH_NORM = True
    cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
    # gradient checkpointing to trade compute for memory
    cfg.MODEL.SWIN.USE_CHECKPOINT = False

    # NOTE: maskformer2 extra configs
    # transformer module
    cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder"

    # LSJ (large-scale jitter) aug
    cfg.INPUT.IMAGE_SIZE = 1024
    cfg.INPUT.MIN_SCALE = 0.1
    cfg.INPUT.MAX_SCALE = 2.0

    # MSDeformAttn encoder configs
    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8

    # point loss configs
    # Number of points sampled during training for a mask point head.
    cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112
    # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
    # original paper.
    cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0
    # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in
    # the original paper.
    cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75


================================================
FILE: mask2former/data/__init__.py
================================================
from . import datasets


================================================
FILE: mask2former/data/dataset_mappers/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.


================================================
FILE: mask2former/data/dataset_mappers/__init__.py.new
================================================


================================================
FILE: mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging

import numpy as np
import torch

from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Instances

from pycocotools import mask as coco_mask

__all__ = ["COCOInstanceNewBaselineDatasetMapper"]

def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
    """
    Replace each mask with the filled bounding box of its foreground, in place.

    Despite the historical name, this does NOT return ``[N, 4]`` box
    coordinates: it converts every mask into a "box mask" in which the whole
    axis-aligned bounding box of the nonzero pixels is set to 1 (used for
    box-supervised training — TODO confirm against callers).

    Args:
        masks (Tensor[N, H, W]): masks to convert; modified in place.

    Returns:
        Tensor[N, H, W]: the same tensor, with each mask's bounding-box region
        filled with ones. Empty masks (no foreground) are left untouched; an
        empty batch is returned unchanged.
    """
    if masks.numel() == 0:
        return masks

    for index, mask in enumerate(masks):
        y, x = torch.where(mask != 0)
        if len(x) == 0:
            # No foreground pixels: leave this mask all-zero.
            continue

        # Slice ends are exclusive in PyTorch, so add 1 to include the
        # bottom row and right column of the bounding box in the fill.
        masks[index, torch.min(y):torch.max(y) + 1, torch.min(x):torch.max(x) + 1] = 1.0

    return masks

def convert_coco_poly_to_mask(segmentations, height, width):
    """Decode COCO polygon annotations into box-filled binary masks.

    Each entry in ``segmentations`` is rasterized with pycocotools, its
    sub-polygon bitmaps are merged with a logical OR, and the stacked result
    is passed through ``masks_to_boxes`` to fill each bounding-box region.

    Returns:
        Tensor[N, H, W] of box-filled masks, or an empty uint8 tensor of
        shape (0, height, width) when ``segmentations`` is empty.
    """
    decoded = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        bitmap = coco_mask.decode(rles)
        if bitmap.ndim < 3:
            # A single polygon decodes to HxW; add the sub-polygon axis.
            bitmap = bitmap[..., None]
        merged = torch.as_tensor(bitmap, dtype=torch.uint8).any(dim=2)
        decoded.append(merged)

    if not decoded:
        return torch.zeros((0, height, width), dtype=torch.uint8)

    return masks_to_boxes(torch.stack(decoded, dim=0))


def build_transform_gen(cfg, is_train):
    """
    Build the default list of training :class:`Augmentation`\\ s from config:
    an optional random flip, a large-scale-jitter resize, and a fixed-size
    square crop to ``cfg.INPUT.IMAGE_SIZE``.

    Returns:
        list[Augmentation]
    """
    assert is_train, "Only support training augmentation"
    image_size = cfg.INPUT.IMAGE_SIZE

    augmentation = []

    flip_mode = cfg.INPUT.RANDOM_FLIP
    if flip_mode != "none":
        augmentation.append(
            T.RandomFlip(
                horizontal=(flip_mode == "horizontal"),
                vertical=(flip_mode == "vertical"),
            )
        )

    augmentation.append(
        T.ResizeScale(
            min_scale=cfg.INPUT.MIN_SCALE,
            max_scale=cfg.INPUT.MAX_SCALE,
            target_height=image_size,
            target_width=image_size,
        )
    )
    augmentation.append(T.FixedSizeCrop(crop_size=(image_size, image_size)))

    return augmentation


# This is specifically designed for the COCO dataset.
class COCOInstanceNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer.

    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors

    Note: at train time, polygon ground-truth masks are converted into
    box-filled masks via ``convert_coco_poly_to_mask`` / ``masks_to_boxes``
    before being stored in ``dataset_dict["instances"]``.
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            tfm_gens: data augmentation
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
        )

        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentation
        tfm_gens = build_transform_gen(cfg, is_train)

        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg.INPUT.FORMAT,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        # TODO: get padding mask
        # by feeding a "segmentation mask" to the same transforms
        padding_mask = np.ones(image.shape[:2])

        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        # the crop transformation has default padding value 0 for segmentation,
        # so after inversion the mask is True exactly where padding was added
        padding_mask = transforms.apply_segmentation(padding_mask)
        padding_mask = ~ padding_mask.astype(bool)

        image_shape = image.shape[:2]  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))

        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            dataset_dict.pop("annotations", None)
            return dataset_dict

        if "annotations" in dataset_dict:
            # USER: Modify this if you want to keep them for some reason.
            for anno in dataset_dict["annotations"]:
                # Let's always keep mask
                # if not self.mask_on:
                #     anno.pop("segmentation", None)
                anno.pop("keypoints", None)

            # USER: Implement additional transformations if you have other types of data
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in dataset_dict.pop("annotations")
                if obj.get("iscrowd", 0) == 0
            ]
            # NOTE: does not support BitMask due to augmentation
            # Current BitMask cannot handle empty objects
            instances = utils.annotations_to_instances(annos, image_shape)
            # After transforms such as cropping are applied, the bounding box may no longer
            # tightly bound the object. As an example, imagine a triangle object
            # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
            # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
            # the intersection of original bounding box and the cropping box.
            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
            # Need to filter empty instances first (due to augmentation)
            instances = utils.filter_empty_instances(instances)
            # Generate masks from polygon
            h, w = instances.image_size
            # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
            if hasattr(instances, 'gt_masks'):
                # Replace tight polygon masks with box-filled bitmasks
                # (see convert_coco_poly_to_mask / masks_to_boxes above).
                gt_masks = instances.gt_masks
                gt_masks_box = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
                instances.gt_masks = gt_masks_box
            dataset_dict["instances"] = instances

        return dataset_dict


================================================
FILE: mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py
================================================
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging

import numpy as np
import torch

from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Boxes, Instances

__all__ = ["COCOPanopticNewBaselineDatasetMapper"]


def build_transform_gen(cfg, is_train):
    """
    Assemble the default training augmentations from config. The pipeline is:
    optional random flip, then large-scale-jitter resize, then a fixed-size
    square crop to ``cfg.INPUT.IMAGE_SIZE``.

    Returns:
        list[Augmentation]
    """
    assert is_train, "Only support training augmentation"
    size = cfg.INPUT.IMAGE_SIZE

    gens = []
    if cfg.INPUT.RANDOM_FLIP != "none":
        gens.append(
            T.RandomFlip(
                horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
                vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
            )
        )
    gens += [
        T.ResizeScale(
            min_scale=cfg.INPUT.MIN_SCALE,
            max_scale=cfg.INPUT.MAX_SCALE,
            target_height=size,
            target_width=size,
        ),
        T.FixedSizeCrop(crop_size=(size, size)),
    ]
    return gens


# This is specifically designed for the COCO dataset.
class COCOPanopticNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer.

    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            crop_gen: crop augmentation
            tfm_gens: data augmentation
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
                str(self.tfm_gens)
            )
        )

        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentation
        tfm_gens = build_transform_gen(cfg, is_train)

        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg.INPUT.FORMAT,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        image_shape = image.shape[:2]  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))

        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            dataset_dict.pop("annotations", None)
            return dataset_dict

        if "pan_seg_file_name" in dataset_dict:
            pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
            segments_info = dataset_dict["segments_info"]

            # apply the same transformation to panoptic segmentation
            pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)

            # local import: panopticapi is only needed on the training path
            from panopticapi.utils import rgb2id

            # convert RGB-encoded panoptic PNG into per-pixel segment ids
            pan_seg_gt = rgb2id(pan_seg_gt)

            instances = Instances(image_shape)
            classes = []
            masks = []
            # one binary mask per non-crowd segment
            for segment_info in segments_info:
                class_id = segment_info["category_id"]
                if not segment_info["iscrowd"]:
                    classes.append(class_id)
                    masks.append(pan_seg_gt == segment_info["id"])

            classes = np.array(classes)
            instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
            if len(masks) == 0:
                # Some image does not have annotation (all ignored)
                instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
                instances.gt_boxes = Boxes(torch.zeros((0, 4)))
            else:
                masks = BitMasks(
                    torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
                )
                instances.gt_masks = masks.tensor
                # derive tight boxes from the transformed masks
                instances.gt_boxes = masks.get_bounding_boxes()

            dataset_dict["instances"] = instances

        return dataset_dict


================================================
FILE: mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py
================================================
import copy
import logging

import numpy as np
import pycocotools.mask as mask_util
import torch
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.projects.point_rend import ColorAugSSDTransform
from detectron2.structures import BitMasks, Instances, polygons_to_bitmask

__all__ = ["MaskFormerInstanceDatasetMapper"]


class MaskFormerInstanceDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer for instance segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            size_divisibility: pad image size to be divisible by this value
        """
        self.is_train = is_train
        self.tfm_gens = augmentations
        self.img_format = image_format
        self.size_divisibility = size_divisibility

        logger = logging.getLogger(__name__)
        mode = "training" if is_train else "inference"
        logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")

    @classmethod
    def from_config(cls, cfg, is_train=True):
        """Build the mapper's constructor arguments from a detectron2 config."""
        # Build augmentation: shortest-edge resize, optional crop, optional
        # SSD-style color jitter, and a random flip.
        augs = [
            T.ResizeShortestEdge(
                cfg.INPUT.MIN_SIZE_TRAIN,
                cfg.INPUT.MAX_SIZE_TRAIN,
                cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
            )
        ]
        if cfg.INPUT.CROP.ENABLED:
            augs.append(
                T.RandomCrop(
                    cfg.INPUT.CROP.TYPE,
                    cfg.INPUT.CROP.SIZE,
                )
            )
        if cfg.INPUT.COLOR_AUG_SSD:
            augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
        augs.append(T.RandomFlip())

        ret = {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        # BUGFIX: the assertion message used to name MaskFormerPanopticDatasetMapper
        # (copy-paste from the panoptic mapper); it now names this class.
        assert self.is_train, "MaskFormerInstanceDatasetMapper should only be used for training!"

        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        aug_input = T.AugInput(image)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image

        # transform instance masks
        assert "annotations" in dataset_dict
        for anno in dataset_dict["annotations"]:
            anno.pop("keypoints", None)

        # Apply the geometric transforms to every non-crowd annotation.
        annos = [
            utils.transform_instance_annotations(obj, transforms, image.shape[:2])
            for obj in dataset_dict.pop("annotations")
            if obj.get("iscrowd", 0) == 0
        ]

        if len(annos):
            assert "segmentation" in annos[0]
        segms = [obj["segmentation"] for obj in annos]
        # Normalize every supported segmentation format to a binary HxW mask.
        masks = []
        for segm in segms:
            if isinstance(segm, list):
                # polygon
                masks.append(polygons_to_bitmask(segm, *image.shape[:2]))
            elif isinstance(segm, dict):
                # COCO RLE
                masks.append(mask_util.decode(segm))
            elif isinstance(segm, np.ndarray):
                assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                    segm.ndim
                )
                # mask array
                masks.append(segm)
            else:
                raise ValueError(
                    "Cannot convert segmentation of type '{}' to BitMasks!"
                    "Supported types are: polygons as list[list[float] or ndarray],"
                    " COCO-style RLE as a dict, or a binary segmentation mask "
                    " in a 2D numpy array of shape HxW.".format(type(segm))
                )

        # Pad image and segmentation label here!
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks]

        classes = [int(obj["category_id"]) for obj in annos]
        classes = torch.tensor(classes, dtype=torch.int64)

        if self.size_divisibility > 0:
            image_size = (image.shape[-2], image.shape[-1])
            # NOTE(review): this pads to a fixed (size_divisibility x size_divisibility)
            # canvas rather than rounding up to a multiple; assumes the augmented image
            # never exceeds size_divisibility (a negative pad would crop) — confirm
            # against the configured crop/resize sizes.
            padding_size = [
                0,
                self.size_divisibility - image_size[1],
                0,
                self.size_divisibility - image_size[0],
            ]
            # pad image (128 matches the gray padding used elsewhere in this repo)
            image = F.pad(image, padding_size, value=128).contiguous()
            # pad mask
            masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks]

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = image

        # Prepare per-category binary masks
        instances = Instances(image_shape)
        instances.gt_classes = classes
        if len(masks) == 0:
            # Some image does not have annotation (all ignored)
            instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1]))
        else:
            masks = BitMasks(torch.stack(masks))
            instances.gt_masks = masks.tensor

        dataset_dict["instances"] = instances

        return dataset_dict


================================================
FILE: mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py
================================================
import copy
import logging

import numpy as np
import torch
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.structures import BitMasks, Instances

from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper

__all__ = ["MaskFormerPanopticDatasetMapper"]


class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer for panoptic segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        ignore_label,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            ignore_label: the label that is ignored to evaluation
            size_divisibility: pad image size to be divisible by this value
        """
        # Reuse the semantic mapper's setup; this subclass only changes __call__.
        super().__init__(
            is_train,
            augmentations=augmentations,
            image_format=image_format,
            ignore_label=ignore_label,
            size_divisibility=size_divisibility,
        )

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"

        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        # semantic segmentation
        if "sem_seg_file_name" in dataset_dict:
            # PyTorch transformation not implemented for uint16, so converting it to double first
            sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
        else:
            sem_seg_gt = None

        # panoptic segmentation
        if "pan_seg_file_name" in dataset_dict:
            pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
            segments_info = dataset_dict["segments_info"]
        else:
            pan_seg_gt = None
            segments_info = None

        if pan_seg_gt is None:
            raise ValueError(
                "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format(
                    dataset_dict["file_name"]
                )
            )

        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image
        if sem_seg_gt is not None:
            sem_seg_gt = aug_input.sem_seg

        # apply the same transformation to panoptic segmentation
        pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)

        # local import: panopticapi is only needed on the training path
        from panopticapi.utils import rgb2id

        # convert RGB-encoded panoptic PNG into per-pixel segment ids
        pan_seg_gt = rgb2id(pan_seg_gt)

        # Pad image and segmentation label here!
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        if sem_seg_gt is not None:
            sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
        pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long"))

        if self.size_divisibility > 0:
            image_size = (image.shape[-2], image.shape[-1])
            # NOTE(review): pads to a fixed (size_divisibility x size_divisibility)
            # canvas; assumes the augmented image never exceeds it — confirm.
            padding_size = [
                0,
                self.size_divisibility - image_size[1],
                0,
                self.size_divisibility - image_size[0],
            ]
            image = F.pad(image, padding_size, value=128).contiguous()
            if sem_seg_gt is not None:
                sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
            pan_seg_gt = F.pad(
                pan_seg_gt, padding_size, value=0
            ).contiguous()  # 0 is the VOID panoptic label

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = image
        if sem_seg_gt is not None:
            dataset_dict["sem_seg"] = sem_seg_gt.long()

        if "annotations" in dataset_dict:
            # BUGFIX: message used to read "Pemantic" (typo carried over from the
            # semantic mapper's "Semantic ..." message).
            raise ValueError("Panoptic segmentation dataset should not have 'annotations'.")

        # Prepare per-category binary masks
        pan_seg_gt = pan_seg_gt.numpy()
        instances = Instances(image_shape)
        classes = []
        masks = []
        # one binary mask per non-crowd segment
        for segment_info in segments_info:
            class_id = segment_info["category_id"]
            if not segment_info["iscrowd"]:
                classes.append(class_id)
                masks.append(pan_seg_gt == segment_info["id"])

        classes = np.array(classes)
        instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
        if len(masks) == 0:
            # Some image does not have annotation (all ignored)
            instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
        else:
            masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
            )
            instances.gt_masks = masks.tensor

        dataset_dict["instances"] = instances

        return dataset_dict


================================================
FILE: mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py
================================================
import copy
import logging

import numpy as np
import torch
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.projects.point_rend import ColorAugSSDTransform
from detectron2.structures import BitMasks, Instances

__all__ = ["MaskFormerSemanticDatasetMapper"]


class MaskFormerSemanticDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer for semantic segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        ignore_label,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            ignore_label: the label value ignored during evaluation; also used as the
                pad value for the segmentation label map
            size_divisibility: pad image size to be divisible by this value
        """
        self.is_train = is_train
        self.tfm_gens = augmentations
        self.img_format = image_format
        self.ignore_label = ignore_label
        self.size_divisibility = size_divisibility

        logger = logging.getLogger(__name__)
        mode = "training" if is_train else "inference"
        logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")

    @classmethod
    def from_config(cls, cfg, is_train=True):
        """Build constructor arguments for this mapper from a detectron2 config node."""
        # Augmentation pipeline: resize shortest edge, optional category-area-constrained
        # crop, optional SSD-style color jitter, then random flip.
        augs = [
            T.ResizeShortestEdge(
                cfg.INPUT.MIN_SIZE_TRAIN,
                cfg.INPUT.MAX_SIZE_TRAIN,
                cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
            )
        ]
        if cfg.INPUT.CROP.ENABLED:
            augs.append(
                T.RandomCrop_CategoryAreaConstraint(
                    cfg.INPUT.CROP.TYPE,
                    cfg.INPUT.CROP.SIZE,
                    cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
                    cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
                )
            )
        if cfg.INPUT.COLOR_AUG_SSD:
            augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
        augs.append(T.RandomFlip())

        # Assume always applies to the training set.
        dataset_names = cfg.DATASETS.TRAIN
        meta = MetadataCatalog.get(dataset_names[0])
        ignore_label = meta.ignore_label

        return {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "ignore_label": ignore_label,
            "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
        }

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept

        Raises:
            ValueError: if "sem_seg_file_name" is missing or "annotations" is present.
        """
        assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!"

        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        # Fail fast: a semantic segmentation sample must carry its label map.
        # (The original code read it conditionally and then raised on None, leaving
        # several unreachable `is not None` branches below; they are removed here.)
        if "sem_seg_file_name" not in dataset_dict:
            raise ValueError(
                "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
                    dataset_dict["file_name"]
                )
            )
        # PyTorch transformation not implemented for uint16, so converting it to double first
        sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")

        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image
        sem_seg_gt = aug_input.sem_seg

        # Pad image and segmentation label here!
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        # "int64" instead of the "long" alias: "long" was platform-dependent and was
        # removed as a dtype string in NumPy 2.0; the later `.long()` cast produced
        # int64 anyway, so behavior is unchanged.
        sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("int64"))

        if self.size_divisibility > 0:
            image_size = (image.shape[-2], image.shape[-1])
            # NOTE(review): if a side exceeds size_divisibility the pad is negative and
            # F.pad crops instead — callers are presumably expected to crop to at most
            # `size_divisibility` beforehand; confirm against the training config.
            padding_size = [
                0,
                self.size_divisibility - image_size[1],
                0,
                self.size_divisibility - image_size[0],
            ]
            image = F.pad(image, padding_size, value=128).contiguous()
            sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = image
        dataset_dict["sem_seg"] = sem_seg_gt.long()

        if "annotations" in dataset_dict:
            raise ValueError("Semantic segmentation dataset should not have 'annotations'.")

        # Prepare per-category binary masks
        dataset_dict["instances"] = self._build_instances(sem_seg_gt.numpy(), image_shape)

        return dataset_dict

    def _build_instances(self, sem_seg_gt, image_shape):
        """Convert an HxW integer label map into Instances with one binary mask per category."""
        instances = Instances(image_shape)
        classes = np.unique(sem_seg_gt)
        # remove ignored region
        classes = classes[classes != self.ignore_label]
        instances.gt_classes = torch.tensor(classes, dtype=torch.int64)

        masks = [sem_seg_gt == class_id for class_id in classes]
        if len(masks) == 0:
            # Some image does not have annotation (all ignored)
            instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1]))
        else:
            masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
            )
            instances.gt_masks = masks.tensor
        return instances


================================================
FILE: mask2former/data/datasets/__init__.py
================================================
from . import (
    register_ade20k_full,
    register_ade20k_panoptic,
    register_coco_stuff_10k,
    register_mapillary_vistas,
    register_coco_panoptic_annos_semseg,
    register_ade20k_instance,
    register_mapillary_vistas_panoptic,
)


================================================
FILE: mask2former/data/datasets/register_ade20k_full.py
================================================
import os

from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import load_sem_seg

ADE20K_SEM_SEG_FULL_CATEGORIES = [
    {"name": "wall", "id": 2978, "trainId": 0},
    {"name": "building, edifice", "id": 312, "trainId": 1},
    {"name": "sky", "id": 2420, "trainId": 2},
    {"name": "tree", "id": 2855, "trainId": 3},
    {"name": "road, route", "id": 2131, "trainId": 4},
    {"name": "floor, flooring", "id": 976, "trainId": 5},
    {"name": "ceiling", "id": 447, "trainId": 6},
    {"name": "bed", "id": 165, "trainId": 7},
    {"name": "sidewalk, pavement", "id": 2377, "trainId": 8},
    {"name": "earth, ground", "id": 838, "trainId": 9},
    {"name": "cabinet", "id": 350, "trainId": 10},
    {"name": "person, individual, someone, somebody, mortal, soul", "id": 1831, "trainId": 11},
    {"name": "grass", "id": 1125, "trainId": 12},
    {"name": "windowpane, window", "id": 3055, "trainId": 13},
    {"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14},
    {"name": "mountain, mount", "id": 1610, "trainId": 15},
    {"name": "plant, flora, plant life", "id": 1910, "trainId": 16},
    {"name": "table", "id": 2684, "trainId": 17},
    {"name": "chair", "id": 471, "trainId": 18},
    {"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19},
    {"name": "door", "id": 774, "trainId": 20},
    {"name": "sofa, couch, lounge", "id": 2473, "trainId": 21},
    {"name": "sea", "id": 2264, "trainId": 22},
    {"name": "painting, picture", "id": 1735, "trainId": 23},
    {"name": "water", "id": 2994, "trainId": 24},
    {"name": "mirror", "id": 1564, "trainId": 25},
    {"name": "house", "id": 1276, "trainId": 26},
    {"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27},
    {"name": "shelf", "id": 2329, "trainId": 28},
    {"name": "armchair", "id": 57, "trainId": 29},
    {"name": "fence, fencing", "id": 907, "trainId": 30},
    {"name": "field", "id": 913, "trainId": 31},
    {"name": "lamp", "id": 1395, "trainId": 32},
    {"name": "rock, stone", "id": 2138, "trainId": 33},
    {"name": "seat", "id": 2272, "trainId": 34},
    {"name": "river", "id": 2128, "trainId": 35},
    {"name": "desk", "id": 724, "trainId": 36},
    {"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37},
    {"name": "railing, rail", "id": 2053, "trainId": 38},
    {"name": "signboard, sign", "id": 2380, "trainId": 39},
    {"name": "cushion", "id": 689, "trainId": 40},
    {"name": "path", "id": 1788, "trainId": 41},
    {"name": "work surface", "id": 3087, "trainId": 42},
    {"name": "stairs, steps", "id": 2530, "trainId": 43},
    {"name": "column, pillar", "id": 581, "trainId": 44},
    {"name": "sink", "id": 2388, "trainId": 45},
    {"name": "wardrobe, closet, press", "id": 2985, "trainId": 46},
    {"name": "snow", "id": 2454, "trainId": 47},
    {"name": "refrigerator, icebox", "id": 2096, "trainId": 48},
    {"name": "base, pedestal, stand", "id": 137, "trainId": 49},
    {"name": "bridge, span", "id": 294, "trainId": 50},
    {"name": "blind, screen", "id": 212, "trainId": 51},
    {"name": "runway", "id": 2185, "trainId": 52},
    {"name": "cliff, drop, drop-off", "id": 524, "trainId": 53},
    {"name": "sand", "id": 2212, "trainId": 54},
    {"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55},
    {"name": "pillow", "id": 1869, "trainId": 56},
    {"name": "screen door, screen", "id": 2251, "trainId": 57},
    {"name": "toilet, can, commode, crapper, pot, potty, stool, throne", "id": 2793, "trainId": 58},
    {"name": "skyscraper", "id": 2423, "trainId": 59},
    {"name": "grandstand, covered stand", "id": 1121, "trainId": 60},
    {"name": "box", "id": 266, "trainId": 61},
    {"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62},
    {"name": "palm, palm tree", "id": 1744, "trainId": 63},
    {"name": "double door", "id": 783, "trainId": 64},
    {"name": "coffee table, cocktail table", "id": 571, "trainId": 65},
    {"name": "counter", "id": 627, "trainId": 66},
    {"name": "countertop", "id": 629, "trainId": 67},
    {"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68},
    {"name": "kitchen island", "id": 1374, "trainId": 69},
    {"name": "boat", "id": 223, "trainId": 70},
    {"name": "waterfall, falls", "id": 3016, "trainId": 71},
    {
        "name": "stove, kitchen stove, range, kitchen range, cooking stove",
        "id": 2598,
        "trainId": 72,
    },
    {"name": "flower", "id": 978, "trainId": 73},
    {"name": "bookcase", "id": 239, "trainId": 74},
    {"name": "controls", "id": 608, "trainId": 75},
    {"name": "book", "id": 236, "trainId": 76},
    {"name": "stairway, staircase", "id": 2531, "trainId": 77},
    {"name": "streetlight, street lamp", "id": 2616, "trainId": 78},
    {
        "name": "computer, computing machine, computing device, data processor, electronic computer, information processing system",
        "id": 591,
        "trainId": 79,
    },
    {
        "name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle",
        "id": 327,
        "trainId": 80,
    },
    {"name": "swivel chair", "id": 2679, "trainId": 81},
    {"name": "light, light source", "id": 1451, "trainId": 82},
    {"name": "bench", "id": 181, "trainId": 83},
    {"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84},
    {"name": "towel", "id": 2821, "trainId": 85},
    {"name": "fountain", "id": 1023, "trainId": 86},
    {"name": "embankment", "id": 855, "trainId": 87},
    {
        "name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box",
        "id": 2733,
        "trainId": 88,
    },
    {"name": "van", "id": 2928, "trainId": 89},
    {"name": "hill", "id": 1240, "trainId": 90},
    {"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91},
    {"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92},
    {"name": "truck, motortruck", "id": 2880, "trainId": 93},
    {"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94},
    {"name": "pole", "id": 1936, "trainId": 95},
    {"name": "tower", "id": 2828, "trainId": 96},
    {"name": "court", "id": 631, "trainId": 97},
    {"name": "ball", "id": 103, "trainId": 98},
    {
        "name": "aircraft carrier, carrier, flattop, attack aircraft carrier",
        "id": 3144,
        "trainId": 99,
    },
    {"name": "buffet, counter, sideboard", "id": 308, "trainId": 100},
    {"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101},
    {"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102},
    {"name": "minibike, motorbike", "id": 1563, "trainId": 103},
    {"name": "animal, animate being, beast, brute, creature, fauna", "id": 29, "trainId": 104},
    {"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105},
    {"name": "step, stair", "id": 2569, "trainId": 106},
    {"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107},
    {"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108},
    {"name": "doorframe, doorcase", "id": 778, "trainId": 109},
    {"name": "sconce", "id": 2243, "trainId": 110},
    {"name": "pond", "id": 1941, "trainId": 111},
    {"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112},
    {"name": "bannister, banister, balustrade, balusters, handrail", "id": 120, "trainId": 113},
    {"name": "bag", "id": 95, "trainId": 114},
    {"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115},
    {"name": "gazebo", "id": 1087, "trainId": 116},
    {"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117},
    {"name": "land, ground, soil", "id": 1401, "trainId": 118},
    {"name": "board, plank", "id": 220, "trainId": 119},
    {"name": "arcade machine", "id": 47, "trainId": 120},
    {"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121},
    {"name": "bar", "id": 123, "trainId": 122},
    {"name": "stall, stand, sales booth", "id": 2537, "trainId": 123},
    {"name": "playground", "id": 1927, "trainId": 124},
    {"name": "ship", "id": 2337, "trainId": 125},
    {"name": "ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126},
    {
        "name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
        "id": 64,
        "trainId": 127,
    },
    {"name": "bottle", "id": 249, "trainId": 128},
    {"name": "cradle", "id": 642, "trainId": 129},
    {"name": "pot, flowerpot", "id": 1981, "trainId": 130},
    {
        "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
        "id": 609,
        "trainId": 131,
    },
    {"name": "train, railroad train", "id": 2840, "trainId": 132},
    {"name": "stool", "id": 2586, "trainId": 133},
    {"name": "lake", "id": 1393, "trainId": 134},
    {"name": "tank, storage tank", "id": 2704, "trainId": 135},
    {"name": "ice, water ice", "id": 1304, "trainId": 136},
    {"name": "basket, handbasket", "id": 146, "trainId": 137},
    {"name": "manhole", "id": 1494, "trainId": 138},
    {"name": "tent, collapsible shelter", "id": 2739, "trainId": 139},
    {"name": "canopy", "id": 389, "trainId": 140},
    {"name": "microwave, microwave oven", "id": 1551, "trainId": 141},
    {"name": "barrel, cask", "id": 131, "trainId": 142},
    {"name": "dirt track", "id": 738, "trainId": 143},
    {"name": "beam", "id": 161, "trainId": 144},
    {"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145},
    {"name": "plate", "id": 1919, "trainId": 146},
    {"name": "screen, crt screen", "id": 3109, "trainId": 147},
    {"name": "ruins", "id": 2179, "trainId": 148},
    {"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149},
    {"name": "blanket, cover", "id": 206, "trainId": 150},
    {"name": "plaything, toy", "id": 1930, "trainId": 151},
    {"name": "food, solid food", "id": 1002, "trainId": 152},
    {"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153},
    {"name": "oven", "id": 1708, "trainId": 154},
    {"name": "stage", "id": 2526, "trainId": 155},
    {"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156},
    {"name": "umbrella", "id": 2901, "trainId": 157},
    {"name": "sculpture", "id": 2262, "trainId": 158},
    {"name": "aqueduct", "id": 44, "trainId": 159},
    {"name": "container", "id": 597, "trainId": 160},
    {"name": "scaffolding, staging", "id": 2235, "trainId": 161},
    {"name": "hood, exhaust hood", "id": 1260, "trainId": 162},
    {"name": "curb, curbing, kerb", "id": 682, "trainId": 163},
    {"name": "roller coaster", "id": 2151, "trainId": 164},
    {"name": "horse, equus caballus", "id": 3107, "trainId": 165},
    {"name": "catwalk", "id": 432, "trainId": 166},
    {"name": "glass, drinking glass", "id": 1098, "trainId": 167},
    {"name": "vase", "id": 2932, "trainId": 168},
    {"name": "central reservation", "id": 461, "trainId": 169},
    {"name": "carousel", "id": 410, "trainId": 170},
    {"name": "radiator", "id": 2046, "trainId": 171},
    {"name": "closet", "id": 533, "trainId": 172},
    {"name": "machine", "id": 1481, "trainId": 173},
    {"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174},
    {"name": "fan", "id": 894, "trainId": 175},
    {"name": "inflatable bounce game", "id": 1322, "trainId": 176},
    {"name": "pitch", "id": 1891, "trainId": 177},
    {"name": "paper", "id": 1756, "trainId": 178},
    {"name": "arcade, colonnade", "id": 49, "trainId": 179},
    {"name": "hot tub", "id": 1272, "trainId": 180},
    {"name": "helicopter", "id": 1229, "trainId": 181},
    {"name": "tray", "id": 2850, "trainId": 182},
    {"name": "partition, divider", "id": 1784, "trainId": 183},
    {"name": "vineyard", "id": 2962, "trainId": 184},
    {"name": "bowl", "id": 259, "trainId": 185},
    {"name": "bullring", "id": 319, "trainId": 186},
    {"name": "flag", "id": 954, "trainId": 187},
    {"name": "pot", "id": 1974, "trainId": 188},
    {"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189},
    {"name": "shower", "id": 2356, "trainId": 190},
    {"name": "bag, traveling bag, travelling bag, grip, suitcase", "id": 97, "trainId": 191},
    {"name": "bulletin board, notice board", "id": 318, "trainId": 192},
    {"name": "confessional booth", "id": 592, "trainId": 193},
    {"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194},
    {"name": "forest", "id": 1017, "trainId": 195},
    {"name": "elevator door", "id": 851, "trainId": 196},
    {"name": "laptop, laptop computer", "id": 1407, "trainId": 197},
    {"name": "instrument panel", "id": 1332, "trainId": 198},
    {"name": "bucket, pail", "id": 303, "trainId": 199},
    {"name": "tapestry, tapis", "id": 2714, "trainId": 200},
    {"name": "platform", "id": 1924, "trainId": 201},
    {"name": "jacket", "id": 1346, "trainId": 202},
    {"name": "gate", "id": 1081, "trainId": 203},
    {"name": "monitor, monitoring device", "id": 1583, "trainId": 204},
    {
        "name": "telephone booth, phone booth, call box, telephone box, telephone kiosk",
        "id": 2727,
        "trainId": 205,
    },
    {"name": "spotlight, spot", "id": 2509, "trainId": 206},
    {"name": "ring", "id": 2123, "trainId": 207},
    {"name": "control panel", "id": 602, "trainId": 208},
    {"name": "blackboard, chalkboard", "id": 202, "trainId": 209},
    {"name": "air conditioner, air conditioning", "id": 10, "trainId": 210},
    {"name": "chest", "id": 490, "trainId": 211},
    {"name": "clock", "id": 530, "trainId": 212},
    {"name": "sand dune", "id": 2213, "trainId": 213},
    {"name": "pipe, pipage, piping", "id": 1884, "trainId": 214},
    {"name": "vault", "id": 2934, "trainId": 215},
    {"name": "table football", "id": 2687, "trainId": 216},
    {"name": "cannon", "id": 387, "trainId": 217},
    {"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218},
    {"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219},
    {"name": "statue", "id": 2547, "trainId": 220},
    {
        "name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
        "id": 1474,
        "trainId": 221,
    },
    {"name": "exhibitor", "id": 877, "trainId": 222},
    {"name": "ladder", "id": 1391, "trainId": 223},
    {"name": "carport", "id": 414, "trainId": 224},
    {"name": "dam", "id": 698, "trainId": 225},
    {"name": "pulpit", "id": 2019, "trainId": 226},
    {"name": "skylight, fanlight", "id": 2422, "trainId": 227},
    {"name": "water tower", "id": 3010, "trainId": 228},
    {"name": "grill, grille, grillwork", "id": 1139, "trainId": 229},
    {"name": "display board", "id": 753, "trainId": 230},
    {"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231},
    {"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232},
    {"name": "ice rink", "id": 1301, "trainId": 233},
    {"name": "fruit", "id": 1033, "trainId": 234},
    {"name": "patio", "id": 1789, "trainId": 235},
    {"name": "vending machine", "id": 2939, "trainId": 236},
    {"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237},
    {"name": "net", "id": 1652, "trainId": 238},
    {
        "name": "backpack, back pack, knapsack, packsack, rucksack, haversack",
        "id": 90,
        "trainId": 239,
    },
    {"name": "jar", "id": 1349, "trainId": 240},
    {"name": "track", "id": 2830, "trainId": 241},
    {"name": "magazine", "id": 1485, "trainId": 242},
    {"name": "shutter", "id": 2370, "trainId": 243},
    {"name": "roof", "id": 2155, "trainId": 244},
    {"name": "banner, streamer", "id": 118, "trainId": 245},
    {"name": "landfill", "id": 1402, "trainId": 246},
    {"name": "post", "id": 1957, "trainId": 247},
    {"name": "altarpiece, reredos", "id": 3130, "trainId": 248},
    {"name": "hat, chapeau, lid", "id": 1197, "trainId": 249},
    {"name": "arch, archway", "id": 52, "trainId": 250},
    {"name": "table game", "id": 2688, "trainId": 251},
    {"name": "bag, handbag, pocketbook, purse", "id": 96, "trainId": 252},
    {"name": "document, written document, papers", "id": 762, "trainId": 253},
    {"name": "dome", "id": 772, "trainId": 254},
    {"name": "pier", "id": 1857, "trainId": 255},
    {"name": "shanties", "id": 2315, "trainId": 256},
    {"name": "forecourt", "id": 1016, "trainId": 257},
    {"name": "crane", "id": 643, "trainId": 258},
    {"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259},
    {"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260},
    {"name": "drawing", "id": 791, "trainId": 261},
    {"name": "cabin", "id": 349, "trainId": 262},
    {
        "name": "ad, advertisement, advertizement, advertising, advertizing, advert",
        "id": 6,
        "trainId": 263,
    },
    {"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264},
    {"name": "monument", "id": 1587, "trainId": 265},
    {"name": "henhouse", "id": 1233, "trainId": 266},
    {"name": "cockpit", "id": 559, "trainId": 267},
    {"name": "heater, warmer", "id": 1223, "trainId": 268},
    {"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269},
    {"name": "pool", "id": 1943, "trainId": 270},
    {"name": "elevator, lift", "id": 853, "trainId": 271},
    {"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272},
    {"name": "labyrinth", "id": 1390, "trainId": 273},
    {"name": "text, textual matter", "id": 2748, "trainId": 274},
    {"name": "printer", "id": 2007, "trainId": 275},
    {"name": "mezzanine, first balcony", "id": 1546, "trainId": 276},
    {"name": "mattress", "id": 1513, "trainId": 277},
    {"name": "straw", "id": 2600, "trainId": 278},
    {"name": "stalls", "id": 2538, "trainId": 279},
    {"name": "patio, terrace", "id": 1790, "trainId": 280},
    {"name": "billboard, hoarding", "id": 194, "trainId": 281},
    {"name": "bus stop", "id": 326, "trainId": 282},
    {"name": "trouser, pant", "id": 2877, "trainId": 283},
    {"name": "console table, console", "id": 594, "trainId": 284},
    {"name": "rack", "id": 2036, "trainId": 285},
    {"name": "notebook", "id": 1662, "trainId": 286},
    {"name": "shrine", "id": 2366, "trainId": 287},
    {"name": "pantry", "id": 1754, "trainId": 288},
    {"name": "cart", "id": 418, "trainId": 289},
    {"name": "steam shovel", "id": 2553, "trainId": 290},
    {"name": "porch", "id": 1951, "trainId": 291},
    {"name": "postbox, mailbox, letter box", "id": 1963, "trainId": 292},
    {"name": "figurine, statuette", "id": 918, "trainId": 293},
    {"name": "recycling bin", "id": 2086, "trainId": 294},
    {"name": "folding screen", "id": 997, "trainId": 295},
    {"name": "telescope", "id": 2731, "trainId": 296},
    {"name": "deck chair, beach chair", "id": 704, "trainId": 297},
    {"name": "kennel", "id": 1365, "trainId": 298},
    {"name": "coffee maker", "id": 569, "trainId": 299},
    {"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300},
    {"name": "fish", "id": 948, "trainId": 301},
    {"name": "easel", "id": 839, "trainId": 302},
    {"name": "artificial golf green", "id": 63, "trainId": 303},
    {"name": "iceberg", "id": 1305, "trainId": 304},
    {"name": "candlestick, candle holder", "id": 378, "trainId": 305},
    {"name": "shower stall, shower bath", "id": 2362, "trainId": 306},
    {"name": "television stand", "id": 2734, "trainId": 307},
    {
        "name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle",
        "id": 2982,
        "trainId": 308,
    },
    {"name": "skeleton", "id": 2398, "trainId": 309},
    {"name": "grand piano, grand", "id": 1119, "trainId": 310},
    {"name": "candy, confect", "id": 382, "trainId": 311},
    {"name": "grille door", "id": 1141, "trainId": 312},
    {"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313},
    {"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314},
    {"name": "shoe", "id": 2341, "trainId": 315},
    {"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316},
    {"name": "shanty", "id": 2316, "trainId": 317},
    {"name": "structure", "id": 2626, "trainId": 318},
    {"name": "rocking chair, rocker", "id": 3104, "trainId": 319},
    {"name": "bird", "id": 198, "trainId": 320},
    {"name": "place mat", "id": 1896, "trainId": 321},
    {"name": "tomb", "id": 2800, "trainId": 322},
    {"name": "big top", "id": 190, "trainId": 323},
    {"name": "gas pump, gasoline pump, petrol pump, island dispenser", "id": 3131, "trainId": 324},
    {"name": "lockers", "id": 1463, "trainId": 325},
    {"name": "cage", "id": 357, "trainId": 326},
    {"name": "finger", "id": 929, "trainId": 327},
    {"name": "bleachers", "id": 209, "trainId": 328},
    {"name": "ferris wheel", "id": 912, "trainId": 329},
    {"name": "hairdresser chair", "id": 1164, "trainId": 330},
    {"name": "mat", "id": 1509, "trainId": 331},
    {"name": "stands", "id": 2539, "trainId": 332},
    {"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333},
    {"name": "streetcar, tram, tramcar, trolley, trolley car", "id": 2615, "trainId": 334},
    {"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335},
    {"name": "dummy", "id": 818, "trainId": 336},
    {"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337},
    {"name": "sand trap", "id": 2217, "trainId": 338},
    {"name": "shop, store", "id": 2347, "trainId": 339},
    {"name": "table cloth", "id": 2686, "trainId": 340},
    {"name": "service station", "id": 2300, "trainId": 341},
    {"name": "coffin", "id": 572, "trainId": 342},
    {"name": "drawer", "id": 789, "trainId": 343},
    {"name": "cages", "id": 358, "trainId": 344},
    {"name": "slot machine, coin machine", "id": 2443, "trainId": 345},
    {"name": "balcony", "id": 101, "trainId": 346},
    {"name": "volleyball court", "id": 2969, "trainId": 347},
    {"name": "table tennis", "id": 2692, "trainId": 348},
    {"name": "control table", "id": 606, "trainId": 349},
    {"name": "shirt", "id": 2339, "trainId": 350},
    {"name": "merchandise, ware, product", "id": 1533, "trainId": 351},
    {"name": "railway", "id": 2060, "trainId": 352},
    {"name": "parterre", "id": 1782, "trainId": 353},
    {"name": "chimney", "id": 495, "trainId": 354},
    {"name": "can, tin, tin can", "id": 371, "trainId": 355},
    {"name": "tanks", "id": 2707, "trainId": 356},
    {"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357},
    {"name": "alga, algae", "id": 3156, "trainId": 358},
    {"name": "system", "id": 2683, "trainId": 359},
    {"name": "map", "id": 1499, "trainId": 360},
    {"name": "greenhouse", "id": 1135, "trainId": 361},
    {"name": "mug", "id": 1619, "trainId": 362},
    {"name": "barbecue", "id": 125, "trainId": 363},
    {"name": "trailer", "id": 2838, "trainId": 364},
    {"name": "toilet tissue, toilet paper, bathroom tissue", "id": 2792, "trainId": 365},
    {"name": "organ", "id": 1695, "trainId": 366},
    {"name": "dishrag, dishcloth", "id": 746, "trainId": 367},
    {"name": "island", "id": 1343, "trainId": 368},
    {"name": "keyboard", "id": 1370, "trainId": 369},
    {"name": "trench", "id": 2858, "trainId": 370},
    {"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371},
    {"name": "steering wheel, wheel", "id": 2565, "trainId": 372},
    {"name": "pitcher, ewer", "id": 1892, "trainId": 373},
    {"name": "goal", "id": 1103, "trainId": 374},
    {"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375},
    {"name": "beds", "id": 170, "trainId": 376},
    {"name": "wood", "id": 3073, "trainId": 377},
    {"name": "file cabinet", "id": 922, "trainId": 378},
    {"name": "newspaper, paper", "id": 1655, "trainId": 379},
    {"name": "motorboat", "id": 1602, "trainId": 380},
    {"name": "rope", "id": 2160, "trainId": 381},
    {"name": "guitar", "id": 1151, "trainId": 382},
    {"name": "rubble", "id": 2176, "trainId": 383},
    {"name": "scarf", "id": 2239, "trainId": 384},
    {"name": "barrels", "id": 132, "trainId": 385},
    {"name": "cap", "id": 394, "trainId": 386},
    {"name": "leaves", "id": 1424, "trainId": 387},
    {"name": "control tower", "id": 607, "trainId": 388},
    {"name": "dashboard", "id": 700, "trainId": 389},
    {"name": "bandstand", "id": 116, "trainId": 390},
    {"name": "lectern", "id": 1425, "trainId": 391},
    {"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392},
    {"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393},
    {"name": "shower room", "id": 2360, "trainId": 394},
    {"name": "smoke", "id": 2449, "trainId": 395},
    {"name": "faucet, spigot", "id": 897, "trainId": 396},
    {"name": "bulldozer", "id": 317, "trainId": 397},
    {"name": "saucepan", "id": 2228, "trainId": 398},
    {"name": "shops", "id": 2351, "trainId": 399},
    {"name": "meter", "id": 1543, "trainId": 400},
    {"name": "crevasse", "id": 656, "trainId": 401},
    {"name": "gear", "id": 1088, "trainId": 402},
    {"name": "candelabrum, candelabra", "id": 373, "trainId": 403},
    {"name": "sofa bed", "id": 2472, "trainId": 404},
    {"name": "tunnel", "id": 2892, "trainId": 405},
    {"name": "pallet", "id": 1740, "trainId": 406},
    {"name": "wire, conducting wire", "id": 3067, "trainId": 407},
    {"name": "kettle, boiler", "id": 1367, "trainId": 408},
    {"name": "bidet", "id": 188, "trainId": 409},
    {
        "name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher",
        "id": 79,
        "trainId": 410,
    },
    {"name": "music stand", "id": 1633, "trainId": 411},
    {"name": "pipe, tube", "id": 1885, "trainId": 412},
    {"name": "cup", "id": 677, "trainId": 413},
    {"name": "parking meter", "id": 1779, "trainId": 414},
    {"name": "ice hockey rink", "id": 1297, "trainId": 415},
    {"name": "shelter", "id": 2334, "trainId": 416},
    {"name": "weeds", "id": 3027, "trainId": 417},
    {"name": "temple", "id": 2735, "trainId": 418},
    {"name": "patty, cake", "id": 1791, "trainId": 419},
    {"name": "ski slope", "id": 2405, "trainId": 420},
    {"name": "panel", "id": 1748, "trainId": 421},
    {"name": "wallet", "id": 2983, "trainId": 422},
    {"name": "wheel", "id": 3035, "trainId": 423},
    {"name": "towel rack, towel horse", "id": 2824, "trainId": 424},
    {"name": "roundabout", "id": 2168, "trainId": 425},
    {"name": "canister, cannister, tin", "id": 385, "trainId": 426},
    {"name": "rod", "id": 2148, "trainId": 427},
    {"name": "soap dispenser", "id": 2465, "trainId": 428},
    {"name": "bell", "id": 175, "trainId": 429},
    {"name": "canvas", "id": 390, "trainId": 430},
    {"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431},
    {"name": "teacup", "id": 2722, "trainId": 432},
    {"name": "trellis", "id": 2857, "trainId": 433},
    {"name": "workbench", "id": 3088, "trainId": 434},
    {"name": "valley, vale", "id": 2926, "trainId": 435},
    {"name": "toaster", "id": 2782, "trainId": 436},
    {"name": "knife", "id": 1378, "trainId": 437},
    {"name": "podium", "id": 1934, "trainId": 438},
    {"name": "ramp", "id": 2072, "trainId": 439},
    {"name": "tumble dryer", "id": 2889, "trainId": 440},
    {"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441},
    {"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442},
    {"name": "lab bench", "id": 1383, "trainId": 443},
    {"name": "equipment", "id": 867, "trainId": 444},
    {"name": "rocky formation", "id": 2145, "trainId": 445},
    {"name": "plastic", "id": 1915, "trainId": 446},
    {"name": "calendar", "id": 361, "trainId": 447},
    {"name": "caravan", "id": 402, "trainId": 448},
    {"name": "check-in-desk", "id": 482, "trainId": 449},
    {"name": "ticket counter", "id": 2761, "trainId": 450},
    {"name": "brush", "id": 300, "trainId": 451},
    {"name": "mill", "id": 1554, "trainId": 452},
    {"name": "covered bridge", "id": 636, "trainId": 453},
    {"name": "bowling alley", "id": 260, "trainId": 454},
    {"name": "hanger", "id": 1186, "trainId": 455},
    {"name": "excavator", "id": 871, "trainId": 456},
    {"name": "trestle", "id": 2859, "trainId": 457},
    {"name": "revolving door", "id": 2103, "trainId": 458},
    {"name": "blast furnace", "id": 208, "trainId": 459},
    {"name": "scale, weighing machine", "id": 2236, "trainId": 460},
    {"name": "projector", "id": 2012, "trainId": 461},
    {"name": "soap", "id": 2462, "trainId": 462},
    {"name": "locker", "id": 1462, "trainId": 463},
    {"name": "tractor", "id": 2832, "trainId": 464},
    {"name": "stretcher", "id": 2617, "trainId": 465},
    {"name": "frame", "id": 1024, "trainId": 466},
    {"name": "grating", "id": 1129, "trainId": 467},
    {"name": "alembic", "id": 18, "trainId": 468},
    {"name": "candle, taper, wax light", "id": 376, "trainId": 469},
    {"name": "barrier", "id": 134, "trainId": 470},
    {"name": "cardboard", "id": 407, "trainId": 471},
    {"name": "cave", "id": 434, "trainId": 472},
    {"name": "puddle", "id": 2017, "trainId": 473},
    {"name": "tarp", "id": 2717, "trainId": 474},
    {"name": "price tag", "id": 2005, "trainId": 475},
    {"name": "watchtower", "id": 2993, "trainId": 476},
    {"name": "meters", "id": 1545, "trainId": 477},
    {
        "name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb",
        "id": 1445,
        "trainId": 478,
    },
    {"name": "tracks", "id": 2831, "trainId": 479},
    {"name": "hair dryer", "id": 1161, "trainId": 480},
    {"name": "skirt", "id": 2411, "trainId": 481},
    {"name": "viaduct", "id": 2949, "trainId": 482},
    {"name": "paper towel", "id": 1769, "trainId": 483},
    {"name": "coat", "id": 552, "trainId": 484},
    {"name": "sheet", "id": 2327, "trainId": 485},
    {"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486},
    {"name": "water wheel", "id": 3013, "trainId": 487},
    {"name": "pottery, clayware", "id": 1986, "trainId": 488},
    {"name": "magazine rack", "id": 1486, "trainId": 489},
    {"name": "teapot", "id": 2723, "trainId": 490},
    {"name": "microphone, mike", "id": 1549, "trainId": 491},
    {"name": "support", "id": 2649, "trainId": 492},
    {"name": "forklift", "id": 1020, "trainId": 493},
    {"name": "canyon", "id": 392, "trainId": 494},
    {"name": "cash register, register", "id": 422, "trainId": 495},
    {"name": "leaf, leafage, foliage", "id": 1419, "trainId": 496},
    {"name": "remote control, remote", "id": 2099, "trainId": 497},
    {"name": "soap dish", "id": 2464, "trainId": 498},
    {"name": "windshield, windscreen", "id": 3058, "trainId": 499},
    {"name": "cat", "id": 430, "trainId": 500},
    {"name": "cue, cue stick, pool cue, pool stick", "id": 675, "trainId": 501},
    {"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502},
    {"name": "videos", "id": 2955, "trainId": 503},
    {"name": "shovel", "id": 2355, "trainId": 504},
    {"name": "eaves", "id": 840, "trainId": 505},
    {"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506},
    {"name": "shipyard", "id": 2338, "trainId": 507},
    {"name": "hen, biddy", "id": 1232, "trainId": 508},
    {"name": "traffic cone", "id": 2834, "trainId": 509},
    {"name": "washing machines", "id": 2991, "trainId": 510},
    {"name": "truck crane", "id": 2879, "trainId": 511},
    {"name": "cds", "id": 444, "trainId": 512},
    {"name": "niche", "id": 1657, "trainId": 513},
    {"name": "scoreboard", "id": 2246, "trainId": 514},
    {"name": "briefcase", "id": 296, "trainId": 515},
    {"name": "boot", "id": 245, "trainId": 516},
    {"name": "sweater, jumper", "id": 2661, "trainId": 517},
    {"name": "hay", "id": 1202, "trainId": 518},
    {"name": "pack", "id": 1714, "trainId": 519},
    {"name": "bottle rack", "id": 251, "trainId": 520},
    {"name": "glacier", "id": 1095, "trainId": 521},
    {"name": "pergola", "id": 1828, "trainId": 522},
    {"name": "building materials", "id": 311, "trainId": 523},
    {"name": "television camera", "id": 2732, "trainId": 524},
    {"name": "first floor", "id": 947, "trainId": 525},
    {"name": "rifle", "id": 2115, "trainId": 526},
    {"name": "tennis table", "id": 2738, "trainId": 527},
    {"name": "stadium", "id": 2525, "trainId": 528},
    {"name": "safety belt", "id": 2194, "trainId": 529},
    {"name": "cover", "id": 634, "trainId": 530},
    {"name": "dish rack", "id": 740, "trainId": 531},
    {"name": "synthesizer", "id": 2682, "trainId": 532},
    {"name": "pumpkin", "id": 2020, "trainId": 533},
    {"name": "gutter", "id": 1156, "trainId": 534},
    {"name": "fruit stand", "id": 1036, "trainId": 535},
    {"name": "ice floe, floe", "id": 1295, "trainId": 536},
    {"name": "handle, grip, handgrip, hold", "id": 1181, "trainId": 537},
    {"name": "wheelchair", "id": 3037, "trainId": 538},
    {"name": "mousepad, mouse mat", "id": 1614, "trainId": 539},
    {"name": "diploma", "id": 736, "trainId": 540},
    {"name": "fairground ride", "id": 893, "trainId": 541},
    {"name": "radio", "id": 2047, "trainId": 542},
    {"name": "hotplate", "id": 1274, "trainId": 543},
    {"name": "junk", "id": 1361, "trainId": 544},
    {"name": "wheelbarrow", "id": 3036, "trainId": 545},
    {"name": "stream", "id": 2606, "trainId": 546},
    {"name": "toll plaza", "id": 2797, "trainId": 547},
    {"name": "punching bag", "id": 2022, "trainId": 548},
    {"name": "trough", "id": 2876, "trainId": 549},
    {"name": "throne", "id": 2758, "trainId": 550},
    {"name": "chair desk", "id": 472, "trainId": 551},
    {"name": "weighbridge", "id": 3028, "trainId": 552},
    {"name": "extractor fan", "id": 882, "trainId": 553},
    {"name": "hanging clothes", "id": 1189, "trainId": 554},
    {"name": "dish, dish aerial, dish antenna, saucer", "id": 743, "trainId": 555},
    {"name": "alarm clock, alarm", "id": 3122, "trainId": 556},
    {"name": "ski lift", "id": 2401, "trainId": 557},
    {"name": "chain", "id": 468, "trainId": 558},
    {"name": "garage", "id": 1061, "trainId": 559},
    {"name": "mechanical shovel", "id": 1523, "trainId": 560},
    {"name": "wine rack", "id": 3059, "trainId": 561},
    {"name": "tramway", "id": 2843, "trainId": 562},
    {"name": "treadmill", "id": 2853, "trainId": 563},
    {"name": "menu", "id": 1529, "trainId": 564},
    {"name": "block", "id": 214, "trainId": 565},
    {"name": "well", "id": 3032, "trainId": 566},
    {"name": "witness stand", "id": 3071, "trainId": 567},
    {"name": "branch", "id": 277, "trainId": 568},
    {"name": "duck", "id": 813, "trainId": 569},
    {"name": "casserole", "id": 426, "trainId": 570},
    {"name": "frying pan", "id": 1039, "trainId": 571},
    {"name": "desk organizer", "id": 727, "trainId": 572},
    {"name": "mast", "id": 1508, "trainId": 573},
    {"name": "spectacles, specs, eyeglasses, glasses", "id": 2490, "trainId": 574},
    {"name": "service elevator", "id": 2299, "trainId": 575},
    {"name": "dollhouse", "id": 768, "trainId": 576},
    {"name": "hammock", "id": 1172, "trainId": 577},
    {"name": "clothes hanging", "id": 537, "trainId": 578},
    {"name": "photocopier", "id": 1847, "trainId": 579},
    {"name": "notepad", "id": 1664, "trainId": 580},
    {"name": "golf cart", "id": 1110, "trainId": 581},
    {"name": "footpath", "id": 1014, "trainId": 582},
    {"name": "cross", "id": 662, "trainId": 583},
    {"name": "baptismal font", "id": 121, "trainId": 584},
    {"name": "boiler", "id": 227, "trainId": 585},
    {"name": "skip", "id": 2410, "trainId": 586},
    {"name": "rotisserie", "id": 2165, "trainId": 587},
    {"name": "tables", "id": 2696, "trainId": 588},
    {"name": "water mill", "id": 3005, "trainId": 589},
    {"name": "helmet", "id": 1231, "trainId": 590},
    {"name": "cover curtain", "id": 635, "trainId": 591},
    {"name": "brick", "id": 292, "trainId": 592},
    {"name": "table runner", "id": 2690, "trainId": 593},
    {"name": "ashtray", "id": 65, "trainId": 594},
    {"name": "street box", "id": 2607, "trainId": 595},
    {"name": "stick", "id": 2574, "trainId": 596},
    {"name": "hangers", "id": 1188, "trainId": 597},
    {"name": "cells", "id": 456, "trainId": 598},
    {"name": "urinal", "id": 2913, "trainId": 599},
    {"name": "centerpiece", "id": 459, "trainId": 600},
    {"name": "portable fridge", "id": 1955, "trainId": 601},
    {"name": "dvds", "id": 827, "trainId": 602},
    {"name": "golf club", "id": 1111, "trainId": 603},
    {"name": "skirting board", "id": 2412, "trainId": 604},
    {"name": "water cooler", "id": 2997, "trainId": 605},
    {"name": "clipboard", "id": 528, "trainId": 606},
    {"name": "camera, photographic camera", "id": 366, "trainId": 607},
    {"name": "pigeonhole", "id": 1863, "trainId": 608},
    {"name": "chips", "id": 500, "trainId": 609},
    {"name": "food processor", "id": 1001, "trainId": 610},
    {"name": "post box", "id": 1958, "trainId": 611},
    {"name": "lid", "id": 1441, "trainId": 612},
    {"name": "drum", "id": 809, "trainId": 613},
    {"name": "blender", "id": 210, "trainId": 614},
    {"name": "cave entrance", "id": 435, "trainId": 615},
    {"name": "dental chair", "id": 718, "trainId": 616},
    {"name": "obelisk", "id": 1674, "trainId": 617},
    {"name": "canoe", "id": 388, "trainId": 618},
    {"name": "mobile", "id": 1572, "trainId": 619},
    {"name": "monitors", "id": 1584, "trainId": 620},
    {"name": "pool ball", "id": 1944, "trainId": 621},
    {"name": "cue rack", "id": 674, "trainId": 622},
    {"name": "baggage carts", "id": 99, "trainId": 623},
    {"name": "shore", "id": 2352, "trainId": 624},
    {"name": "fork", "id": 1019, "trainId": 625},
    {"name": "paper filer", "id": 1763, "trainId": 626},
    {"name": "bicycle rack", "id": 185, "trainId": 627},
    {"name": "coat rack", "id": 554, "trainId": 628},
    {"name": "garland", "id": 1066, "trainId": 629},
    {"name": "sports bag", "id": 2508, "trainId": 630},
    {"name": "fish tank", "id": 951, "trainId": 631},
    {"name": "towel dispenser", "id": 2822, "trainId": 632},
    {"name": "carriage", "id": 415, "trainId": 633},
    {"name": "brochure", "id": 297, "trainId": 634},
    {"name": "plaque", "id": 1914, "trainId": 635},
    {"name": "stringer", "id": 2619, "trainId": 636},
    {"name": "iron", "id": 1338, "trainId": 637},
    {"name": "spoon", "id": 2505, "trainId": 638},
    {"name": "flag pole", "id": 955, "trainId": 639},
    {"name": "toilet brush", "id": 2786, "trainId": 640},
    {"name": "book stand", "id": 238, "trainId": 641},
    {"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642},
    {"name": "ticket office", "id": 2763, "trainId": 643},
    {"name": "broom", "id": 299, "trainId": 644},
    {"name": "dvd", "id": 822, "trainId": 645},
    {"name": "ice bucket", "id": 1288, "trainId": 646},
    {"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647},
    {"name": "tureen", "id": 2894, "trainId": 648},
    {"name": "folders", "id": 992, "trainId": 649},
    {"name": "chess", "id": 489, "trainId": 650},
    {"name": "root", "id": 2157, "trainId": 651},
    {"name": "sewing machine", "id": 2309, "trainId": 652},
    {"name": "model", "id": 1576, "trainId": 653},
    {"name": "pen", "id": 1810, "trainId": 654},
    {"name": "violin", "id": 2964, "trainId": 655},
    {"name": "sweatshirt", "id": 2662, "trainId": 656},
    {"name": "recycling materials", "id": 2087, "trainId": 657},
    {"name": "mitten", "id": 1569, "trainId": 658},
    {"name": "chopping board, cutting board", "id": 503, "trainId": 659},
    {"name": "mask", "id": 1505, "trainId": 660},
    {"name": "log", "id": 1468, "trainId": 661},
    {"name": "mouse, computer mouse", "id": 1613, "trainId": 662},
    {"name": "grill", "id": 1138, "trainId": 663},
    {"name": "hole", "id": 1256, "trainId": 664},
    {"name": "target", "id": 2715, "trainId": 665},
    {"name": "trash bag", "id": 2846, "trainId": 666},
    {"name": "chalk", "id": 477, "trainId": 667},
    {"name": "sticks", "id": 2576, "trainId": 668},
    {"name": "balloon", "id": 108, "trainId": 669},
    {"name": "score", "id": 2245, "trainId": 670},
    {"name": "hair spray", "id": 1162, "trainId": 671},
    {"name": "roll", "id": 2149, "trainId": 672},
    {"name": "runner", "id": 2183, "trainId": 673},
    {"name": "engine", "id": 858, "trainId": 674},
    {"name": "inflatable glove", "id": 1324, "trainId": 675},
    {"name": "games", "id": 1055, "trainId": 676},
    {"name": "pallets", "id": 1741, "trainId": 677},
    {"name": "baskets", "id": 149, "trainId": 678},
    {"name": "coop", "id": 615, "trainId": 679},
    {"name": "dvd player", "id": 825, "trainId": 680},
    {"name": "rocking horse", "id": 2143, "trainId": 681},
    {"name": "buckets", "id": 304, "trainId": 682},
    {"name": "bread rolls", "id": 283, "trainId": 683},
    {"name": "shawl", "id": 2322, "trainId": 684},
    {"name": "watering can", "id": 3017, "trainId": 685},
    {"name": "spotlights", "id": 2510, "trainId": 686},
    {"name": "post-it", "id": 1960, "trainId": 687},
    {"name": "bowls", "id": 265, "trainId": 688},
    {"name": "security camera", "id": 2282, "trainId": 689},
    {"name": "runner cloth", "id": 2184, "trainId": 690},
    {"name": "lock", "id": 1461, "trainId": 691},
    {"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692},
    {"name": "side", "id": 2372, "trainId": 693},
    {"name": "roulette", "id": 2166, "trainId": 694},
    {"name": "bone", "id": 232, "trainId": 695},
    {"name": "cutlery", "id": 693, "trainId": 696},
    {"name": "pool balls", "id": 1945, "trainId": 697},
    {"name": "wheels", "id": 3039, "trainId": 698},
    {"name": "spice rack", "id": 2494, "trainId": 699},
    {"name": "plant pots", "id": 1908, "trainId": 700},
    {"name": "towel ring", "id": 2827, "trainId": 701},
    {"name": "bread box", "id": 280, "trainId": 702},
    {"name": "video", "id": 2950, "trainId": 703},
    {"name": "funfair", "id": 1044, "trainId": 704},
    {"name": "breads", "id": 288, "trainId": 705},
    {"name": "tripod", "id": 2863, "trainId": 706},
    {"name": "ironing board", "id": 1342, "trainId": 707},
    {"name": "skimmer", "id": 2409, "trainId": 708},
    {"name": "hollow", "id": 1258, "trainId": 709},
    {"name": "scratching post", "id": 2249, "trainId": 710},
    {"name": "tricycle", "id": 2862, "trainId": 711},
    {"name": "file box", "id": 920, "trainId": 712},
    {"name": "mountain pass", "id": 1607, "trainId": 713},
    {"name": "tombstones", "id": 2802, "trainId": 714},
    {"name": "cooker", "id": 610, "trainId": 715},
    {"name": "card game, cards", "id": 3129, "trainId": 716},
    {"name": "golf bag", "id": 1108, "trainId": 717},
    {"name": "towel paper", "id": 2823, "trainId": 718},
    {"name": "chaise lounge", "id": 476, "trainId": 719},
    {"name": "sun", "id": 2641, "trainId": 720},
    {"name": "toilet paper holder", "id": 2788, "trainId": 721},
    {"name": "rake", "id": 2070, "trainId": 722},
    {"name": "key", "id": 1368, "trainId": 723},
    {"name": "umbrella stand", "id": 2903, "trainId": 724},
    {"name": "dartboard", "id": 699, "trainId": 725},
    {"name": "transformer", "id": 2844, "trainId": 726},
    {"name": "fireplace utensils", "id": 942, "trainId": 727},
    {"name": "sweatshirts", "id": 2663, "trainId": 728},
    {
        "name": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
        "id": 457,
        "trainId": 729,
    },
    {"name": "tallboy", "id": 2701, "trainId": 730},
    {"name": "stapler", "id": 2540, "trainId": 731},
    {"name": "sauna", "id": 2231, "trainId": 732},
    {"name": "test tube", "id": 2746, "trainId": 733},
    {"name": "palette", "id": 1738, "trainId": 734},
    {"name": "shopping carts", "id": 2350, "trainId": 735},
    {"name": "tools", "id": 2808, "trainId": 736},
    {"name": "push button, push, button", "id": 2025, "trainId": 737},
    {"name": "star", "id": 2541, "trainId": 738},
    {"name": "roof rack", "id": 2156, "trainId": 739},
    {"name": "barbed wire", "id": 126, "trainId": 740},
    {"name": "spray", "id": 2512, "trainId": 741},
    {"name": "ear", "id": 831, "trainId": 742},
    {"name": "sponge", "id": 2503, "trainId": 743},
    {"name": "racket", "id": 2039, "trainId": 744},
    {"name": "tins", "id": 2774, "trainId": 745},
    {"name": "eyeglasses", "id": 886, "trainId": 746},
    {"name": "file", "id": 919, "trainId": 747},
    {"name": "scarfs", "id": 2240, "trainId": 748},
    {"name": "sugar bowl", "id": 2636, "trainId": 749},
    {"name": "flip flop", "id": 963, "trainId": 750},
    {"name": "headstones", "id": 1218, "trainId": 751},
    {"name": "laptop bag", "id": 1406, "trainId": 752},
    {"name": "leash", "id": 1420, "trainId": 753},
    {"name": "climbing frame", "id": 526, "trainId": 754},
    {"name": "suit hanger", "id": 2639, "trainId": 755},
    {"name": "floor spotlight", "id": 975, "trainId": 756},
    {"name": "plate rack", "id": 1921, "trainId": 757},
    {"name": "sewer", "id": 2305, "trainId": 758},
    {"name": "hard drive", "id": 1193, "trainId": 759},
    {"name": "sprinkler", "id": 2517, "trainId": 760},
    {"name": "tools box", "id": 2809, "trainId": 761},
    {"name": "necklace", "id": 1647, "trainId": 762},
    {"name": "bulbs", "id": 314, "trainId": 763},
    {"name": "steel industry", "id": 2560, "trainId": 764},
    {"name": "club", "id": 545, "trainId": 765},
    {"name": "jack", "id": 1345, "trainId": 766},
    {"name": "door bars", "id": 775, "trainId": 767},
    {
        "name": "control panel, instrument panel, control board, board, panel",
        "id": 603,
        "trainId": 768,
    },
    {"name": "hairbrush", "id": 1163, "trainId": 769},
    {"name": "napkin holder", "id": 1641, "trainId": 770},
    {"name": "office", "id": 1678, "trainId": 771},
    {"name": "smoke detector", "id": 2450, "trainId": 772},
    {"name": "utensils", "id": 2915, "trainId": 773},
    {"name": "apron", "id": 42, "trainId": 774},
    {"name": "scissors", "id": 2242, "trainId": 775},
    {"name": "terminal", "id": 2741, "trainId": 776},
    {"name": "grinder", "id": 1143, "trainId": 777},
    {"name": "entry phone", "id": 862, "trainId": 778},
    {"name": "newspaper stand", "id": 1654, "trainId": 779},
    {"name": "pepper shaker", "id": 1826, "trainId": 780},
    {"name": "onions", "id": 1689, "trainId": 781},
    {
        "name": "central processing unit, cpu, c p u , central processor, processor, mainframe",
        "id": 3124,
        "trainId": 782,
    },
    {"name": "tape", "id": 2710, "trainId": 783},
    {"name": "bat", "id": 152, "trainId": 784},
    {"name": "coaster", "id": 549, "trainId": 785},
    {"name": "calculator", "id": 360, "trainId": 786},
    {"name": "potatoes", "id": 1982, "trainId": 787},
    {"name": "luggage rack", "id": 1478, "trainId": 788},
    {"name": "salt", "id": 2203, "trainId": 789},
    {"name": "street number", "id": 2612, "trainId": 790},
    {"name": "viewpoint", "id": 2956, "trainId": 791},
    {"name": "sword", "id": 2681, "trainId": 792},
    {"name": "cd", "id": 437, "trainId": 793},
    {"name": "rowing machine", "id": 2171, "trainId": 794},
    {"name": "plug", "id": 1933, "trainId": 795},
    {"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796},
    {"name": "pepper", "id": 1824, "trainId": 797},
    {"name": "tongs", "id": 2803, "trainId": 798},
    {"name": "bonfire", "id": 234, "trainId": 799},
    {"name": "dog dish", "id": 764, "trainId": 800},
    {"name": "belt", "id": 177, "trainId": 801},
    {"name": "dumbbells", "id": 817, "trainId": 802},
    {"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803},
    {"name": "hook", "id": 1262, "trainId": 804},
    {"name": "envelopes", "id": 864, "trainId": 805},
    {"name": "shower faucet", "id": 2359, "trainId": 806},
    {"name": "watch", "id": 2992, "trainId": 807},
    {"name": "padlock", "id": 1725, "trainId": 808},
    {"name": "swimming pool ladder", "id": 2667, "trainId": 809},
    {"name": "spanners", "id": 2484, "trainId": 810},
    {"name": "gravy boat", "id": 1133, "trainId": 811},
    {"name": "notice board", "id": 1667, "trainId": 812},
    {"name": "trash bags", "id": 2847, "trainId": 813},
    {"name": "fire alarm", "id": 932, "trainId": 814},
    {"name": "ladle", "id": 1392, "trainId": 815},
    {"name": "stethoscope", "id": 2573, "trainId": 816},
    {"name": "rocket", "id": 2140, "trainId": 817},
    {"name": "funnel", "id": 1046, "trainId": 818},
    {"name": "bowling pins", "id": 264, "trainId": 819},
    {"name": "valve", "id": 2927, "trainId": 820},
    {"name": "thermometer", "id": 2752, "trainId": 821},
    {"name": "cups", "id": 679, "trainId": 822},
    {"name": "spice jar", "id": 2493, "trainId": 823},
    {"name": "night light", "id": 1658, "trainId": 824},
    {"name": "soaps", "id": 2466, "trainId": 825},
    {"name": "games table", "id": 1057, "trainId": 826},
    {"name": "slotted spoon", "id": 2444, "trainId": 827},
    {"name": "reel", "id": 2093, "trainId": 828},
    {"name": "scourer", "id": 2248, "trainId": 829},
    {"name": "sleeping robe", "id": 2432, "trainId": 830},
    {"name": "desk mat", "id": 726, "trainId": 831},
    {"name": "dumbbell", "id": 816, "trainId": 832},
    {"name": "hammer", "id": 1171, "trainId": 833},
    {"name": "tie", "id": 2766, "trainId": 834},
    {"name": "typewriter", "id": 2900, "trainId": 835},
    {"name": "shaker", "id": 2313, "trainId": 836},
    {"name": "cheese dish", "id": 488, "trainId": 837},
    {"name": "sea star", "id": 2265, "trainId": 838},
    {"name": "racquet", "id": 2043, "trainId": 839},
    {"name": "butane gas cylinder", "id": 332, "trainId": 840},
    {"name": "paper weight", "id": 1771, "trainId": 841},
    {"name": "shaving brush", "id": 2320, "trainId": 842},
    {"name": "sunglasses", "id": 2646, "trainId": 843},
    {"name": "gear shift", "id": 1089, "trainId": 844},
    {"name": "towel rail", "id": 2826, "trainId": 845},
    {"name": "adding machine, totalizer, totaliser", "id": 3148, "trainId": 846},
]


def _get_ade20k_full_meta():
    """Build detectron2 metadata for the full (847-class) ADE20K benchmark.

    Returns:
        dict with:
            "stuff_dataset_id_to_contiguous_id": maps each (non-contiguous)
                dataset category id to a contiguous train id in [0, 846].
            "stuff_classes": category names, ordered by contiguous id.
    """
    # NOTE: id 0 is reserved for ignore_label; pre-processing remaps the
    # ignore label from 0 to 255, so all dataset ids are shifted by 1.
    stuff_dataset_id_to_contiguous_id = {}
    stuff_classes = []
    for contiguous_id, cat in enumerate(ADE20K_SEM_SEG_FULL_CATEGORIES):
        stuff_dataset_id_to_contiguous_id[cat["id"]] = contiguous_id
        stuff_classes.append(cat["name"])
    assert len(stuff_classes) == 847, len(stuff_classes)

    return {
        "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id,
        "stuff_classes": stuff_classes,
    }


def register_all_ade20k_full(root):
    """Register the ADE20K-full train/val semantic-segmentation splits.

    Args:
        root: dataset root directory; the ``ADE20K_2021_17_01`` folder is
            expected directly underneath it.
    """
    root = os.path.join(root, "ADE20K_2021_17_01")
    meta = _get_ade20k_full_meta()
    for split, dirname in (("train", "training"), ("val", "validation")):
        image_dir = os.path.join(root, "images_detectron2", dirname)
        gt_dir = os.path.join(root, "annotations_detectron2", dirname)
        dataset_name = f"ade20k_full_sem_seg_{split}"
        # Bind image_dir/gt_dir through default arguments so each lambda
        # keeps its own split's paths instead of the loop's final values.
        DatasetCatalog.register(
            dataset_name,
            lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="tif", image_ext="jpg"),
        )
        MetadataCatalog.get(dataset_name).set(
            stuff_classes=meta["stuff_classes"][:],
            image_root=image_dir,
            sem_seg_root=gt_dir,
            evaluator_type="sem_seg",
            ignore_label=65535,  # NOTE: gt is saved in 16-bit TIFF images
        )


# Register the datasets at import time, rooted at $DETECTRON2_DATASETS
# (defaults to "./datasets").
_root = os.getenv("DETECTRON2_DATASETS", "datasets")
register_all_ade20k_full(_root)


================================================
FILE: mask2former/data/datasets/register_ade20k_instance.py
================================================
import json
import logging
import numpy as np
import os
from PIL import Image

from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets.coco import load_coco_json, register_coco_instances
from detectron2.utils.file_io import PathManager

# The 100 "thing" categories of the ADE20K instance-segmentation benchmark;
# "id" values are the (non-contiguous) category ids used in the annotations.
ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name':
'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}]


# Maps dataset name -> (image folder, COCO-format annotation json),
# both relative to the dataset root passed to register_all_ade20k_instance.
_PREDEFINED_SPLITS = {
    "ade20k_instance_train": (
        "ADEChallengeData2016/images/training",
        "ADEChallengeData2016/ade20k_instance_train.json",
    ),
    "ade20k_instance_val": (
        "ADEChallengeData2016/images/validation",
        "ADEChallengeData2016/ade20k_instance_val.json",
    ),
}


def _get_ade_instances_meta():
    """Build detectron2 metadata for ADE20K instance segmentation.

    Returns:
        dict with:
            "thing_dataset_id_to_contiguous_id": maps each non-contiguous
                ADE category id to a contiguous id in [0, 99].
            "thing_classes": the 100 class names, ordered by contiguous id.
    """
    thing_dataset_id_to_contiguous_id = {}
    thing_classes = []
    for contiguous_id, cat in enumerate(ADE_CATEGORIES):
        thing_dataset_id_to_contiguous_id[cat["id"]] = contiguous_id
        thing_classes.append(cat["name"])
    assert len(thing_classes) == 100, len(thing_classes)
    return {
        "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
        "thing_classes": thing_classes,
    }


def register_all_ade20k_instance(root):
    """Register every predefined ADE20k instance split with detectron2.

    Args:
        root (str): dataset root directory; the relative paths stored in
            ``_PREDEFINED_SPLITS`` are joined onto it. Paths that contain
            "://" (URI-style) are left untouched.
    """
    for name, (image_dir, json_path) in _PREDEFINED_SPLITS.items():
        # Assume pre-defined datasets live in `./datasets`.
        json_file = json_path if "://" in json_path else os.path.join(root, json_path)
        register_coco_instances(
            name,
            _get_ade_instances_meta(),
            json_file,
            os.path.join(root, image_dir),
        )


# Register the ADE20k instance splits at import time, rooted at the
# $DETECTRON2_DATASETS environment variable (default: "datasets").
_root = os.getenv("DETECTRON2_DATASETS", "datasets")
register_all_ade20k_instance(_root)


================================================
FILE: mask2former/data/datasets/register_ade20k_panoptic.py
================================================
import json
import os

from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.file_io import PathManager

# The 150 ADE20k-SceneParsing categories used for panoptic segmentation.
# Each entry carries the visualization color, the dataset category id, and
# an "isthing" flag (1 = countable instance category, 0 = stuff category).
ADE20K_150_CATEGORIES = [
    {"color": [120, 120, 120], "id": 0, "isthing": 0, "name": "wall"},
    {"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"},
    {"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"},
    {"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"},
    {"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"},
    {"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"},
    {"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"},
    {"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"},
    {"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "},
    {"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"},
    {"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"},
    {"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"},
    {"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"},
    {"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"},
    {"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"},
    {"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"},
    {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"},
    {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"},
    {"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"},
    {"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"},
    {"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"},
    {"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"},
    {"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"},
    {"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"},
    {"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"},
    {"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"},
    {"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"},
    {"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"},
    {"color": [255, 9, 92], "id": 28, "isthing": 0, "name": "rug"},
    {"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"},
    {"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"},
    {"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"},
    {"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"},
    {"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"},
    {"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"},
    {"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"},
    {"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"},
    {"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"},
    {"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"},
    {"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"},
    {"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"},
    {"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"},
    {"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"},
    {"color": [255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"},
    {
        "color": [6, 51, 255],
        "id": 44,
        "isthing": 1,
        "name": "chest of drawers, chest, bureau, dresser",
    },
    {"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"},
    {"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"},
    {"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"},
    {"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"},
    {"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"},
    {"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"},
    {"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"},
    {"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"},
    {"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"},
    {"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"},
    {"color": [0, 0, 255], "id": 55, "isthing": 1, "name": "case, display case, showcase, vitrine"},
    {
        "color": [255, 71, 0],
        "id": 56,
        "isthing": 1,
        "name": "pool table, billiard table, snooker table",
    },
    {"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"},
    {"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"},
    {"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"},
    {"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"},
    {"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"},
    {"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"},
    {"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"},
    {"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"},
    {
        "color": [0, 255, 133],
        "id": 65,
        "isthing": 1,
        "name": "toilet, can, commode, crapper, pot, potty, stool, throne",
    },
    {"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"},
    {"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"},
    {"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"},
    {"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"},
    {"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"},
    {"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"},
    {"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"},
    {"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"},
    {"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"},
    {"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"},
    {"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"},
    {"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"},
    {"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"},
    {"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"},
    {"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"},
    {"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"},
    {"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"},
    {"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"},
    {"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"},
    {"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"},
    {"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"},
    {"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"},
    {"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"},
    {"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"},
    {"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"},
    {"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"},
    {"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"},
    {"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"},
    {"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"},
    {
        "color": [0, 122, 255],
        "id": 95,
        "isthing": 1,
        "name": "bannister, banister, balustrade, balusters, handrail",
    },
    {
        "color": [0, 255, 163],
        "id": 96,
        "isthing": 0,
        "name": "escalator, moving staircase, moving stairway",
    },
    {
        "color": [255, 153, 0],
        "id": 97,
        "isthing": 1,
        "name": "ottoman, pouf, pouffe, puff, hassock",
    },
    {"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"},
    {"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"},
    {
        "color": [143, 255, 0],
        "id": 100,
        "isthing": 0,
        "name": "poster, posting, placard, notice, bill, card",
    },
    {"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"},
    {"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"},
    {"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"},
    {"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"},
    {
        "color": [133, 0, 255],
        "id": 105,
        "isthing": 0,
        "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
    },
    {"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"},
    {
        "color": [184, 0, 255],
        "id": 107,
        "isthing": 1,
        "name": "washer, automatic washer, washing machine",
    },
    {"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"},
    {"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"},
    {"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"},
    {"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"},
    {"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"},
    {"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"},
    {"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"},
    {"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"},
    {"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"},
    {"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"},
    {"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"},
    {"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"},
    {"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"},
    {"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"},
    {"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"},
    {"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"},
    {"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"},
    {"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"},
    {"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"},
    {"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"},
    {"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"},
    {"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"},
    {"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"},
    {"color": [20, 0, 255], "id": 131, "isthing": 0, "name": "blanket, cover"},
    {"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"},
    {"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"},
    {"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"},
    {"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"},
    {"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"},
    {"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"},
    {"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"},
    {"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"},
    {"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"},
    {"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"},
    {"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"},
    {"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"},
    {"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"},
    {"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"},
    {"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"},
    {"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"},
    {"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"},
    {"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"},
]

# Visualization palette for the 150 ADE20k categories, in category order.
ADE20k_COLORS = [cat["color"] for cat in ADE20K_150_CATEGORIES]

# Attach the palette to the semantic-segmentation splits. Each split gets its
# own shallow copy ([:]) so mutating one split's colors cannot affect the other.
for _split_name in ("ade20k_sem_seg_train", "ade20k_sem_seg_val"):
    MetadataCatalog.get(_split_name).set(
        stuff_colors=ADE20k_COLORS[:],
    )


def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
    """Load ADE20k panoptic annotations into detectron2's standard dict format.

    Args:
        json_file (str): path to the COCO-style panoptic json file.
        image_dir (str): directory containing the raw images.
        gt_dir (str): directory containing the panoptic annotation images.
        semseg_dir (str): directory containing the semantic segmentation maps.
        meta (dict): must provide "thing_dataset_id_to_contiguous_id" and
            "stuff_dataset_id_to_contiguous_id" id mappings.
    Returns:
        list[dict]: a list of dicts in Detectron2 standard format. (See
        `Using Custom Datasets </tutorials/datasets.html>`_ )
    """

    def _remap(segment_info, meta):
        # Replace the dataset category id with the contiguous training id and
        # tag the segment as thing or stuff. Mutates segment_info in place.
        cat_id = segment_info["category_id"]
        thing_map = meta["thing_dataset_id_to_contiguous_id"]
        if cat_id in thing_map:
            segment_info["category_id"] = thing_map[cat_id]
            segment_info["isthing"] = True
        else:
            segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][cat_id]
            segment_info["isthing"] = False
        return segment_info

    with PathManager.open(json_file) as f:
        json_info = json.load(f)

    dataset_dicts = []
    for ann in json_info["annotations"]:
        # TODO: currently we assume image and label has the same filename but
        # different extension, and images have extension ".jpg" for COCO. Need
        # to make image extension a user-provided argument if we extend this
        # function to support other COCO-like datasets.
        stem = os.path.splitext(ann["file_name"])[0]
        dataset_dicts.append(
            {
                "file_name": os.path.join(image_dir, stem + ".jpg"),
                "image_id": ann["image_id"],
                "pan_seg_file_name": os.path.join(gt_dir, ann["file_name"]),
                "sem_seg_file_name": os.path.join(semseg_dir, ann["file_name"]),
                "segments_info": [_remap(x, meta) for x in ann["segments_info"]],
            }
        )

    assert len(dataset_dicts), f"No images found in {image_dir}!"
    # Spot-check the first record so missing/misplaced data fails fast.
    first = dataset_dicts[0]
    assert PathManager.isfile(first["file_name"]), first["file_name"]
    assert PathManager.isfile(first["pan_seg_file_name"]), first["pan_seg_file_name"]
    assert PathManager.isfile(first["sem_seg_file_name"]), first["sem_seg_file_name"]
    return dataset_dicts


def register_ade20k_panoptic(
    name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None
):
    """
    Register a "standard" version of ADE20k panoptic segmentation dataset named `name`.
    The dictionaries in this registered dataset follows detectron2's standard format.
    Hence it's called "standard".
    Args:
        name (str): the name that identifies a dataset,
            e.g. "ade20k_panoptic_train"
        metadata (dict): extra metadata associated with this dataset.
        image_root (str): directory which contains all the images
        panoptic_root (str): directory which contains panoptic annotation images in COCO format
        panoptic_json (str): path to the json panoptic annotation file in COCO format
        sem_seg_root (none): not used, to be consistent with
            `register_coco_panoptic_separated`.
        instances_json (str): path to the json instance annotation file
    """
    panoptic_name = name
    DatasetCatalog.register(
        panoptic_name,
        lambda: load_ade20k_panoptic_json(
            panoptic_json, image_root, panoptic_root, semantic_root, metadata
        ),
    )
    MetadataCatalog.get(panoptic_name).set(
        panop
Download .txt
gitextract_tlc1nw96/

├── DATASET_prepare.md
├── LICENSE
├── README.md
├── configs/
│   ├── coco/
│   │   └── instance-segmentation/
│   │       ├── Base-COCO-InstanceSegmentation.yaml
│   │       └── maskformer2_R50_bs16_50ep.yaml
│   └── youtubevis_2019/
│       ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml
│       ├── Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml
│       ├── Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml
│       ├── swin/
│       │   └── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml
│       ├── video_maskformer2_R101_bs16_8ep.yaml
│       ├── video_maskformer2_R50_bs16_8ep.yaml
│       └── video_maskformer2_R50_bs16_8ep_swin.yaml
├── demo/
│   ├── README.md
│   ├── demo.py
│   └── predictor.py
├── demo_video/
│   ├── README.md
│   ├── demo.py
│   ├── predictor.py
│   └── visualizer.py
├── mask2former/
│   ├── __init__.py
│   ├── config.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── dataset_mappers/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   ├── coco_instance_new_baseline_dataset_mapper.py
│   │   │   ├── coco_panoptic_new_baseline_dataset_mapper.py
│   │   │   ├── mask_former_instance_dataset_mapper.py
│   │   │   ├── mask_former_panoptic_dataset_mapper.py
│   │   │   └── mask_former_semantic_dataset_mapper.py
│   │   └── datasets/
│   │       ├── __init__.py
│   │       ├── register_ade20k_full.py
│   │       ├── register_ade20k_instance.py
│   │       ├── register_ade20k_panoptic.py
│   │       ├── register_coco_panoptic_annos_semseg.py
│   │       ├── register_coco_stuff_10k.py
│   │       ├── register_mapillary_vistas.py
│   │       └── register_mapillary_vistas_panoptic.py
│   ├── evaluation/
│   │   ├── __init__.py
│   │   ├── __init__.py.new
│   │   └── instance_evaluation.py
│   ├── maskformer_model.py
│   ├── modeling/
│   │   ├── __init__.py
│   │   ├── backbone/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   └── swin.py
│   │   ├── criterion.py
│   │   ├── matcher.py
│   │   ├── meta_arch/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   ├── mask_former_head.py
│   │   │   └── per_pixel_baseline.py
│   │   ├── pixel_decoder/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   ├── fpn.py
│   │   │   ├── msdeformattn.py
│   │   │   └── ops/
│   │   │       ├── functions/
│   │   │       │   ├── __init__.py
│   │   │       │   └── ms_deform_attn_func.py
│   │   │       ├── make.sh
│   │   │       ├── modules/
│   │   │       │   ├── __init__.py
│   │   │       │   └── ms_deform_attn.py
│   │   │       ├── setup.py
│   │   │       ├── src/
│   │   │       │   ├── cpu/
│   │   │       │   │   ├── ms_deform_attn_cpu.cpp
│   │   │       │   │   └── ms_deform_attn_cpu.h
│   │   │       │   ├── cuda/
│   │   │       │   │   ├── ms_deform_attn_cuda.cu
│   │   │       │   │   ├── ms_deform_attn_cuda.h
│   │   │       │   │   └── ms_deform_im2col_cuda.cuh
│   │   │       │   ├── ms_deform_attn.h
│   │   │       │   └── vision.cpp
│   │   │       └── test.py
│   │   └── transformer_decoder/
│   │       ├── __init__.py
│   │       ├── mask2former_transformer_decoder.py
│   │       ├── maskformer_transformer_decoder.py
│   │       ├── position_encoding.py
│   │       └── transformer.py
│   ├── test_time_augmentation.py
│   └── utils/
│       ├── __init__.py
│       ├── __init__.py.new
│       └── misc.py
├── mask2former_video/
│   ├── __init__.py
│   ├── config.py
│   ├── data_video/
│   │   ├── __init__.py
│   │   ├── augmentation.py
│   │   ├── build.py
│   │   ├── combined_loader.py
│   │   ├── dataset_mapper.py
│   │   ├── datasets/
│   │   │   ├── __init__.py
│   │   │   ├── builtin.py
│   │   │   ├── ytvis.py
│   │   │   └── ytvis_api/
│   │   │       ├── __init__.py
│   │   │       ├── ytvos.py
│   │   │       └── ytvoseval.py
│   │   └── ytvis_eval.py
│   ├── modeling/
│   │   ├── __init__.py
│   │   ├── criterion.py
│   │   ├── matcher.py
│   │   └── transformer_decoder/
│   │       ├── __init__.py
│   │       ├── position_encoding.py
│   │       └── video_mask2former_transformer_decoder.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── __init__.py.new
│   │   └── memory.py
│   └── video_maskformer_model.py
├── mfvis_nococo/
│   ├── __init__.py
│   ├── configs/
│   │   └── youtubevis_2019/
│   │       ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml
│   │       ├── video_maskformer2_R101_bs16_8ep_coco.yaml
│   │       ├── video_maskformer2_R50_bs16_8ep.yaml
│   │       └── video_maskformer2_R50_bs16_8ep_coco.yaml
│   ├── mask2former/
│   │   ├── __init__.py
│   │   ├── config.py
│   │   ├── data/
│   │   │   ├── __init__.py
│   │   │   ├── dataset_mappers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   ├── coco_instance_new_baseline_dataset_mapper.py
│   │   │   │   ├── coco_panoptic_new_baseline_dataset_mapper.py
│   │   │   │   ├── mask_former_instance_dataset_mapper.py
│   │   │   │   ├── mask_former_panoptic_dataset_mapper.py
│   │   │   │   └── mask_former_semantic_dataset_mapper.py
│   │   │   └── datasets/
│   │   │       ├── __init__.py
│   │   │       ├── register_ade20k_full.py
│   │   │       ├── register_ade20k_instance.py
│   │   │       ├── register_ade20k_panoptic.py
│   │   │       ├── register_coco_panoptic_annos_semseg.py
│   │   │       ├── register_coco_stuff_10k.py
│   │   │       ├── register_mapillary_vistas.py
│   │   │       └── register_mapillary_vistas_panoptic.py
│   │   ├── evaluation/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   └── instance_evaluation.py
│   │   ├── maskformer_model.py
│   │   ├── modeling/
│   │   │   ├── __init__.py
│   │   │   ├── backbone/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   └── swin.py
│   │   │   ├── criterion.py
│   │   │   ├── matcher.py
│   │   │   ├── meta_arch/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   ├── mask_former_head.py
│   │   │   │   └── per_pixel_baseline.py
│   │   │   ├── pixel_decoder/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   ├── fpn.py
│   │   │   │   ├── msdeformattn.py
│   │   │   │   └── ops/
│   │   │   │       ├── functions/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   └── ms_deform_attn_func.py
│   │   │   │       ├── make.sh
│   │   │   │       ├── modules/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   └── ms_deform_attn.py
│   │   │   │       ├── setup.py
│   │   │   │       ├── src/
│   │   │   │       │   ├── cpu/
│   │   │   │       │   │   ├── ms_deform_attn_cpu.cpp
│   │   │   │       │   │   └── ms_deform_attn_cpu.h
│   │   │   │       │   ├── cuda/
│   │   │   │       │   │   ├── ms_deform_attn_cuda.cu
│   │   │   │       │   │   ├── ms_deform_attn_cuda.h
│   │   │   │       │   │   └── ms_deform_im2col_cuda.cuh
│   │   │   │       │   ├── ms_deform_attn.h
│   │   │   │       │   └── vision.cpp
│   │   │   │       └── test.py
│   │   │   └── transformer_decoder/
│   │   │       ├── __init__.py
│   │   │       ├── mask2former_transformer_decoder.py
│   │   │       ├── maskformer_transformer_decoder.py
│   │   │       ├── position_encoding.py
│   │   │       └── transformer.py
│   │   ├── test_time_augmentation.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── __init__.py.new
│   │       └── misc.py
│   ├── mask2former_video/
│   │   ├── __init__.py
│   │   ├── config.py
│   │   ├── data_video/
│   │   │   ├── __init__.py
│   │   │   ├── augmentation.py
│   │   │   ├── build.py
│   │   │   ├── dataset_mapper.py
│   │   │   ├── datasets/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── builtin.py
│   │   │   │   ├── ytvis.py
│   │   │   │   └── ytvis_api/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── ytvos.py
│   │   │   │       └── ytvoseval.py
│   │   │   └── ytvis_eval.py
│   │   ├── modeling/
│   │   │   ├── __init__.py
│   │   │   ├── criterion.py
│   │   │   ├── matcher.py
│   │   │   └── transformer_decoder/
│   │   │       ├── __init__.py
│   │   │       ├── position_encoding.py
│   │   │       └── video_mask2former_transformer_decoder.py
│   │   ├── utils/
│   │   │   ├── __init__.py
│   │   │   └── memory.py
│   │   └── video_maskformer_model.py
│   ├── scripts/
│   │   ├── eval_8gpu_mask2former_r101_video.sh
│   │   ├── train_8gpu_mask2former_r101_video_coco.sh
│   │   ├── train_8gpu_mask2former_r50_video.sh
│   │   ├── train_8gpu_mask2former_r50_video_coco.sh
│   │   ├── visual_video_r101.sh
│   │   └── visual_video_r50.sh
│   └── train_net_video.py
├── requirements.txt
├── scripts/
│   ├── eval_8gpu_mask2former_r101_video.sh
│   ├── eval_8gpu_mask2former_r50_video.sh
│   ├── eval_8gpu_mask2former_swinl_video.sh
│   ├── train_8gpu_mask2former_r101_video.sh
│   ├── train_8gpu_mask2former_r50_video.sh
│   ├── train_8gpu_mask2former_swinl_video.sh
│   └── visual_video.sh
├── tools/
│   ├── README.md
│   ├── analyze_model.py
│   ├── convert-pretrained-swin-model-to-d2.py
│   ├── convert-torchvision-to-d2.py
│   ├── evaluate_coco_boundary_ap.py
│   └── evaluate_pq_for_semantic_segmentation.py
├── train_net.py
├── train_net_video.py
└── util/
    ├── __init__.py
    ├── box_ops.py
    ├── misc.py
    └── plot_utils.py
Download .txt
SYMBOL INDEX (1000 symbols across 115 files)

FILE: demo/demo.py
  function setup_cfg (line 25) | def setup_cfg(args):
  function get_parser (line 35) | def get_parser():
  function test_opencv_video_format (line 70) | def test_opencv_video_format(codec, file_ext):

FILE: demo/predictor.py
  class VisualizationDemo (line 15) | class VisualizationDemo(object):
    method __init__ (line 16) | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
    method run_on_image (line 37) | def run_on_image(self, image, conf_thre):
    method _frame_from_video (line 84) | def _frame_from_video(self, video):
    method run_on_video (line 92) | def run_on_video(self, video):
  class AsyncPredictor (line 138) | class AsyncPredictor:
    class _StopToken (line 144) | class _StopToken:
    class _PredictWorker (line 147) | class _PredictWorker(mp.Process):
      method __init__ (line 148) | def __init__(self, cfg, task_queue, result_queue):
      method run (line 153) | def run(self):
    method __init__ (line 163) | def __init__(self, cfg, num_gpus: int = 1):
    method put (line 188) | def put(self, image):
    method get (line 192) | def get(self):
    method __len__ (line 207) | def __len__(self):
    method __call__ (line 210) | def __call__(self, image):
    method shutdown (line 214) | def shutdown(self):
    method default_buffer_size (line 219) | def default_buffer_size(self):

FILE: demo_video/demo.py
  function setup_cfg (line 28) | def setup_cfg(args):
  function get_parser (line 38) | def get_parser():
  function test_opencv_video_format (line 78) | def test_opencv_video_format(codec, file_ext):

FILE: demo_video/predictor.py
  class VisualizationDemo (line 15) | class VisualizationDemo(object):
    method __init__ (line 16) | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
    method run_on_video (line 36) | def run_on_video(self, frames, conf_thre):
  class VideoPredictor (line 70) | class VideoPredictor(DefaultPredictor):
    method __call__ (line 90) | def __call__(self, frames):
  class AsyncPredictor (line 114) | class AsyncPredictor:
    class _StopToken (line 120) | class _StopToken:
    class _PredictWorker (line 122) | class _PredictWorker(mp.Process):
      method __init__ (line 123) | def __init__(self, cfg, task_queue, result_queue):
      method run (line 128) | def run(self):
    method __init__ (line 138) | def __init__(self, cfg, num_gpus: int = 1):
    method put (line 163) | def put(self, image):
    method get (line 167) | def get(self):
    method __len__ (line 182) | def __len__(self):
    method __call__ (line 185) | def __call__(self, image):
    method shutdown (line 189) | def shutdown(self):
    method default_buffer_size (line 194) | def default_buffer_size(self):

FILE: demo_video/visualizer.py
  class TrackVisualizer (line 9) | class TrackVisualizer(Visualizer):
    method __init__ (line 10) | def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=Co...
    method _jitter (line 15) | def _jitter(self, color, id):
    method overlay_instances (line 32) | def overlay_instances(
    method draw_instance_predictions (line 173) | def draw_instance_predictions(self, predictions):

FILE: mask2former/config.py
  function add_maskformer2_config (line 5) | def add_maskformer2_config(cfg):

FILE: mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py
  function masks_to_boxes (line 18) | def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
  function convert_coco_poly_to_mask (line 47) | def convert_coco_poly_to_mask(segmentations, height, width):
  function build_transform_gen (line 66) | def build_transform_gen(cfg, is_train):
  class COCOInstanceNewBaselineDatasetMapper (line 99) | class COCOInstanceNewBaselineDatasetMapper:
    method __init__ (line 115) | def __init__(
    method from_config (line 139) | def from_config(cls, cfg, is_train=True):
    method __call__ (line 150) | def __call__(self, dataset_dict):

FILE: mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py
  function build_transform_gen (line 17) | def build_transform_gen(cfg, is_train):
  class COCOPanopticNewBaselineDatasetMapper (line 50) | class COCOPanopticNewBaselineDatasetMapper:
    method __init__ (line 66) | def __init__(
    method from_config (line 93) | def from_config(cls, cfg, is_train=True):
    method __call__ (line 104) | def __call__(self, dataset_dict):

FILE: mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py
  class MaskFormerInstanceDatasetMapper (line 18) | class MaskFormerInstanceDatasetMapper:
    method __init__ (line 32) | def __init__(
    method from_config (line 58) | def from_config(cls, cfg, is_train=True):
    method __call__ (line 86) | def __call__(self, dataset_dict):

FILE: mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py
  class MaskFormerPanopticDatasetMapper (line 18) | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
    method __init__ (line 32) | def __init__(
    method __call__ (line 58) | def __call__(self, dataset_dict):

FILE: mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py
  class MaskFormerSemanticDatasetMapper (line 18) | class MaskFormerSemanticDatasetMapper:
    method __init__ (line 32) | def __init__(
    method from_config (line 61) | def from_config(cls, cfg, is_train=True):
    method __call__ (line 97) | def __call__(self, dataset_dict):

FILE: mask2former/data/datasets/register_ade20k_full.py
  function _get_ade20k_full_meta (line 925) | def _get_ade20k_full_meta():
  function register_all_ade20k_full (line 943) | def register_all_ade20k_full(root):

FILE: mask2former/data/datasets/register_ade20k_instance.py
  function _get_ade_instances_meta (line 27) | def _get_ade_instances_meta():
  function register_all_ade20k_instance (line 40) | def register_all_ade20k_instance(root):

FILE: mask2former/data/datasets/register_ade20k_panoptic.py
  function load_ade20k_panoptic_json (line 216) | def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, ...
  function register_ade20k_panoptic (line 270) | def register_ade20k_panoptic(
  function get_metadata (line 325) | def get_metadata():
  function register_all_ade20k_panoptic (line 369) | def register_all_ade20k_panoptic(root):

FILE: mask2former/data/datasets/register_coco_panoptic_annos_semseg.py
  function get_metadata (line 30) | def get_metadata():
  function load_coco_panoptic_json (line 74) | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, me...
  function register_coco_panoptic_annos_sem_seg (line 128) | def register_coco_panoptic_annos_sem_seg(
  function register_all_coco_panoptic_annos_sem_seg (line 159) | def register_all_coco_panoptic_annos_sem_seg(root):

FILE: mask2former/data/datasets/register_coco_stuff_10k.py
  function _get_coco_stuff_meta (line 181) | def _get_coco_stuff_meta():
  function register_all_coco_stuff_10k (line 199) | def register_all_coco_stuff_10k(root):

FILE: mask2former/data/datasets/register_mapillary_vistas.py
  function _get_mapillary_vistas_meta (line 472) | def _get_mapillary_vistas_meta():
  function register_all_mapillary_vistas (line 486) | def register_all_mapillary_vistas(root):

FILE: mask2former/data/datasets/register_mapillary_vistas_panoptic.py
  function load_mapillary_vistas_panoptic_json (line 337) | def load_mapillary_vistas_panoptic_json(json_file, image_dir, gt_dir, se...
  function register_mapillary_vistas_panoptic (line 391) | def register_mapillary_vistas_panoptic(
  function get_metadata (line 444) | def get_metadata():
  function register_all_mapillary_vistas_panoptic (line 488) | def register_all_mapillary_vistas_panoptic(root):

FILE: mask2former/evaluation/instance_evaluation.py
  class InstanceSegEvaluator (line 29) | class InstanceSegEvaluator(COCOEvaluator):
    method _eval_predictions (line 42) | def _eval_predictions(self, predictions, img_ids=None):

FILE: mask2former/maskformer_model.py
  function unfold_wo_center (line 22) | def unfold_wo_center(x, kernel_size, dilation):
  function get_images_color_similarity (line 47) | def get_images_color_similarity(images, kernel_size, dilation):
  class MaskFormer (line 62) | class MaskFormer(nn.Module):
    method __init__ (line 68) | def __init__(
    method from_config (line 138) | def from_config(cls, cfg):
    method device (line 205) | def device(self):
    method forward (line 208) | def forward(self, batched_inputs):
    method prepare_targets (line 352) | def prepare_targets(self, targets, images):
    method semantic_inference (line 368) | def semantic_inference(self, mask_cls, mask_pred):
    method panoptic_inference (line 374) | def panoptic_inference(self, mask_cls, mask_pred):
    method instance_inference (line 432) | def instance_inference(self, mask_cls, mask_pred):

FILE: mask2former/modeling/backbone/swin.py
  class Mlp (line 20) | class Mlp(nn.Module):
    method __init__ (line 23) | def __init__(
    method forward (line 34) | def forward(self, x):
  function window_partition (line 43) | def window_partition(x, window_size):
  function window_reverse (line 57) | def window_reverse(windows, window_size, H, W):
  class WindowAttention (line 73) | class WindowAttention(nn.Module):
    method __init__ (line 86) | def __init__(
    method forward (line 130) | def forward(self, x, mask=None):
  class SwinTransformerBlock (line 173) | class SwinTransformerBlock(nn.Module):
    method __init__ (line 190) | def __init__(
    method forward (line 234) | def forward(self, x, mask_matrix):
  class PatchMerging (line 297) | class PatchMerging(nn.Module):
    method __init__ (line 304) | def __init__(self, dim, norm_layer=nn.LayerNorm):
    method forward (line 310) | def forward(self, x, H, W):
  class BasicLayer (line 339) | class BasicLayer(nn.Module):
    method __init__ (line 357) | def __init__(
    method forward (line 405) | def forward(self, x, H, W):
  class PatchEmbed (line 455) | class PatchEmbed(nn.Module):
    method __init__ (line 464) | def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=...
    method forward (line 478) | def forward(self, x):
  class SwinTransformer (line 497) | class SwinTransformer(nn.Module):
    method __init__ (line 525) | def __init__(
    method _freeze_stages (line 617) | def _freeze_stages(self):
    method init_weights (line 634) | def init_weights(self, pretrained=None):
    method forward (line 650) | def forward(self, x):
    method train (line 679) | def train(self, mode=True):
  class D2SwinTransformer (line 686) | class D2SwinTransformer(SwinTransformer, Backbone):
    method __init__ (line 687) | def __init__(self, cfg, input_shape):
    method forward (line 742) | def forward(self, x):
    method output_shape (line 759) | def output_shape(self):
    method size_divisibility (line 768) | def size_divisibility(self):

FILE: mask2former/modeling/criterion.py
  function unfold_wo_center (line 19) | def unfold_wo_center(x, kernel_size, dilation):
  function compute_pairwise_term (line 44) | def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation):
  function get_incoherent_mask (line 73) | def get_incoherent_mask(input_masks, sfact):
  function dice_coefficient (line 84) | def dice_coefficient(x, target):
  function compute_project_term (line 94) | def compute_project_term(mask_scores, gt_bitmasks):
  function dice_loss (line 105) | def dice_loss(
  function sigmoid_ce_loss (line 132) | def sigmoid_ce_loss(
  function calculate_uncertainty (line 157) | def calculate_uncertainty(logits):
  class SetCriterion (line 174) | class SetCriterion(nn.Module):
    method __init__ (line 181) | def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses,
    method loss_labels (line 210) | def loss_labels(self, outputs, targets, indices, num_masks):
    method loss_masks_proj (line 229) | def loss_masks_proj(self, outputs, targets, indices, num_masks, images...
    method loss_masks (line 285) | def loss_masks(self, outputs, targets, indices, num_masks):
    method _get_src_permutation_idx (line 337) | def _get_src_permutation_idx(self, indices):
    method _get_tgt_permutation_idx (line 343) | def _get_tgt_permutation_idx(self, indices):
    method get_loss (line 349) | def get_loss(self, loss, outputs, targets, indices, num_masks, images_...
    method forward (line 360) | def forward(self, outputs, targets, images_lab_sim):
    method __repr__ (line 397) | def __repr__(self):

FILE: mask2former/modeling/matcher.py
  function batch_dice_loss (line 14) | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
  function batch_sigmoid_ce_loss (line 37) | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
  function masks_to_boxes (line 69) | def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
  function masks_to_boxes_cc (line 98) | def masks_to_boxes_cc(masks: torch.Tensor) -> torch.Tensor:
  class HungarianMatcher (line 134) | class HungarianMatcher(nn.Module):
    method __init__ (line 142) | def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_d...
    method memory_efficient_forward (line 163) | def memory_efficient_forward(self, outputs, targets):
    method forward (line 212) | def forward(self, outputs, targets):
    method __repr__ (line 234) | def __repr__(self, _repr_indent=4):

FILE: mask2former/modeling/meta_arch/mask_former_head.py
  class MaskFormerHead (line 18) | class MaskFormerHead(nn.Module):
    method _load_from_state_dict (line 22) | def _load_from_state_dict(
    method __init__ (line 49) | def __init__(
    method from_config (line 89) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward (line 116) | def forward(self, features, mask=None):
    method layers (line 119) | def layers(self, features, mask=None):

FILE: mask2former/modeling/meta_arch/per_pixel_baseline.py
  class PerPixelBaselineHead (line 17) | class PerPixelBaselineHead(nn.Module):
    method _load_from_state_dict (line 21) | def _load_from_state_dict(
    method __init__ (line 47) | def __init__(
    method from_config (line 82) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward (line 93) | def forward(self, features, targets=None):
    method layers (line 108) | def layers(self, features):
    method losses (line 113) | def losses(self, predictions, targets):
  class PerPixelBaselinePlusHead (line 126) | class PerPixelBaselinePlusHead(PerPixelBaselineHead):
    method _load_from_state_dict (line 127) | def _load_from_state_dict(
    method __init__ (line 152) | def __init__(
    method from_config (line 194) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward (line 207) | def forward(self, features, targets=None):
    method layers (line 230) | def layers(self, features):

FILE: mask2former/modeling/pixel_decoder/fpn.py
  function build_pixel_decoder (line 20) | def build_pixel_decoder(cfg, input_shape):
  class BasePixelDecoder (line 37) | class BasePixelDecoder(nn.Module):
    method __init__ (line 39) | def __init__(
    method from_config (line 125) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward_features (line 135) | def forward_features(self, features):
    method forward (line 155) | def forward(self, features, targets=None):
  class TransformerEncoderOnly (line 161) | class TransformerEncoderOnly(nn.Module):
    method __init__ (line 162) | def __init__(
    method _reset_parameters (line 185) | def _reset_parameters(self):
    method forward (line 190) | def forward(self, src, mask, pos_embed):
  class TransformerEncoderPixelDecoder (line 204) | class TransformerEncoderPixelDecoder(BasePixelDecoder):
    method __init__ (line 206) | def __init__(
    method from_config (line 272) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward_features (line 283) | def forward_features(self, features):
    method forward (line 308) | def forward(self, features, targets=None):

FILE: mask2former/modeling/pixel_decoder/msdeformattn.py
  class MSDeformAttnTransformerEncoderOnly (line 22) | class MSDeformAttnTransformerEncoderOnly(nn.Module):
    method __init__ (line 23) | def __init__(self, d_model=256, nhead=8,
    method _reset_parameters (line 42) | def _reset_parameters(self):
    method get_valid_ratio (line 51) | def get_valid_ratio(self, mask):
    method forward (line 60) | def forward(self, srcs, pos_embeds):
  class MSDeformAttnTransformerEncoderLayer (line 91) | class MSDeformAttnTransformerEncoderLayer(nn.Module):
    method __init__ (line 92) | def __init__(self,
    method with_pos_embed (line 112) | def with_pos_embed(tensor, pos):
    method forward_ffn (line 115) | def forward_ffn(self, src):
    method forward (line 121) | def forward(self, src, pos, reference_points, spatial_shapes, level_st...
  class MSDeformAttnTransformerEncoder (line 133) | class MSDeformAttnTransformerEncoder(nn.Module):
    method __init__ (line 134) | def __init__(self, encoder_layer, num_layers):
    method get_reference_points (line 140) | def get_reference_points(spatial_shapes, valid_ratios, device):
    method forward (line 154) | def forward(self, src, spatial_shapes, level_start_index, valid_ratios...
  class MSDeformAttnPixelDecoder (line 164) | class MSDeformAttnPixelDecoder(nn.Module):
    method __init__ (line 166) | def __init__(
    method from_config (line 294) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward_features (line 314) | def forward_features(self, features):

FILE: mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py
  class MSDeformAttnFunction (line 31) | class MSDeformAttnFunction(Function):
    method forward (line 33) | def forward(ctx, value, value_spatial_shapes, value_level_start_index,...
    method backward (line 42) | def backward(ctx, grad_output):
  function ms_deform_attn_core_pytorch (line 51) | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_lo...

FILE: mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py
  function _is_power_of_2 (line 27) | def _is_power_of_2(n):
  class MSDeformAttn (line 33) | class MSDeformAttn(nn.Module):
    method __init__ (line 34) | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
    method _reset_parameters (line 65) | def _reset_parameters(self):
    method forward (line 81) | def forward(self, query, reference_points, input_flatten, input_spatia...

FILE: mask2former/modeling/pixel_decoder/ops/setup.py
  function get_extensions (line 25) | def get_extensions():

FILE: mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp
  function ms_deform_attn_cpu_forward (line 22) | at::Tensor
  function ms_deform_attn_cpu_backward (line 34) | std::vector<at::Tensor>

FILE: mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h
  function im2col_step (line 32) | int im2col_step)

FILE: mask2former/modeling/pixel_decoder/ops/src/vision.cpp
  function PYBIND11_MODULE (line 18) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: mask2former/modeling/pixel_decoder/ops/test.py
  function check_forward_equal_with_pytorch_double (line 34) | def check_forward_equal_with_pytorch_double():
  function check_forward_equal_with_pytorch_float (line 50) | def check_forward_equal_with_pytorch_float():
  function check_gradient_numerical (line 65) | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_...

FILE: mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py
  class SelfAttentionLayer (line 16) | class SelfAttentionLayer(nn.Module):
    method __init__ (line 18) | def __init__(self, d_model, nhead, dropout=0.0,
    method _reset_parameters (line 31) | def _reset_parameters(self):
    method with_pos_embed (line 36) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 39) | def forward_post(self, tgt,
    method forward_pre (line 51) | def forward_pre(self, tgt,
    method forward (line 63) | def forward(self, tgt,
  class CrossAttentionLayer (line 74) | class CrossAttentionLayer(nn.Module):
    method __init__ (line 76) | def __init__(self, d_model, nhead, dropout=0.0,
    method _reset_parameters (line 89) | def _reset_parameters(self):
    method with_pos_embed (line 94) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 97) | def forward_post(self, tgt, memory,
    method forward_pre (line 111) | def forward_pre(self, tgt, memory,
    method forward (line 125) | def forward(self, tgt, memory,
  class FFNLayer (line 137) | class FFNLayer(nn.Module):
    method __init__ (line 139) | def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
    method _reset_parameters (line 154) | def _reset_parameters(self):
    method with_pos_embed (line 159) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 162) | def forward_post(self, tgt):
    method forward_pre (line 168) | def forward_pre(self, tgt):
    method forward (line 174) | def forward(self, tgt):
  function _get_activation_fn (line 180) | def _get_activation_fn(activation):
  class MLP (line 191) | class MLP(nn.Module):
    method __init__ (line 194) | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
    method forward (line 200) | def forward(self, x):
  class MultiScaleMaskedTransformerDecoder (line 207) | class MultiScaleMaskedTransformerDecoder(nn.Module):
    method _load_from_state_dict (line 211) | def _load_from_state_dict(
    method __init__ (line 235) | def __init__(
    method from_config (line 336) | def from_config(cls, cfg, in_channels, mask_classification):
    method forward (line 362) | def forward(self, x, mask_features, mask = None):
    method forward_prediction_heads (line 434) | def forward_prediction_heads(self, output, mask_features, attn_mask_ta...
    method _set_aux_loss (line 452) | def _set_aux_loss(self, outputs_class, outputs_seg_masks):

FILE: mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py
  function build_transformer_decoder (line 21) | def build_transformer_decoder(cfg, in_channels, mask_classification=True):
  class StandardTransformerDecoder (line 30) | class StandardTransformerDecoder(nn.Module):
    method __init__ (line 32) | def __init__(
    method from_config (line 107) | def from_config(cls, cfg, in_channels, mask_classification):
    method forward (line 129) | def forward(self, x, mask_features, mask=None):
    method _set_aux_loss (line 160) | def _set_aux_loss(self, outputs_class, outputs_seg_masks):
  class MLP (line 173) | class MLP(nn.Module):
    method __init__ (line 176) | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
    method forward (line 184) | def forward(self, x):

FILE: mask2former/modeling/transformer_decoder/position_encoding.py
  class PositionEmbeddingSine (line 11) | class PositionEmbeddingSine(nn.Module):
    method __init__ (line 17) | def __init__(self, num_pos_feats=64, temperature=10000, normalize=Fals...
    method forward (line 28) | def forward(self, x, mask=None):
    method __repr__ (line 53) | def __repr__(self, _repr_indent=4):

FILE: mask2former/modeling/transformer_decoder/transformer.py
  class Transformer (line 18) | class Transformer(nn.Module):
    method __init__ (line 19) | def __init__(
    method _reset_parameters (line 55) | def _reset_parameters(self):
    method forward (line 60) | def forward(self, src, mask, query_embed, pos_embed):
  class TransformerEncoder (line 77) | class TransformerEncoder(nn.Module):
    method __init__ (line 78) | def __init__(self, encoder_layer, num_layers, norm=None):
    method forward (line 84) | def forward(
  class TransformerDecoder (line 104) | class TransformerDecoder(nn.Module):
    method __init__ (line 105) | def __init__(self, decoder_layer, num_layers, norm=None, return_interm...
    method forward (line 112) | def forward(
  class TransformerEncoderLayer (line 153) | class TransformerEncoderLayer(nn.Module):
    method __init__ (line 154) | def __init__(
    method with_pos_embed (line 178) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 181) | def forward_post(
    method forward_pre (line 199) | def forward_pre(
    method forward (line 217) | def forward(
  class TransformerDecoderLayer (line 229) | class TransformerDecoderLayer(nn.Module):
    method __init__ (line 230) | def __init__(
    method with_pos_embed (line 257) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 260) | def forward_post(
    method forward_pre (line 291) | def forward_pre(
    method forward (line 322) | def forward(
  function _get_clones (line 356) | def _get_clones(module, N):
  function _get_activation_fn (line 360) | def _get_activation_fn(activation):

FILE: mask2former/test_time_augmentation.py
  class SemanticSegmentorWithTTA (line 20) | class SemanticSegmentorWithTTA(nn.Module):
    method __init__ (line 26) | def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
    method __call__ (line 48) | def __call__(self, batched_inputs):
    method _inference_one_image (line 70) | def _inference_one_image(self, input):
    method _get_augmented_inputs (line 99) | def _get_augmented_inputs(self, input):

FILE: mask2former/utils/misc.py
  function _max_by_axis (line 15) | def _max_by_axis(the_list):
  class NestedTensor (line 24) | class NestedTensor(object):
    method __init__ (line 25) | def __init__(self, tensors, mask: Optional[Tensor]):
    method to (line 29) | def to(self, device):
    method decompose (line 40) | def decompose(self):
    method __repr__ (line 43) | def __repr__(self):
  function nested_tensor_from_tensor_list (line 47) | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
  function _onnx_nested_tensor_from_tensor_list (line 75) | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> N...
  function is_dist_avail_and_initialized (line 105) | def is_dist_avail_and_initialized():

FILE: mask2former_video/config.py
  function add_maskformer2_video_config (line 5) | def add_maskformer2_video_config(cfg):

FILE: mask2former_video/data_video/augmentation.py
  class RandomApplyClip (line 15) | class RandomApplyClip(T.Augmentation):
    method __init__ (line 20) | def __init__(self, tfm_or_aug, prob=0.5, clip_frame_cnt=1):
    method get_transform (line 36) | def get_transform(self, *args):
    method __call__ (line 47) | def __call__(self, aug_input):
  class RandomRotationClip (line 59) | class RandomRotationClip(T.Augmentation):
    method __init__ (line 65) | def __init__(self, angle, prob=0.5, expand=True, center=None, interp=N...
    method get_transform (line 90) | def get_transform(self, image):
  class ResizeShortestEdge (line 122) | class ResizeShortestEdge(T.Augmentation):
    method __init__ (line 128) | def __init__(
    method get_transform (line 153) | def get_transform(self, image):
  class RandomFlip (line 181) | class RandomFlip(T.Augmentation):
    method __init__ (line 186) | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_...
    method get_transform (line 203) | def get_transform(self, image):
  class RandomCropClip (line 219) | class RandomCropClip(T.Augmentation):
    method __init__ (line 224) | def __init__(self, crop_type: str, crop_size, clip_frame_cnt=1):
    method get_transform (line 246) | def get_transform(self, image):
    method get_crop_size (line 277) | def get_crop_size(self, image_size):
  class FixedSizeCropClip (line 303) | class FixedSizeCropClip(T.Augmentation):
    method __init__ (line 311) | def __init__(self, crop_size: Tuple[int], pad: bool = True, pad_value:...
    method _get_crop (line 322) | def _get_crop(self, image: np.ndarray):
    method _get_pad (line 342) | def _get_pad(self, image: np.ndarray):
    method get_transform (line 355) | def get_transform(self, image: np.ndarray):
  class ResizeShortestEdgeClip (line 361) | class ResizeShortestEdgeClip(T.Augmentation):
    method __init__ (line 367) | def __init__(
    method get_transform (line 392) | def get_transform(self, image):
  class RandomFlipClip (line 420) | class RandomFlipClip(T.Augmentation):
    method __init__ (line 425) | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_...
    method get_transform (line 442) | def get_transform(self, image):
  function build_augmentation (line 459) | def build_augmentation(cfg, is_train):
  function build_pseudo_augmentation (line 516) | def build_pseudo_augmentation(cfg, is_train):

FILE: mask2former_video/data_video/build.py
  function _compute_num_images_per_worker (line 22) | def _compute_num_images_per_worker(cfg: CfgNode):
  function filter_images_with_only_crowd_annotations (line 39) | def filter_images_with_only_crowd_annotations(dataset_dicts, dataset_nam...
  function get_detection_dataset_dicts (line 75) | def get_detection_dataset_dicts(
  function _train_loader_from_config (line 115) | def _train_loader_from_config(cfg, mapper, dataset_name=None, *, dataset...
  function build_detection_train_loader (line 145) | def build_detection_train_loader(
  function build_combined_loader (line 193) | def build_combined_loader(cfg: CfgNode, loaders: Collection[Loader], rat...
  function _test_loader_from_config (line 197) | def _test_loader_from_config(cfg, dataset_name, mapper=None):
  function build_detection_test_loader (line 217) | def build_detection_test_loader(dataset, *, mapper, num_workers=0):

FILE: mask2former_video/data_video/combined_loader.py
  function _pooled_next (line 8) | def _pooled_next(iterator: Iterator[Any], pool: Deque[Any]):
  class CombinedDataLoader (line 14) | class CombinedDataLoader:
    method __init__ (line 21) | def __init__(self, loaders: Collection[Loader], batch_size: int, ratio...
    method __iter__ (line 26) | def __iter__(self) -> Iterator[List[Any]]:

FILE: mask2former_video/data_video/dataset_mapper.py
  function seed_everything (line 30) | def seed_everything(seed):
  function filter_empty_instances (line 36) | def filter_empty_instances(instances, by_box=True, by_mask=True, box_thr...
  function _get_dummy_anno (line 67) | def _get_dummy_anno():
  function ytvis_annotations_to_instances (line 77) | def ytvis_annotations_to_instances(annos, image_size):
  function convert_coco_poly_to_mask (line 122) | def convert_coco_poly_to_mask(segmentations, height, width):
  class YTVISDatasetMapper (line 138) | class YTVISDatasetMapper:
    method __init__ (line 145) | def __init__(
    method from_config (line 209) | def from_config(cls, cfg, is_train: bool = True, is_tgt: bool = True):
    method __call__ (line 231) | def __call__(self, dataset_dict):
  class CocoClipDatasetMapper (line 334) | class CocoClipDatasetMapper:
    method __init__ (line 341) | def __init__(
    method from_config (line 392) | def from_config(cls, cfg, is_train: bool = True, is_tgt: bool = True):
    method __call__ (line 414) | def __call__(self, dataset_dict):

FILE: mask2former_video/data_video/datasets/builtin.py
  function register_all_ytvis_2019 (line 41) | def register_all_ytvis_2019(root):
  function register_all_ytvis_2021 (line 52) | def register_all_ytvis_2021(root):
  function register_all_coco (line 62) | def register_all_coco(root):

FILE: mask2former_video/data_video/datasets/ytvis.py
  function _get_ytvis_2019_instances_meta (line 120) | def _get_ytvis_2019_instances_meta():
  function _get_ytvis_2021_instances_meta (line 135) | def _get_ytvis_2021_instances_meta():
  function load_ytvis_json (line 150) | def load_ytvis_json(json_file, image_root, dataset_name=None, extra_anno...
  function register_ytvis_instances (line 276) | def register_ytvis_instances(name, metadata, json_file, image_root):
  function extract_frame_dic (line 323) | def extract_frame_dic(dic, frame_idx):

FILE: mask2former_video/data_video/datasets/ytvis_api/ytvos.py
  function _isArrayLike (line 43) | def _isArrayLike(obj):
  class YTVOS (line 47) | class YTVOS:
    method __init__ (line 48) | def __init__(self, annotation_file=None):
    method createIndex (line 67) | def createIndex(self):
    method info (line 98) | def info(self):
    method getAnnIds (line 106) | def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None):
    method getCatIds (line 134) | def getCatIds(self, catNms=[], supNms=[], catIds=[]):
    method getVidIds (line 156) | def getVidIds(self, vidIds=[], catIds=[]):
    method loadAnns (line 177) | def loadAnns(self, ids=[]):
    method loadCats (line 188) | def loadCats(self, ids=[]):
    method loadVids (line 199) | def loadVids(self, ids=[]):
    method loadRes (line 211) | def loadRes(self, resFile):
    method annToRLE (line 261) | def annToRLE(self, ann, frameId):
    method annToMask (line 282) | def annToMask(self, ann, frameId):

FILE: mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py
  class YTVOSeval (line 12) | class YTVOSeval:
    method __init__ (line 62) | def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
    method _prepare (line 87) | def _prepare(self):
    method evaluate (line 131) | def evaluate(self):
    method computeIoU (line 175) | def computeIoU(self, vidId, catId):
    method computeOks (line 223) | def computeOks(self, imgId, catId):
    method evaluateVid (line 266) | def evaluateVid(self, vidId, catId, aRng, maxDet):
    method accumulate (line 346) | def accumulate(self, p = None):
    method summarize (line 453) | def summarize(self):
    method __str__ (line 526) | def __str__(self):
  class Params (line 529) | class Params:
    method setDetParams (line 533) | def setDetParams(self):
    method setKpParams (line 546) | def setKpParams(self):
    method __init__ (line 557) | def __init__(self, iouType='segm'):

FILE: mask2former_video/data_video/ytvis_eval.py
  class YTVISEvaluator (line 26) | class YTVISEvaluator(DatasetEvaluator):
    method __init__ (line 37) | def __init__(
    method reset (line 99) | def reset(self):
    method process (line 102) | def process(self, inputs, outputs):
    method evaluate (line 114) | def evaluate(self):
    method _eval_predictions (line 144) | def _eval_predictions(self, predictions):
    method _derive_coco_results (line 192) | def _derive_coco_results(self, coco_eval, class_names=None):
  function instances_to_coco_json_video (line 255) | def instances_to_coco_json_video(inputs, outputs):
  function _evaluate_predictions_on_coco (line 295) | def _evaluate_predictions_on_coco(

FILE: mask2former_video/modeling/criterion.py
  function unfold_wo_center (line 20) | def unfold_wo_center(x, kernel_size, dilation):
  function unfold_w_center (line 45) | def unfold_w_center(x, kernel_size, dilation):
  function compute_pairwise_term (line 64) | def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation):
  function compute_pairwise_term_neighbor (line 93) | def compute_pairwise_term_neighbor(mask_logits, mask_logits_neighbor, pa...
  function dice_coefficient (line 126) | def dice_coefficient(x, target):
  function compute_project_term (line 136) | def compute_project_term(mask_scores, gt_bitmasks):
  function dice_loss (line 147) | def dice_loss(
  function sigmoid_ce_loss (line 174) | def sigmoid_ce_loss(
  function visualize_masks (line 198) | def visualize_masks(masks, output_dir='masks'):
  function calculate_uncertainty (line 215) | def calculate_uncertainty(logits):
  class VideoSetCriterion (line 232) | class VideoSetCriterion(nn.Module):
    method __init__ (line 239) | def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses,
    method loss_labels (line 267) | def loss_labels(self, outputs, targets, indices, num_masks):
    method loss_masks (line 285) | def loss_masks(self, outputs, targets, indices, num_masks):
    method topk_mask (line 338) | def topk_mask(self, images_lab_sim):
    method loss_masks_proj (line 344) | def loss_masks_proj(self, outputs, targets, indices, num_masks, images...
    method _get_src_permutation_idx (line 431) | def _get_src_permutation_idx(self, indices):
    method _get_tgt_permutation_idx (line 437) | def _get_tgt_permutation_idx(self, indices):
    method get_loss (line 443) | def get_loss(self, loss, outputs, targets, indices, num_masks, images_...
    method forward (line 454) | def forward(self, outputs, targets, images_lab_sim, images_lab_sim_nei...
    method __repr__ (line 491) | def __repr__(self):

FILE: mask2former_video/modeling/matcher.py
  function masks_to_boxes (line 31) | def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
  function masks_to_boxes_new (line 60) | def masks_to_boxes_new(masks: torch.Tensor) -> torch.Tensor:
  function batch_dice_loss (line 104) | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
  function batch_dice_loss_nosig (line 121) | def batch_dice_loss_nosig(inputs: torch.Tensor, targets: torch.Tensor):
  function batch_sigmoid_ce_loss (line 146) | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
  function batch_sigmoid_ce_loss_nosig (line 171) | def batch_sigmoid_ce_loss_nosig(inputs: torch.Tensor, targets: torch.Ten...
  class VideoHungarianMatcher (line 205) | class VideoHungarianMatcher(nn.Module):
    method __init__ (line 213) | def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_d...
    method memory_efficient_forward (line 231) | def memory_efficient_forward(self, outputs, targets):
    method forward (line 311) | def forward(self, outputs, targets):
    method __repr__ (line 333) | def __repr__(self, _repr_indent=4):

FILE: mask2former_video/modeling/transformer_decoder/position_encoding.py
  class PositionEmbeddingSine3D (line 11) | class PositionEmbeddingSine3D(nn.Module):
    method __init__ (line 17) | def __init__(self, num_pos_feats=64, temperature=10000, normalize=Fals...
    method forward (line 28) | def forward(self, x, mask=None):

FILE: mask2former_video/modeling/transformer_decoder/video_mask2former_transformer_decoder.py
  class SelfAttentionLayer (line 17) | class SelfAttentionLayer(nn.Module):
    method __init__ (line 19) | def __init__(self, d_model, nhead, dropout=0.0,
    method _reset_parameters (line 32) | def _reset_parameters(self):
    method with_pos_embed (line 37) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 40) | def forward_post(self, tgt,
    method forward_pre (line 52) | def forward_pre(self, tgt,
    method forward (line 64) | def forward(self, tgt,
  class CrossAttentionLayer (line 75) | class CrossAttentionLayer(nn.Module):
    method __init__ (line 77) | def __init__(self, d_model, nhead, dropout=0.0,
    method _reset_parameters (line 90) | def _reset_parameters(self):
    method with_pos_embed (line 95) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 98) | def forward_post(self, tgt, memory,
    method forward_pre (line 112) | def forward_pre(self, tgt, memory,
    method forward (line 126) | def forward(self, tgt, memory,
  class FFNLayer (line 138) | class FFNLayer(nn.Module):
    method __init__ (line 140) | def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
    method _reset_parameters (line 155) | def _reset_parameters(self):
    method with_pos_embed (line 160) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 163) | def forward_post(self, tgt):
    method forward_pre (line 169) | def forward_pre(self, tgt):
    method forward (line 175) | def forward(self, tgt):
  function _get_activation_fn (line 181) | def _get_activation_fn(activation):
  class MLP (line 192) | class MLP(nn.Module):
    method __init__ (line 195) | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
    method forward (line 201) | def forward(self, x):
  class VideoMultiScaleMaskedTransformerDecoder (line 208) | class VideoMultiScaleMaskedTransformerDecoder(nn.Module):
    method _load_from_state_dict (line 212) | def _load_from_state_dict(
    method __init__ (line 236) | def __init__(
    method from_config (line 341) | def from_config(cls, cfg, in_channels, mask_classification):
    method forward (line 369) | def forward(self, x, mask_features, mask = None):
    method forward_prediction_heads (line 447) | def forward_prediction_heads(self, output, mask_features, attn_mask_ta...
    method _set_aux_loss (line 467) | def _set_aux_loss(self, outputs_class, outputs_seg_masks):

FILE: mask2former_video/utils/memory.py
  function _ignore_torch_cuda_oom (line 12) | def _ignore_torch_cuda_oom():
  function retry_if_cuda_oom (line 26) | def retry_if_cuda_oom(func):

FILE: mask2former_video/video_maskformer_model.py
  function unfold_wo_center (line 24) | def unfold_wo_center(x, kernel_size, dilation):
  function unfold_w_center (line 49) | def unfold_w_center(x, kernel_size, dilation):
  function get_images_color_similarity (line 68) | def get_images_color_similarity(images, kernel_size, dilation):
  function get_neighbor_images_color_similarity (line 80) | def get_neighbor_images_color_similarity(images, images_neighbor, kernel...
  function get_neighbor_images_patch_color_similarity (line 94) | def get_neighbor_images_patch_color_similarity(images, images_neighbor, ...
  class VideoMaskFormer (line 114) | class VideoMaskFormer(nn.Module):
    method __init__ (line 120) | def __init__(
    method from_config (line 181) | def from_config(cls, cfg):
    method device (line 241) | def device(self):
    method forward (line 243) | def forward(self, batched_inputs):
    method prepare_targets (line 333) | def prepare_targets(self, targets, images, is_coco):
    method inference_video (line 380) | def inference_video(self, pred_cls, pred_masks, img_size, output_heigh...

FILE: mfvis_nococo/mask2former/config.py
  function add_maskformer2_config (line 5) | def add_maskformer2_config(cfg):

FILE: mfvis_nococo/mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py
  function masks_to_boxes (line 18) | def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
  function convert_coco_poly_to_mask (line 47) | def convert_coco_poly_to_mask(segmentations, height, width):
  function build_transform_gen (line 66) | def build_transform_gen(cfg, is_train):
  class COCOInstanceNewBaselineDatasetMapper (line 99) | class COCOInstanceNewBaselineDatasetMapper:
    method __init__ (line 115) | def __init__(
    method from_config (line 139) | def from_config(cls, cfg, is_train=True):
    method __call__ (line 150) | def __call__(self, dataset_dict):

FILE: mfvis_nococo/mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py
  function build_transform_gen (line 17) | def build_transform_gen(cfg, is_train):
  class COCOPanopticNewBaselineDatasetMapper (line 50) | class COCOPanopticNewBaselineDatasetMapper:
    method __init__ (line 66) | def __init__(
    method from_config (line 93) | def from_config(cls, cfg, is_train=True):
    method __call__ (line 104) | def __call__(self, dataset_dict):

FILE: mfvis_nococo/mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py
  class MaskFormerInstanceDatasetMapper (line 18) | class MaskFormerInstanceDatasetMapper:
    method __init__ (line 32) | def __init__(
    method from_config (line 58) | def from_config(cls, cfg, is_train=True):
    method __call__ (line 86) | def __call__(self, dataset_dict):

FILE: mfvis_nococo/mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py
  class MaskFormerPanopticDatasetMapper (line 18) | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
    method __init__ (line 32) | def __init__(
    method __call__ (line 58) | def __call__(self, dataset_dict):

FILE: mfvis_nococo/mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py
  class MaskFormerSemanticDatasetMapper (line 18) | class MaskFormerSemanticDatasetMapper:
    method __init__ (line 32) | def __init__(
    method from_config (line 61) | def from_config(cls, cfg, is_train=True):
    method __call__ (line 97) | def __call__(self, dataset_dict):

FILE: mfvis_nococo/mask2former/data/datasets/register_ade20k_full.py
  function _get_ade20k_full_meta (line 925) | def _get_ade20k_full_meta():
  function register_all_ade20k_full (line 943) | def register_all_ade20k_full(root):

FILE: mfvis_nococo/mask2former/data/datasets/register_ade20k_instance.py
  function _get_ade_instances_meta (line 27) | def _get_ade_instances_meta():
  function register_all_ade20k_instance (line 40) | def register_all_ade20k_instance(root):

FILE: mfvis_nococo/mask2former/data/datasets/register_ade20k_panoptic.py
  function load_ade20k_panoptic_json (line 216) | def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, ...
  function register_ade20k_panoptic (line 270) | def register_ade20k_panoptic(
  function get_metadata (line 325) | def get_metadata():
  function register_all_ade20k_panoptic (line 369) | def register_all_ade20k_panoptic(root):

FILE: mfvis_nococo/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py
  function get_metadata (line 30) | def get_metadata():
  function load_coco_panoptic_json (line 74) | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, me...
  function register_coco_panoptic_annos_sem_seg (line 128) | def register_coco_panoptic_annos_sem_seg(
  function register_all_coco_panoptic_annos_sem_seg (line 159) | def register_all_coco_panoptic_annos_sem_seg(root):

FILE: mfvis_nococo/mask2former/data/datasets/register_coco_stuff_10k.py
  function _get_coco_stuff_meta (line 181) | def _get_coco_stuff_meta():
  function register_all_coco_stuff_10k (line 199) | def register_all_coco_stuff_10k(root):

FILE: mfvis_nococo/mask2former/data/datasets/register_mapillary_vistas.py
  function _get_mapillary_vistas_meta (line 472) | def _get_mapillary_vistas_meta():
  function register_all_mapillary_vistas (line 486) | def register_all_mapillary_vistas(root):

FILE: mfvis_nococo/mask2former/data/datasets/register_mapillary_vistas_panoptic.py
  function load_mapillary_vistas_panoptic_json (line 337) | def load_mapillary_vistas_panoptic_json(json_file, image_dir, gt_dir, se...
  function register_mapillary_vistas_panoptic (line 391) | def register_mapillary_vistas_panoptic(
  function get_metadata (line 444) | def get_metadata():
  function register_all_mapillary_vistas_panoptic (line 488) | def register_all_mapillary_vistas_panoptic(root):

FILE: mfvis_nococo/mask2former/evaluation/instance_evaluation.py
  class InstanceSegEvaluator (line 29) | class InstanceSegEvaluator(COCOEvaluator):
    method _eval_predictions (line 42) | def _eval_predictions(self, predictions, img_ids=None):

FILE: mfvis_nococo/mask2former/maskformer_model.py
  function unfold_wo_center (line 22) | def unfold_wo_center(x, kernel_size, dilation):
  function get_images_color_similarity (line 47) | def get_images_color_similarity(images, kernel_size, dilation):
  class MaskFormer (line 62) | class MaskFormer(nn.Module):
    method __init__ (line 68) | def __init__(
    method from_config (line 138) | def from_config(cls, cfg):
    method device (line 205) | def device(self):
    method forward (line 208) | def forward(self, batched_inputs):
    method prepare_targets (line 315) | def prepare_targets(self, targets, images):
    method semantic_inference (line 331) | def semantic_inference(self, mask_cls, mask_pred):
    method panoptic_inference (line 337) | def panoptic_inference(self, mask_cls, mask_pred):
    method instance_inference (line 395) | def instance_inference(self, mask_cls, mask_pred):

FILE: mfvis_nococo/mask2former/modeling/backbone/swin.py
  class Mlp (line 20) | class Mlp(nn.Module):
    method __init__ (line 23) | def __init__(
    method forward (line 34) | def forward(self, x):
  function window_partition (line 43) | def window_partition(x, window_size):
  function window_reverse (line 57) | def window_reverse(windows, window_size, H, W):
  class WindowAttention (line 73) | class WindowAttention(nn.Module):
    method __init__ (line 86) | def __init__(
    method forward (line 130) | def forward(self, x, mask=None):
  class SwinTransformerBlock (line 173) | class SwinTransformerBlock(nn.Module):
    method __init__ (line 190) | def __init__(
    method forward (line 234) | def forward(self, x, mask_matrix):
  class PatchMerging (line 297) | class PatchMerging(nn.Module):
    method __init__ (line 304) | def __init__(self, dim, norm_layer=nn.LayerNorm):
    method forward (line 310) | def forward(self, x, H, W):
  class BasicLayer (line 339) | class BasicLayer(nn.Module):
    method __init__ (line 357) | def __init__(
    method forward (line 405) | def forward(self, x, H, W):
  class PatchEmbed (line 455) | class PatchEmbed(nn.Module):
    method __init__ (line 464) | def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=...
    method forward (line 478) | def forward(self, x):
  class SwinTransformer (line 497) | class SwinTransformer(nn.Module):
    method __init__ (line 525) | def __init__(
    method _freeze_stages (line 617) | def _freeze_stages(self):
    method init_weights (line 634) | def init_weights(self, pretrained=None):
    method forward (line 650) | def forward(self, x):
    method train (line 679) | def train(self, mode=True):
  class D2SwinTransformer (line 686) | class D2SwinTransformer(SwinTransformer, Backbone):
    method __init__ (line 687) | def __init__(self, cfg, input_shape):
    method forward (line 742) | def forward(self, x):
    method output_shape (line 759) | def output_shape(self):
    method size_divisibility (line 768) | def size_divisibility(self):

FILE: mfvis_nococo/mask2former/modeling/criterion.py
  function unfold_wo_center (line 19) | def unfold_wo_center(x, kernel_size, dilation):
  function compute_pairwise_term (line 44) | def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation):
  function get_incoherent_mask (line 73) | def get_incoherent_mask(input_masks, sfact):
  function dice_coefficient (line 84) | def dice_coefficient(x, target):
  function compute_project_term (line 94) | def compute_project_term(mask_scores, gt_bitmasks):
  function dice_loss (line 105) | def dice_loss(
  function sigmoid_ce_loss (line 132) | def sigmoid_ce_loss(
  function calculate_uncertainty (line 157) | def calculate_uncertainty(logits):
  class SetCriterion (line 174) | class SetCriterion(nn.Module):
    method __init__ (line 181) | def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses,
    method loss_labels (line 210) | def loss_labels(self, outputs, targets, indices, num_masks):
    method loss_masks_proj (line 229) | def loss_masks_proj(self, outputs, targets, indices, num_masks, images...
    method loss_masks (line 287) | def loss_masks(self, outputs, targets, indices, num_masks):
    method _get_src_permutation_idx (line 339) | def _get_src_permutation_idx(self, indices):
    method _get_tgt_permutation_idx (line 345) | def _get_tgt_permutation_idx(self, indices):
    method get_loss (line 351) | def get_loss(self, loss, outputs, targets, indices, num_masks, images_...
    method forward (line 362) | def forward(self, outputs, targets, images_lab_sim):
    method __repr__ (line 399) | def __repr__(self):

FILE: mfvis_nococo/mask2former/modeling/matcher.py
  function batch_dice_loss (line 14) | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
  function batch_sigmoid_ce_loss (line 37) | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
  function masks_to_boxes (line 69) | def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
  function masks_to_boxes_cc (line 98) | def masks_to_boxes_cc(masks: torch.Tensor) -> torch.Tensor:
  class HungarianMatcher (line 134) | class HungarianMatcher(nn.Module):
    method __init__ (line 142) | def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_d...
    method memory_efficient_forward (line 163) | def memory_efficient_forward(self, outputs, targets):
    method forward (line 212) | def forward(self, outputs, targets):
    method __repr__ (line 234) | def __repr__(self, _repr_indent=4):

FILE: mfvis_nococo/mask2former/modeling/meta_arch/mask_former_head.py
  class MaskFormerHead (line 18) | class MaskFormerHead(nn.Module):
    method _load_from_state_dict (line 22) | def _load_from_state_dict(
    method __init__ (line 49) | def __init__(
    method from_config (line 89) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward (line 116) | def forward(self, features, mask=None):
    method layers (line 119) | def layers(self, features, mask=None):

FILE: mfvis_nococo/mask2former/modeling/meta_arch/per_pixel_baseline.py
  class PerPixelBaselineHead (line 17) | class PerPixelBaselineHead(nn.Module):
    method _load_from_state_dict (line 21) | def _load_from_state_dict(
    method __init__ (line 47) | def __init__(
    method from_config (line 82) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward (line 93) | def forward(self, features, targets=None):
    method layers (line 108) | def layers(self, features):
    method losses (line 113) | def losses(self, predictions, targets):
  class PerPixelBaselinePlusHead (line 126) | class PerPixelBaselinePlusHead(PerPixelBaselineHead):
    method _load_from_state_dict (line 127) | def _load_from_state_dict(
    method __init__ (line 152) | def __init__(
    method from_config (line 194) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward (line 207) | def forward(self, features, targets=None):
    method layers (line 230) | def layers(self, features):

FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/fpn.py
  function build_pixel_decoder (line 20) | def build_pixel_decoder(cfg, input_shape):
  class BasePixelDecoder (line 37) | class BasePixelDecoder(nn.Module):
    method __init__ (line 39) | def __init__(
    method from_config (line 125) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward_features (line 135) | def forward_features(self, features):
    method forward (line 155) | def forward(self, features, targets=None):
  class TransformerEncoderOnly (line 161) | class TransformerEncoderOnly(nn.Module):
    method __init__ (line 162) | def __init__(
    method _reset_parameters (line 185) | def _reset_parameters(self):
    method forward (line 190) | def forward(self, src, mask, pos_embed):
  class TransformerEncoderPixelDecoder (line 204) | class TransformerEncoderPixelDecoder(BasePixelDecoder):
    method __init__ (line 206) | def __init__(
    method from_config (line 272) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward_features (line 283) | def forward_features(self, features):
    method forward (line 308) | def forward(self, features, targets=None):

FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/msdeformattn.py
  class MSDeformAttnTransformerEncoderOnly (line 22) | class MSDeformAttnTransformerEncoderOnly(nn.Module):
    method __init__ (line 23) | def __init__(self, d_model=256, nhead=8,
    method _reset_parameters (line 42) | def _reset_parameters(self):
    method get_valid_ratio (line 51) | def get_valid_ratio(self, mask):
    method forward (line 60) | def forward(self, srcs, pos_embeds):
  class MSDeformAttnTransformerEncoderLayer (line 91) | class MSDeformAttnTransformerEncoderLayer(nn.Module):
    method __init__ (line 92) | def __init__(self,
    method with_pos_embed (line 112) | def with_pos_embed(tensor, pos):
    method forward_ffn (line 115) | def forward_ffn(self, src):
    method forward (line 121) | def forward(self, src, pos, reference_points, spatial_shapes, level_st...
  class MSDeformAttnTransformerEncoder (line 133) | class MSDeformAttnTransformerEncoder(nn.Module):
    method __init__ (line 134) | def __init__(self, encoder_layer, num_layers):
    method get_reference_points (line 140) | def get_reference_points(spatial_shapes, valid_ratios, device):
    method forward (line 154) | def forward(self, src, spatial_shapes, level_start_index, valid_ratios...
  class MSDeformAttnPixelDecoder (line 164) | class MSDeformAttnPixelDecoder(nn.Module):
    method __init__ (line 166) | def __init__(
    method from_config (line 294) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    method forward_features (line 314) | def forward_features(self, features):

FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py
  class MSDeformAttnFunction (line 31) | class MSDeformAttnFunction(Function):
    method forward (line 33) | def forward(ctx, value, value_spatial_shapes, value_level_start_index,...
    method backward (line 42) | def backward(ctx, grad_output):
  function ms_deform_attn_core_pytorch (line 51) | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_lo...

FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py
  function _is_power_of_2 (line 27) | def _is_power_of_2(n):
  class MSDeformAttn (line 33) | class MSDeformAttn(nn.Module):
    method __init__ (line 34) | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
    method _reset_parameters (line 65) | def _reset_parameters(self):
    method forward (line 81) | def forward(self, query, reference_points, input_flatten, input_spatia...

FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/setup.py
  function get_extensions (line 25) | def get_extensions():

FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp
  function ms_deform_attn_cpu_forward (line 22) | at::Tensor
  function ms_deform_attn_cpu_backward (line 34) | std::vector<at::Tensor>

FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h
  function im2col_step (line 32) | int im2col_step)

FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/vision.cpp
  function PYBIND11_MODULE (line 18) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: mfvis_nococo/mask2former/modeling/pixel_decoder/ops/test.py
  function check_forward_equal_with_pytorch_double (line 34) | def check_forward_equal_with_pytorch_double():
  function check_forward_equal_with_pytorch_float (line 50) | def check_forward_equal_with_pytorch_float():
  function check_gradient_numerical (line 65) | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_...

FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py
  class SelfAttentionLayer (line 16) | class SelfAttentionLayer(nn.Module):
    method __init__ (line 18) | def __init__(self, d_model, nhead, dropout=0.0,
    method _reset_parameters (line 31) | def _reset_parameters(self):
    method with_pos_embed (line 36) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 39) | def forward_post(self, tgt,
    method forward_pre (line 51) | def forward_pre(self, tgt,
    method forward (line 63) | def forward(self, tgt,
  class CrossAttentionLayer (line 74) | class CrossAttentionLayer(nn.Module):
    method __init__ (line 76) | def __init__(self, d_model, nhead, dropout=0.0,
    method _reset_parameters (line 89) | def _reset_parameters(self):
    method with_pos_embed (line 94) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 97) | def forward_post(self, tgt, memory,
    method forward_pre (line 111) | def forward_pre(self, tgt, memory,
    method forward (line 125) | def forward(self, tgt, memory,
  class FFNLayer (line 137) | class FFNLayer(nn.Module):
    method __init__ (line 139) | def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
    method _reset_parameters (line 154) | def _reset_parameters(self):
    method with_pos_embed (line 159) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 162) | def forward_post(self, tgt):
    method forward_pre (line 168) | def forward_pre(self, tgt):
    method forward (line 174) | def forward(self, tgt):
  function _get_activation_fn (line 180) | def _get_activation_fn(activation):
  class MLP (line 191) | class MLP(nn.Module):
    method __init__ (line 194) | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
    method forward (line 200) | def forward(self, x):
  class MultiScaleMaskedTransformerDecoder (line 207) | class MultiScaleMaskedTransformerDecoder(nn.Module):
    method _load_from_state_dict (line 211) | def _load_from_state_dict(
    method __init__ (line 235) | def __init__(
    method from_config (line 336) | def from_config(cls, cfg, in_channels, mask_classification):
    method forward (line 362) | def forward(self, x, mask_features, mask = None):
    method forward_prediction_heads (line 434) | def forward_prediction_heads(self, output, mask_features, attn_mask_ta...
    method _set_aux_loss (line 452) | def _set_aux_loss(self, outputs_class, outputs_seg_masks):

FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py
  function build_transformer_decoder (line 21) | def build_transformer_decoder(cfg, in_channels, mask_classification=True):
  class StandardTransformerDecoder (line 30) | class StandardTransformerDecoder(nn.Module):
    method __init__ (line 32) | def __init__(
    method from_config (line 107) | def from_config(cls, cfg, in_channels, mask_classification):
    method forward (line 129) | def forward(self, x, mask_features, mask=None):
    method _set_aux_loss (line 160) | def _set_aux_loss(self, outputs_class, outputs_seg_masks):
  class MLP (line 173) | class MLP(nn.Module):
    method __init__ (line 176) | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
    method forward (line 184) | def forward(self, x):

FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/position_encoding.py
  class PositionEmbeddingSine (line 11) | class PositionEmbeddingSine(nn.Module):
    method __init__ (line 17) | def __init__(self, num_pos_feats=64, temperature=10000, normalize=Fals...
    method forward (line 28) | def forward(self, x, mask=None):
    method __repr__ (line 53) | def __repr__(self, _repr_indent=4):

FILE: mfvis_nococo/mask2former/modeling/transformer_decoder/transformer.py
  class Transformer (line 18) | class Transformer(nn.Module):
    method __init__ (line 19) | def __init__(
    method _reset_parameters (line 55) | def _reset_parameters(self):
    method forward (line 60) | def forward(self, src, mask, query_embed, pos_embed):
  class TransformerEncoder (line 77) | class TransformerEncoder(nn.Module):
    method __init__ (line 78) | def __init__(self, encoder_layer, num_layers, norm=None):
    method forward (line 84) | def forward(
  class TransformerDecoder (line 104) | class TransformerDecoder(nn.Module):
    method __init__ (line 105) | def __init__(self, decoder_layer, num_layers, norm=None, return_interm...
    method forward (line 112) | def forward(
  class TransformerEncoderLayer (line 153) | class TransformerEncoderLayer(nn.Module):
    method __init__ (line 154) | def __init__(
    method with_pos_embed (line 178) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 181) | def forward_post(
    method forward_pre (line 199) | def forward_pre(
    method forward (line 217) | def forward(
  class TransformerDecoderLayer (line 229) | class TransformerDecoderLayer(nn.Module):
    method __init__ (line 230) | def __init__(
    method with_pos_embed (line 257) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 260) | def forward_post(
    method forward_pre (line 291) | def forward_pre(
    method forward (line 322) | def forward(
  function _get_clones (line 356) | def _get_clones(module, N):
  function _get_activation_fn (line 360) | def _get_activation_fn(activation):

FILE: mfvis_nococo/mask2former/test_time_augmentation.py
  class SemanticSegmentorWithTTA (line 20) | class SemanticSegmentorWithTTA(nn.Module):
    method __init__ (line 26) | def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
    method __call__ (line 48) | def __call__(self, batched_inputs):
    method _inference_one_image (line 70) | def _inference_one_image(self, input):
    method _get_augmented_inputs (line 99) | def _get_augmented_inputs(self, input):

FILE: mfvis_nococo/mask2former/utils/misc.py
  function _max_by_axis (line 15) | def _max_by_axis(the_list):
  class NestedTensor (line 24) | class NestedTensor(object):
    method __init__ (line 25) | def __init__(self, tensors, mask: Optional[Tensor]):
    method to (line 29) | def to(self, device):
    method decompose (line 40) | def decompose(self):
    method __repr__ (line 43) | def __repr__(self):
  function nested_tensor_from_tensor_list (line 47) | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
  function _onnx_nested_tensor_from_tensor_list (line 75) | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> N...
  function is_dist_avail_and_initialized (line 105) | def is_dist_avail_and_initialized():

FILE: mfvis_nococo/mask2former_video/config.py
  function add_maskformer2_video_config (line 6) | def add_maskformer2_video_config(cfg):

FILE: mfvis_nococo/mask2former_video/data_video/augmentation.py
  class ResizeShortestEdge (line 17) | class ResizeShortestEdge(T.Augmentation):
    method __init__ (line 23) | def __init__(
    method get_transform (line 48) | def get_transform(self, image):
  class RandomFlip (line 76) | class RandomFlip(T.Augmentation):
    method __init__ (line 81) | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_...
    method get_transform (line 98) | def get_transform(self, image):
  function build_augmentation (line 115) | def build_augmentation(cfg, is_train):

FILE: mfvis_nococo/mask2former_video/data_video/build.py
  function _compute_num_images_per_worker (line 21) | def _compute_num_images_per_worker(cfg: CfgNode):
  function filter_images_with_only_crowd_annotations (line 38) | def filter_images_with_only_crowd_annotations(dataset_dicts, dataset_nam...
  function get_detection_dataset_dicts (line 74) | def get_detection_dataset_dicts(
  function _train_loader_from_config (line 114) | def _train_loader_from_config(cfg, mapper, *, dataset=None, sampler=None):
  function build_detection_train_loader (line 143) | def build_detection_train_loader(
  function _test_loader_from_config (line 189) | def _test_loader_from_config(cfg, dataset_name, mapper=None):
  function build_detection_test_loader (line 209) | def build_detection_test_loader(dataset, *, mapper, num_workers=0):

FILE: mfvis_nococo/mask2former_video/data_video/dataset_mapper.py
  function seed_everything (line 27) | def seed_everything(seed):
  function filter_empty_instances (line 36) | def filter_empty_instances(instances, by_box=True, by_mask=True, box_thr...
  function _get_dummy_anno (line 66) | def _get_dummy_anno(num_classes):
  function ytvis_annotations_to_instances (line 77) | def ytvis_annotations_to_instances(annos, image_size):
  class YTVISDatasetMapper (line 123) | class YTVISDatasetMapper:
    method __init__ (line 130) | def __init__(
    method from_config (line 166) | def from_config(cls, cfg, is_train: bool = True):
    method __call__ (line 186) | def __call__(self, dataset_dict):
  class CocoClipDatasetMapper (line 283) | class CocoClipDatasetMapper:
    method __init__ (line 290) | def __init__(
    method from_config (line 319) | def from_config(cls, cfg, is_train: bool = True):
    method __call__ (line 334) | def __call__(self, dataset_dict):

FILE: mfvis_nococo/mask2former_video/data_video/datasets/builtin.py
  function register_all_ytvis_2019 (line 34) | def register_all_ytvis_2019(root):
  function register_all_ytvis_2021 (line 45) | def register_all_ytvis_2021(root):

FILE: mfvis_nococo/mask2former_video/data_video/datasets/ytvis.py
  function _get_ytvis_2019_instances_meta (line 115) | def _get_ytvis_2019_instances_meta():
  function _get_ytvis_2021_instances_meta (line 130) | def _get_ytvis_2021_instances_meta():
  function load_ytvis_json (line 145) | def load_ytvis_json(json_file, image_root, dataset_name=None, extra_anno...
  function register_ytvis_instances (line 271) | def register_ytvis_instances(name, metadata, json_file, image_root):
  function extract_frame_dic (line 318) | def extract_frame_dic(dic, frame_idx):

FILE: mfvis_nococo/mask2former_video/data_video/datasets/ytvis_api/ytvos.py
  function _isArrayLike (line 44) | def _isArrayLike(obj):
  class YTVOS (line 48) | class YTVOS:
    method __init__ (line 49) | def __init__(self, annotation_file=None):
    method createIndex (line 68) | def createIndex(self):
    method info (line 99) | def info(self):
    method getAnnIds (line 107) | def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None):
    method getCatIds (line 135) | def getCatIds(self, catNms=[], supNms=[], catIds=[]):
    method getVidIds (line 157) | def getVidIds(self, vidIds=[], catIds=[]):
    method loadAnns (line 178) | def loadAnns(self, ids=[]):
    method loadCats (line 189) | def loadCats(self, ids=[]):
    method loadVids (line 200) | def loadVids(self, ids=[]):
    method loadRes (line 212) | def loadRes(self, resFile):
    method annToRLE (line 262) | def annToRLE(self, ann, frameId):
    method annToMask (line 283) | def annToMask(self, ann, frameId):

FILE: mfvis_nococo/mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py
  class YTVOSeval (line 13) | class YTVOSeval:
    method __init__ (line 63) | def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
    method _prepare (line 88) | def _prepare(self):
    method evaluate (line 132) | def evaluate(self):
    method computeIoU (line 176) | def computeIoU(self, vidId, catId):
    method computeOks (line 224) | def computeOks(self, imgId, catId):
    method evaluateVid (line 267) | def evaluateVid(self, vidId, catId, aRng, maxDet):
    method accumulate (line 347) | def accumulate(self, p = None):
    method summarize (line 454) | def summarize(self):
    method __str__ (line 527) | def __str__(self):
  class Params (line 530) | class Params:
    method setDetParams (line 534) | def setDetParams(self):
    method setKpParams (line 547) | def setKpParams(self):
    method __init__ (line 558) | def __init__(self, iouType='segm'):

FILE: mfvis_nococo/mask2former_video/data_video/ytvis_eval.py
  class YTVISEvaluator (line 27) | class YTVISEvaluator(DatasetEvaluator):
    method __init__ (line 38) | def __init__(
    method reset (line 100) | def reset(self):
    method process (line 103) | def process(self, inputs, outputs):
    method evaluate (line 115) | def evaluate(self):
    method _eval_predictions (line 145) | def _eval_predictions(self, predictions):
    method _derive_coco_results (line 193) | def _derive_coco_results(self, coco_eval, class_names=None):
  function instances_to_coco_json_video (line 256) | def instances_to_coco_json_video(inputs, outputs):
  function _evaluate_predictions_on_coco (line 296) | def _evaluate_predictions_on_coco(

FILE: mfvis_nococo/mask2former_video/modeling/criterion.py
  function unfold_wo_center (line 20) | def unfold_wo_center(x, kernel_size, dilation):
  function unfold_w_center (line 45) | def unfold_w_center(x, kernel_size, dilation):
  function compute_pairwise_term (line 63) | def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation):
  function compute_pairwise_term_neighbor (line 92) | def compute_pairwise_term_neighbor(mask_logits, mask_logits_neighbor, pa...
  function dice_coefficient (line 123) | def dice_coefficient(x, target):
  function compute_project_term (line 133) | def compute_project_term(mask_scores, gt_bitmasks):
  function dice_loss (line 144) | def dice_loss(
  function sigmoid_ce_loss (line 171) | def sigmoid_ce_loss(
  function calculate_uncertainty (line 196) | def calculate_uncertainty(logits):
  class VideoSetCriterion (line 213) | class VideoSetCriterion(nn.Module):
    method __init__ (line 220) | def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses,
    method loss_labels (line 248) | def loss_labels(self, outputs, targets, indices, num_masks):
    method loss_masks (line 266) | def loss_masks(self, outputs, targets, indices, num_masks):
    method topk_mask (line 314) | def topk_mask(self, images_lab_sim, k):
    method loss_masks_proj (line 320) | def loss_masks_proj(self, outputs, targets, indices, num_masks, images...
    method _get_src_permutation_idx (line 428) | def _get_src_permutation_idx(self, indices):
    method _get_tgt_permutation_idx (line 434) | def _get_tgt_permutation_idx(self, indices):
    method get_loss (line 440) | def get_loss(self, loss, outputs, targets, indices, num_masks, images_...
    method forward (line 451) | def forward(self, outputs, targets, images_lab_sim, images_lab_sim_nei...
    method __repr__ (line 488) | def __repr__(self):

FILE: mfvis_nococo/mask2former_video/modeling/matcher.py
  function masks_to_boxes (line 13) | def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
  function masks_to_boxes_new (line 43) | def masks_to_boxes_new(masks: torch.Tensor) -> torch.Tensor:
  function batch_dice_loss (line 85) | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
  function batch_dice_loss_nosig (line 102) | def batch_dice_loss_nosig(inputs: torch.Tensor, targets: torch.Tensor):
  function batch_sigmoid_ce_loss (line 127) | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
  class VideoHungarianMatcher (line 158) | class VideoHungarianMatcher(nn.Module):
    method __init__ (line 166) | def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_d...
    method memory_efficient_forward (line 184) | def memory_efficient_forward(self, outputs, targets):
    method forward (line 242) | def forward(self, outputs, targets):
    method __repr__ (line 264) | def __repr__(self, _repr_indent=4):

FILE: mfvis_nococo/mask2former_video/modeling/transformer_decoder/position_encoding.py
  class PositionEmbeddingSine3D (line 12) | class PositionEmbeddingSine3D(nn.Module):
    method __init__ (line 18) | def __init__(self, num_pos_feats=64, temperature=10000, normalize=Fals...
    method forward (line 29) | def forward(self, x, mask=None):

FILE: mfvis_nococo/mask2former_video/modeling/transformer_decoder/video_mask2former_transformer_decoder.py
  class SelfAttentionLayer (line 18) | class SelfAttentionLayer(nn.Module):
    method __init__ (line 20) | def __init__(self, d_model, nhead, dropout=0.0,
    method _reset_parameters (line 33) | def _reset_parameters(self):
    method with_pos_embed (line 38) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 41) | def forward_post(self, tgt,
    method forward_pre (line 53) | def forward_pre(self, tgt,
    method forward (line 65) | def forward(self, tgt,
  class CrossAttentionLayer (line 76) | class CrossAttentionLayer(nn.Module):
    method __init__ (line 78) | def __init__(self, d_model, nhead, dropout=0.0,
    method _reset_parameters (line 91) | def _reset_parameters(self):
    method with_pos_embed (line 96) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 99) | def forward_post(self, tgt, memory,
    method forward_pre (line 113) | def forward_pre(self, tgt, memory,
    method forward (line 127) | def forward(self, tgt, memory,
  class FFNLayer (line 139) | class FFNLayer(nn.Module):
    method __init__ (line 141) | def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
    method _reset_parameters (line 156) | def _reset_parameters(self):
    method with_pos_embed (line 161) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
    method forward_post (line 164) | def forward_post(self, tgt):
    method forward_pre (line 170) | def forward_pre(self, tgt):
    method forward (line 176) | def forward(self, tgt):
  function _get_activation_fn (line 182) | def _get_activation_fn(activation):
  class MLP (line 193) | class MLP(nn.Module):
    method __init__ (line 196) | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
    method forward (line 202) | def forward(self, x):
  class VideoMultiScaleMaskedTransformerDecoder (line 209) | class VideoMultiScaleMaskedTransformerDecoder(nn.Module):
    method _load_from_state_dict (line 213) | def _load_from_state_dict(
    method __init__ (line 237) | def __init__(
    method from_config (line 342) | def from_config(cls, cfg, in_channels, mask_classification):
    method forward (line 370) | def forward(self, x, mask_features, mask = None):
    method forward_prediction_heads (line 444) | def forward_prediction_heads(self, output, mask_features, attn_mask_ta...
    method _set_aux_loss (line 464) | def _set_aux_loss(self, outputs_class, outputs_seg_masks):

FILE: mfvis_nococo/mask2former_video/utils/memory.py
  function _ignore_torch_cuda_oom (line 13) | def _ignore_torch_cuda_oom():
  function retry_if_cuda_oom (line 27) | def retry_if_cuda_oom(func):

FILE: mfvis_nococo/mask2former_video/video_maskformer_model.py
  function unfold_wo_center (line 25) | def unfold_wo_center(x, kernel_size, dilation):
  function unfold_w_center (line 50) | def unfold_w_center(x, kernel_size, dilation):
  function get_images_color_similarity (line 68) | def get_images_color_similarity(images, kernel_size, dilation):
  function get_neighbor_images_color_similarity (line 81) | def get_neighbor_images_color_similarity(images, images_neighbor, kernel...
  function get_neighbor_images_patch_color_similarity (line 94) | def get_neighbor_images_patch_color_similarity(images, images_neighbor, ...
  class VideoMaskFormer (line 116) | class VideoMaskFormer(nn.Module):
    method __init__ (line 122) | def __init__(
    method from_config (line 183) | def from_config(cls, cfg):
    method device (line 243) | def device(self):
    method forward (line 246) | def forward(self, batched_inputs):
    method prepare_targets (line 335) | def prepare_targets(self, targets, images):
    method inference_video (line 363) | def inference_video(self, pred_cls, pred_masks, img_size, output_heigh...

FILE: mfvis_nococo/train_net_video.py
  class Trainer (line 54) | class Trainer(DefaultTrainer):
    method build_evaluator (line 60) | def build_evaluator(cls, cfg, dataset_name, output_folder=None):
    method build_train_loader (line 74) | def build_train_loader(cls, cfg):
    method build_test_loader (line 87) | def build_test_loader(cls, cfg, dataset_name):
    method build_lr_scheduler (line 93) | def build_lr_scheduler(cls, cfg, optimizer):
    method build_optimizer (line 101) | def build_optimizer(cls, cfg, model):
    method test (line 182) | def test(cls, cfg, model, evaluators=None):
  function setup (line 238) | def setup(args):
  function main (line 257) | def main(args):

FILE: tools/analyze_model.py
  function setup (line 34) | def setup(args):
  function do_flop (line 51) | def do_flop(cfg):
  function do_activation (line 86) | def do_activation(cfg):
  function do_parameter (line 115) | def do_parameter(cfg):
  function do_structure (line 123) | def do_structure(cfg):

FILE: tools/evaluate_coco_boundary_ap.py
  function main (line 17) | def main():

FILE: tools/evaluate_pq_for_semantic_segmentation.py
  function default_argument_parser (line 20) | def default_argument_parser():
  function pq_compute_single_image (line 40) | def pq_compute_single_image(segm_gt, segm_dt, categories, ignore_label):
  function main (line 138) | def main():

FILE: train_net.py
  class Trainer (line 61) | class Trainer(DefaultTrainer):
    method build_evaluator (line 67) | def build_evaluator(cls, cfg, dataset_name, output_folder=None):
    method build_train_loader (line 149) | def build_train_loader(cls, cfg):
    method build_lr_scheduler (line 175) | def build_lr_scheduler(cls, cfg, optimizer):
    method build_optimizer (line 183) | def build_optimizer(cls, cfg, model):
    method test_with_TTA (line 264) | def test_with_TTA(cls, cfg, model):
  function setup (line 280) | def setup(args):
  function main (line 297) | def main(args):

FILE: train_net_video.py
  class Trainer (line 57) | class Trainer(DefaultTrainer):
    method build_evaluator (line 63) | def build_evaluator(cls, cfg, dataset_name, output_folder=None):
    method build_train_loader (line 77) | def build_train_loader(cls, cfg):
    method build_test_loader (line 101) | def build_test_loader(cls, cfg, dataset_name):
    method build_lr_scheduler (line 107) | def build_lr_scheduler(cls, cfg, optimizer):
    method build_optimizer (line 115) | def build_optimizer(cls, cfg, model):
    method test (line 196) | def test(cls, cfg, model, evaluators=None):
  function setup (line 252) | def setup(args):
  function main (line 271) | def main(args):

FILE: util/box_ops.py
  function box_cxcywh_to_xyxy (line 15) | def box_cxcywh_to_xyxy(x):
  function box_xyxy_to_cxcywh (line 24) | def box_xyxy_to_cxcywh(x):
  function box_iou (line 32) | def box_iou(boxes1, boxes2):
  function multi_box_iou (line 47) | def multi_box_iou(boxes1, boxes2):
  function generalized_box_iou (line 62) | def generalized_box_iou(boxes1, boxes2):
  function generalized_multi_box_iou (line 87) | def generalized_multi_box_iou(boxes1, boxes2):
  function masks_to_boxes (line 114) | def masks_to_boxes(masks):

FILE: util/misc.py
  function _check_size_scale_factor (line 31) | def _check_size_scale_factor(dim, size, scale_factor):
  function _output_size (line 42) | def _output_size(dim, input, size, scale_factor):
  class SmoothedValue (line 60) | class SmoothedValue(object):
    method __init__ (line 65) | def __init__(self, window_size=20, fmt=None):
    method update (line 73) | def update(self, value, n=1):
    method synchronize_between_processes (line 78) | def synchronize_between_processes(self):
    method median (line 92) | def median(self):
    method avg (line 97) | def avg(self):
    method global_avg (line 102) | def global_avg(self):
    method max (line 106) | def max(self):
    method value (line 110) | def value(self):
    method __str__ (line 113) | def __str__(self):
  function all_gather (line 122) | def all_gather(data):
  function reduce_dict (line 165) | def reduce_dict(input_dict, average=True):
  class MetricLogger (line 192) | class MetricLogger(object):
    method __init__ (line 193) | def __init__(self, delimiter="\t"):
    method update (line 197) | def update(self, **kwargs):
    method __getattr__ (line 204) | def __getattr__(self, attr):
    method __str__ (line 212) | def __str__(self):
    method synchronize_between_processes (line 220) | def synchronize_between_processes(self):
    method add_meter (line 224) | def add_meter(self, name, meter):
    method log_every (line 227) | def log_every(self, iterable, print_freq, header=None):
  function get_sha (line 282) | def get_sha():
  function collate_fn (line 302) | def collate_fn(batch):
  function _max_by_axis (line 308) | def _max_by_axis(the_list):
  function nested_tensor_from_tensor_list (line 317) | def nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divis...
  class NestedTensor (line 348) | class NestedTensor(object):
    method __init__ (line 349) | def __init__(self, tensors, mask: Optional[Tensor]):
    method to (line 353) | def to(self, device, non_blocking=False):
    method record_stream (line 364) | def record_stream(self, *args, **kwargs):
    method decompose (line 369) | def decompose(self):
    method __repr__ (line 372) | def __repr__(self):
  function setup_for_distributed (line 376) | def setup_for_distributed(is_master):
  function is_dist_avail_and_initialized (line 391) | def is_dist_avail_and_initialized():
  function get_world_size (line 399) | def get_world_size():
  function get_rank (line 405) | def get_rank():
  function get_local_size (line 411) | def get_local_size():
  function get_local_rank (line 417) | def get_local_rank():
  function is_main_process (line 423) | def is_main_process():
  function save_on_master (line 427) | def save_on_master(*args, **kwargs):
  function init_distributed_mode (line 432) | def init_distributed_mode(args):
  function accuracy (line 474) | def accuracy(output, target, topk=(1,)):
  function interpolate (line 492) | def interpolate(input, size=None, scale_factor=None, mode="nearest", ali...
  function get_total_grad_norm (line 514) | def get_total_grad_norm(parameters, norm_type=2):
  function inverse_sigmoid (line 522) | def inverse_sigmoid(x, eps=1e-5):

FILE: util/plot_utils.py
  function plot_logs (line 18) | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'),...
  function plot_precision_recall (line 74) | def plot_precision_recall(files, naming_scheme='iter'):
Condensed preview — 215 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,479K chars).
[
  {
    "path": "DATASET_prepare.md",
    "chars": 2204,
    "preview": "# Prepare Datasets for MaskFreeVIS\n\nA dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.i"
  },
  {
    "path": "LICENSE",
    "chars": 11357,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "README.md",
    "chars": 11417,
    "preview": "# MaskFreeVIS\n\nMask-Free Video Instance Segmentation [CVPR 2023].\n\nThis is the official pytorch implementation of [MaskF"
  },
  {
    "path": "configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml",
    "chars": 1056,
    "preview": "MODEL:\n  BACKBONE:\n    FREEZE_AT: 0\n    NAME: \"build_resnet_backbone\"\n  WEIGHTS: \"detectron2://ImageNetPretrained/torchv"
  },
  {
    "path": "configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml",
    "chars": 1280,
    "preview": "_BASE_: Base-COCO-InstanceSegmentation.yaml\nOUTPUT_DIR: './output/'\nMODEL:\n  META_ARCHITECTURE: \"MaskFormer\"\n  SEM_SEG_H"
  },
  {
    "path": "configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml",
    "chars": 1208,
    "preview": "MODEL:\n  BACKBONE:\n    FREEZE_AT: 0\n    NAME: \"build_resnet_backbone\"\n  WEIGHTS: \"detectron2://ImageNetPretrained/torchv"
  },
  {
    "path": "configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml",
    "chars": 1211,
    "preview": "MODEL:\n  BACKBONE:\n    FREEZE_AT: 0\n    NAME: \"build_resnet_backbone\"\n  WEIGHTS: \"detectron2://ImageNetPretrained/torchv"
  },
  {
    "path": "configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml",
    "chars": 1210,
    "preview": "MODEL:\n  BACKBONE:\n    FREEZE_AT: 0\n    NAME: \"build_resnet_backbone\"\n  WEIGHTS: \"detectron2://ImageNetPretrained/torchv"
  },
  {
    "path": "configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml",
    "chars": 570,
    "preview": "_BASE_: ../video_maskformer2_R50_bs16_8ep_swin.yaml\nOUTPUT_DIR: 'swinl_joint_withcoco'\nMODEL:\n  WEIGHTS: \"./pretrained_m"
  },
  {
    "path": "configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml",
    "chars": 364,
    "preview": "_BASE_: video_maskformer2_R50_bs16_8ep.yaml\nOUTPUT_DIR: './r101_coco_joint/'\nMODEL:\n  WEIGHTS: \"pretrained_model/model_f"
  },
  {
    "path": "configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml",
    "chars": 1871,
    "preview": "_BASE_: Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml\nOUTPUT_DIR: './r50_coco_joint/'\nSEED: 29118357\nMODEL:\n "
  },
  {
    "path": "configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep_swin.yaml",
    "chars": 1872,
    "preview": "_BASE_: Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml\nOUTPUT_DIR: './swinl_joint_withcoco/'\nSEED: 29118357\nMODEL:\n"
  },
  {
    "path": "demo/README.md",
    "chars": 164,
    "preview": "## Mask2Former Demo\n\nWe provide a command line tool to run a simple demo of builtin configs.\nThe usage is explained in ["
  },
  {
    "path": "demo/demo.py",
    "chars": 7045,
    "preview": "# Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py\nimport argparse\n"
  },
  {
    "path": "demo/predictor.py",
    "chars": 9024,
    "preview": "# Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py\nimport atexit\nimport bisect\n"
  },
  {
    "path": "demo_video/README.md",
    "chars": 170,
    "preview": "## Video Mask2Former Demo\n\nWe provide a command line tool to run a simple demo of builtin configs.\nThe usage is explaine"
  },
  {
    "path": "demo_video/demo.py",
    "chars": 6995,
    "preview": "# Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py\nimport argparse\n"
  },
  {
    "path": "demo_video/predictor.py",
    "chars": 7899,
    "preview": "# reference: https://github.com/sukjunhwang/IFC/blob/master/projects/IFC/demo/predictor.py\nimport atexit\nimport bisect\ni"
  },
  {
    "path": "demo_video/visualizer.py",
    "chars": 16375,
    "preview": "# reference: https://github.com/sukjunhwang/IFC/blob/master/projects/IFC/demo/visualizer.py\nimport torch\nimport numpy as"
  },
  {
    "path": "mask2former/__init__.py",
    "chars": 893,
    "preview": "from . import data  # register all new datasets\nfrom . import modeling\n\n# config\nfrom .config import add_maskformer2_con"
  },
  {
    "path": "mask2former/config.py",
    "chars": 4220,
    "preview": "# -*- coding: utf-8 -*-\nfrom detectron2.config import CfgNode as CN\n\n\ndef add_maskformer2_config(cfg):\n    \"\"\"\n    Add c"
  },
  {
    "path": "mask2former/data/__init__.py",
    "chars": 23,
    "preview": "from . import datasets\n"
  },
  {
    "path": "mask2former/data/dataset_mappers/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mask2former/data/dataset_mappers/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py",
    "chars": 8068,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py\nimport cop"
  },
  {
    "path": "mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py",
    "chars": 5759,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py\nimport cop"
  },
  {
    "path": "mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py",
    "chars": 6544,
    "preview": "import copy\nimport logging\n\nimport numpy as np\nimport pycocotools.mask as mask_util\nimport torch\nfrom torch.nn import fu"
  },
  {
    "path": "mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py",
    "chars": 6179,
    "preview": "import copy\nimport logging\n\nimport numpy as np\nimport torch\nfrom torch.nn import functional as F\n\nfrom detectron2.config"
  },
  {
    "path": "mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py",
    "chars": 6822,
    "preview": "import copy\nimport logging\n\nimport numpy as np\nimport torch\nfrom torch.nn import functional as F\n\nfrom detectron2.config"
  },
  {
    "path": "mask2former/data/datasets/__init__.py",
    "chars": 245,
    "preview": "from . import (\n    register_ade20k_full,\n    register_ade20k_panoptic,\n    register_coco_stuff_10k,\n    register_mapill"
  },
  {
    "path": "mask2former/data/datasets/register_ade20k_full.py",
    "chars": 52164,
    "preview": "import os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.data.datasets import load_sem_seg"
  },
  {
    "path": "mask2former/data/datasets/register_ade20k_instance.py",
    "chars": 4731,
    "preview": "import json\nimport logging\nimport numpy as np\nimport os\nfrom PIL import Image\n\nfrom detectron2.data import DatasetCatalo"
  },
  {
    "path": "mask2former/data/datasets/register_ade20k_panoptic.py",
    "chars": 19699,
    "preview": "import json\nimport os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.utils.file_io import "
  },
  {
    "path": "mask2former/data/datasets/register_coco_panoptic_annos_semseg.py",
    "chars": 7372,
    "preview": "import json\nimport os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.data.datasets import "
  },
  {
    "path": "mask2former/data/datasets/register_coco_stuff_10k.py",
    "chars": 13509,
    "preview": "import os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.data.datasets import load_sem_seg"
  },
  {
    "path": "mask2former/data/datasets/register_mapillary_vistas.py",
    "chars": 12967,
    "preview": "import os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.data.datasets import load_sem_seg"
  },
  {
    "path": "mask2former/data/datasets/register_mapillary_vistas_panoptic.py",
    "chars": 16319,
    "preview": "import json\nimport os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.utils.file_io import "
  },
  {
    "path": "mask2former/evaluation/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mask2former/evaluation/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mask2former/evaluation/instance_evaluation.py",
    "chars": 4574,
    "preview": "import contextlib\nimport copy\nimport io\nimport itertools\nimport json\nimport logging\nimport numpy as np\nimport os\nimport "
  },
  {
    "path": "mask2former/maskformer_model.py",
    "chars": 22062,
    "preview": "from typing import Tuple\n\nimport torch\nfrom torch import nn\nfrom torch.nn import functional as F\n\nfrom detectron2.config"
  },
  {
    "path": "mask2former/modeling/__init__.py",
    "chars": 302,
    "preview": "from .backbone.swin import D2SwinTransformer\nfrom .pixel_decoder.fpn import BasePixelDecoder\nfrom .pixel_decoder.msdefor"
  },
  {
    "path": "mask2former/modeling/backbone/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mask2former/modeling/backbone/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mask2former/modeling/backbone/swin.py",
    "chars": 27425,
    "preview": "# --------------------------------------------------------\n# Swin Transformer\n# Copyright (c) 2021 Microsoft\n# Licensed "
  },
  {
    "path": "mask2former/modeling/criterion.py",
    "chars": 15960,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py\n\"\"\"\nMaskFormer criter"
  },
  {
    "path": "mask2former/modeling/matcher.py",
    "chars": 9096,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py\n\"\"\"\nModules to com"
  },
  {
    "path": "mask2former/modeling/meta_arch/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mask2former/modeling/meta_arch/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mask2former/modeling/meta_arch/mask_former_head.py",
    "chars": 5683,
    "preview": "import logging\nfrom copy import deepcopy\nfrom typing import Callable, Dict, List, Optional, Tuple, Union\n\nimport fvcore."
  },
  {
    "path": "mask2former/modeling/meta_arch/per_pixel_baseline.py",
    "chars": 9382,
    "preview": "import logging\nfrom typing import Callable, Dict, List, Optional, Tuple, Union\n\nimport fvcore.nn.weight_init as weight_i"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mask2former/modeling/pixel_decoder/fpn.py",
    "chars": 12360,
    "preview": "import logging\nimport numpy as np\nfrom typing import Callable, Dict, List, Optional, Tuple, Union\n\nimport fvcore.nn.weig"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/msdeformattn.py",
    "chars": 15255,
    "preview": "import logging\nimport numpy as np\nfrom typing import Callable, Dict, List, Optional, Tuple, Union\n\nimport fvcore.nn.weig"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/functions/__init__.py",
    "chars": 683,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py",
    "chars": 3677,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/make.sh",
    "chars": 729,
    "preview": "#!/usr/bin/env bash\n# ------------------------------------------------------------------------------------------------\n#"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/modules/__init__.py",
    "chars": 669,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py",
    "chars": 6754,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/setup.py",
    "chars": 2987,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp",
    "chars": 1399,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h",
    "chars": 1282,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu",
    "chars": 7459,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h",
    "chars": 1283,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh",
    "chars": 54837,
    "preview": "/*!\n**************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 Se"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h",
    "chars": 1981,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/src/vision.cpp",
    "chars": 942,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mask2former/modeling/pixel_decoder/ops/test.py",
    "chars": 4172,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mask2former/modeling/transformer_decoder/__init__.py",
    "chars": 151,
    "preview": "from .maskformer_transformer_decoder import StandardTransformerDecoder\nfrom .mask2former_transformer_decoder import Mult"
  },
  {
    "path": "mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py",
    "chars": 18036,
    "preview": "# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py\nimport logging\nimpor"
  },
  {
    "path": "mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py",
    "chars": 7014,
    "preview": "# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py\nimport fvcore.nn.wei"
  },
  {
    "path": "mask2former/modeling/transformer_decoder/position_encoding.py",
    "chars": 2467,
    "preview": "# # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py\n\"\"\"\nV"
  },
  {
    "path": "mask2former/modeling/transformer_decoder/transformer.py",
    "chars": 11892,
    "preview": "# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/transformer.py\n\"\"\"\nTransform"
  },
  {
    "path": "mask2former/test_time_augmentation.py",
    "chars": 3731,
    "preview": "import copy\nimport logging\nfrom itertools import count\n\nimport numpy as np\nimport torch\nfrom fvcore.transforms import HF"
  },
  {
    "path": "mask2former/utils/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mask2former/utils/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mask2former/utils/misc.py",
    "chars": 3846,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py\n\"\"\"\nMisc functions, inc"
  },
  {
    "path": "mask2former_video/__init__.py",
    "chars": 379,
    "preview": "from . import modeling\n\n# config\nfrom .config import add_maskformer2_video_config\n\n# models\nfrom .video_maskformer_model"
  },
  {
    "path": "mask2former_video/config.py",
    "chars": 1030,
    "preview": "# -*- coding: utf-8 -*-\nfrom detectron2.config import CfgNode as CN\n\n\ndef add_maskformer2_video_config(cfg):\n    # video"
  },
  {
    "path": "mask2former_video/data_video/__init__.py",
    "chars": 222,
    "preview": "# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nfrom .dataset_mapper import YTVISDatasetMapper, CocoC"
  },
  {
    "path": "mask2former_video/data_video/augmentation.py",
    "chars": 24140,
    "preview": "# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nimport numpy as np\nimport logging\nimport sys\nfrom fvc"
  },
  {
    "path": "mask2former_video/data_video/build.py",
    "chars": 10077,
    "preview": "# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nimport itertools\nimport logging\nimport torch.utils.da"
  },
  {
    "path": "mask2former_video/data_video/combined_loader.py",
    "chars": 1367,
    "preview": "import random\nfrom collections import deque\nfrom typing import Any, Collection, Deque, Iterable, Iterator, List, Sequenc"
  },
  {
    "path": "mask2former_video/data_video/dataset_mapper.py",
    "chars": 19076,
    "preview": "# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nimport copy\nimport logging\nimport random\nimport numpy"
  },
  {
    "path": "mask2former_video/data_video/datasets/__init__.py",
    "chars": 224,
    "preview": "# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nfrom . import builtin  # ensure the builtin datasets "
  },
  {
    "path": "mask2former_video/data_video/datasets/builtin.py",
    "chars": 2883,
    "preview": "# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nimport os\n\nfrom .ytvis import (\n    register_ytvis_in"
  },
  {
    "path": "mask2former_video/data_video/datasets/ytvis.py",
    "chars": 15838,
    "preview": "# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nimport contextlib\nimport io\nimport json\nimport loggin"
  },
  {
    "path": "mask2former_video/data_video/datasets/ytvis_api/__init__.py",
    "chars": 69,
    "preview": "# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi\n"
  },
  {
    "path": "mask2former_video/data_video/datasets/ytvis_api/ytvos.py",
    "chars": 11671,
    "preview": "# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi\n\n__author__ = 'ychfan'\n# Interface for accessing th"
  },
  {
    "path": "mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py",
    "chars": 25607,
    "preview": "# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi\n\n__author__ = 'ychfan'\n\nimport numpy as np\nimport d"
  },
  {
    "path": "mask2former_video/data_video/ytvis_eval.py",
    "chars": 12482,
    "preview": "# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nimport contextlib\nimport copy\nimport io\nimport iterto"
  },
  {
    "path": "mask2former_video/modeling/__init__.py",
    "chars": 111,
    "preview": "from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder\n"
  },
  {
    "path": "mask2former_video/modeling/criterion.py",
    "chars": 21340,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py\n\nimport logging\n\nimpo"
  },
  {
    "path": "mask2former_video/modeling/matcher.py",
    "chars": 13290,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py\n\"\"\"\nModules to com"
  },
  {
    "path": "mask2former_video/modeling/transformer_decoder/__init__.py",
    "chars": 91,
    "preview": "from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder\n"
  },
  {
    "path": "mask2former_video/modeling/transformer_decoder/position_encoding.py",
    "chars": 2631,
    "preview": "# # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py\n\"\"\"\nV"
  },
  {
    "path": "mask2former_video/modeling/transformer_decoder/video_mask2former_transformer_decoder.py",
    "chars": 18739,
    "preview": "# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py\nimport logging\nimpor"
  },
  {
    "path": "mask2former_video/utils/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mask2former_video/utils/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mask2former_video/utils/memory.py",
    "chars": 2598,
    "preview": "\nimport logging\nfrom contextlib import contextmanager\nfrom functools import wraps\nimport torch\nfrom torch.cuda.amp impor"
  },
  {
    "path": "mask2former_video/video_maskformer_model.py",
    "chars": 17800,
    "preview": "import logging\nimport math\nfrom typing import Tuple\n\nimport torch\nfrom torch import nn\nfrom torch.nn import functional a"
  },
  {
    "path": "mfvis_nococo/__init__.py",
    "chars": 325,
    "preview": "from . import modeling\n\n# config\nfrom .config import add_maskformer2_video_config\n\n# models\nfrom .video_maskformer_model"
  },
  {
    "path": "mfvis_nococo/configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml",
    "chars": 1184,
    "preview": "MODEL:\n  BACKBONE:\n    FREEZE_AT: 0\n    NAME: \"build_resnet_backbone\"\n  WEIGHTS: \"detectron2://ImageNetPretrained/torchv"
  },
  {
    "path": "mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep_coco.yaml",
    "chars": 395,
    "preview": "_BASE_: video_maskformer2_R50_bs16_8ep.yaml\nOUTPUT_DIR: 'box_patch_newknn_t5s5_spretrained1_r101_correct'\nMODEL:\n  WEIGH"
  },
  {
    "path": "mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml",
    "chars": 1400,
    "preview": "_BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml\nOUTPUT_DIR: 'box_patch_newknn_t5s5_spretrained3_correct1'\nSEED: 2"
  },
  {
    "path": "mfvis_nococo/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep_coco.yaml",
    "chars": 1424,
    "preview": "_BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml\nOUTPUT_DIR: 'box_patch_newknn_t5s5_spretrained3_coco_correct1'\nSE"
  },
  {
    "path": "mfvis_nococo/mask2former/__init__.py",
    "chars": 893,
    "preview": "from . import data  # register all new datasets\nfrom . import modeling\n\n# config\nfrom .config import add_maskformer2_con"
  },
  {
    "path": "mfvis_nococo/mask2former/config.py",
    "chars": 4220,
    "preview": "# -*- coding: utf-8 -*-\nfrom detectron2.config import CfgNode as CN\n\n\ndef add_maskformer2_config(cfg):\n    \"\"\"\n    Add c"
  },
  {
    "path": "mfvis_nococo/mask2former/data/__init__.py",
    "chars": 23,
    "preview": "from . import datasets\n"
  },
  {
    "path": "mfvis_nococo/mask2former/data/dataset_mappers/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mfvis_nococo/mask2former/data/dataset_mappers/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mfvis_nococo/mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py",
    "chars": 8068,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py\nimport cop"
  },
  {
    "path": "mfvis_nococo/mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py",
    "chars": 5759,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py\nimport cop"
  },
  {
    "path": "mfvis_nococo/mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py",
    "chars": 6544,
    "preview": "import copy\nimport logging\n\nimport numpy as np\nimport pycocotools.mask as mask_util\nimport torch\nfrom torch.nn import fu"
  },
  {
    "path": "mfvis_nococo/mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py",
    "chars": 6179,
    "preview": "import copy\nimport logging\n\nimport numpy as np\nimport torch\nfrom torch.nn import functional as F\n\nfrom detectron2.config"
  },
  {
    "path": "mfvis_nococo/mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py",
    "chars": 6822,
    "preview": "import copy\nimport logging\n\nimport numpy as np\nimport torch\nfrom torch.nn import functional as F\n\nfrom detectron2.config"
  },
  {
    "path": "mfvis_nococo/mask2former/data/datasets/__init__.py",
    "chars": 245,
    "preview": "from . import (\n    register_ade20k_full,\n    register_ade20k_panoptic,\n    register_coco_stuff_10k,\n    register_mapill"
  },
  {
    "path": "mfvis_nococo/mask2former/data/datasets/register_ade20k_full.py",
    "chars": 52164,
    "preview": "import os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.data.datasets import load_sem_seg"
  },
  {
    "path": "mfvis_nococo/mask2former/data/datasets/register_ade20k_instance.py",
    "chars": 4731,
    "preview": "import json\nimport logging\nimport numpy as np\nimport os\nfrom PIL import Image\n\nfrom detectron2.data import DatasetCatalo"
  },
  {
    "path": "mfvis_nococo/mask2former/data/datasets/register_ade20k_panoptic.py",
    "chars": 19699,
    "preview": "import json\nimport os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.utils.file_io import "
  },
  {
    "path": "mfvis_nococo/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py",
    "chars": 7372,
    "preview": "import json\nimport os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.data.datasets import "
  },
  {
    "path": "mfvis_nococo/mask2former/data/datasets/register_coco_stuff_10k.py",
    "chars": 13509,
    "preview": "import os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.data.datasets import load_sem_seg"
  },
  {
    "path": "mfvis_nococo/mask2former/data/datasets/register_mapillary_vistas.py",
    "chars": 12967,
    "preview": "import os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.data.datasets import load_sem_seg"
  },
  {
    "path": "mfvis_nococo/mask2former/data/datasets/register_mapillary_vistas_panoptic.py",
    "chars": 16319,
    "preview": "import json\nimport os\n\nfrom detectron2.data import DatasetCatalog, MetadataCatalog\nfrom detectron2.utils.file_io import "
  },
  {
    "path": "mfvis_nococo/mask2former/evaluation/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mfvis_nococo/mask2former/evaluation/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mfvis_nococo/mask2former/evaluation/instance_evaluation.py",
    "chars": 4574,
    "preview": "import contextlib\nimport copy\nimport io\nimport itertools\nimport json\nimport logging\nimport numpy as np\nimport os\nimport "
  },
  {
    "path": "mfvis_nococo/mask2former/maskformer_model.py",
    "chars": 18772,
    "preview": "from typing import Tuple\n\nimport torch\nfrom torch import nn\nfrom torch.nn import functional as F\n\nfrom detectron2.config"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/__init__.py",
    "chars": 302,
    "preview": "from .backbone.swin import D2SwinTransformer\nfrom .pixel_decoder.fpn import BasePixelDecoder\nfrom .pixel_decoder.msdefor"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/backbone/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/backbone/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/backbone/swin.py",
    "chars": 27425,
    "preview": "# --------------------------------------------------------\n# Swin Transformer\n# Copyright (c) 2021 Microsoft\n# Licensed "
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/criterion.py",
    "chars": 16127,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py\n\"\"\"\nMaskFormer criter"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/matcher.py",
    "chars": 9096,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py\n\"\"\"\nModules to com"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/meta_arch/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/meta_arch/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/meta_arch/mask_former_head.py",
    "chars": 5683,
    "preview": "import logging\nfrom copy import deepcopy\nfrom typing import Callable, Dict, List, Optional, Tuple, Union\n\nimport fvcore."
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/meta_arch/per_pixel_baseline.py",
    "chars": 9382,
    "preview": "import logging\nfrom typing import Callable, Dict, List, Optional, Tuple, Union\n\nimport fvcore.nn.weight_init as weight_i"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/fpn.py",
    "chars": 12360,
    "preview": "import logging\nimport numpy as np\nfrom typing import Callable, Dict, List, Optional, Tuple, Union\n\nimport fvcore.nn.weig"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/msdeformattn.py",
    "chars": 15255,
    "preview": "import logging\nimport numpy as np\nfrom typing import Callable, Dict, List, Optional, Tuple, Union\n\nimport fvcore.nn.weig"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/functions/__init__.py",
    "chars": 683,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py",
    "chars": 3677,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/make.sh",
    "chars": 729,
    "preview": "#!/usr/bin/env bash\n# ------------------------------------------------------------------------------------------------\n#"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/modules/__init__.py",
    "chars": 669,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py",
    "chars": 6754,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/setup.py",
    "chars": 2987,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp",
    "chars": 1399,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h",
    "chars": 1282,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu",
    "chars": 7459,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h",
    "chars": 1283,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh",
    "chars": 54837,
    "preview": "/*!\n**************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 Se"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h",
    "chars": 1981,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/src/vision.cpp",
    "chars": 942,
    "preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/pixel_decoder/ops/test.py",
    "chars": 4172,
    "preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/transformer_decoder/__init__.py",
    "chars": 151,
    "preview": "from .maskformer_transformer_decoder import StandardTransformerDecoder\nfrom .mask2former_transformer_decoder import Mult"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py",
    "chars": 18036,
    "preview": "# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py\nimport logging\nimpor"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py",
    "chars": 7014,
    "preview": "# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py\nimport fvcore.nn.wei"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/transformer_decoder/position_encoding.py",
    "chars": 2467,
    "preview": "# # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py\n\"\"\"\nV"
  },
  {
    "path": "mfvis_nococo/mask2former/modeling/transformer_decoder/transformer.py",
    "chars": 11892,
    "preview": "# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/transformer.py\n\"\"\"\nTransform"
  },
  {
    "path": "mfvis_nococo/mask2former/test_time_augmentation.py",
    "chars": 3731,
    "preview": "import copy\nimport logging\nfrom itertools import count\n\nimport numpy as np\nimport torch\nfrom fvcore.transforms import HF"
  },
  {
    "path": "mfvis_nococo/mask2former/utils/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mfvis_nococo/mask2former/utils/__init__.py.new",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mfvis_nococo/mask2former/utils/misc.py",
    "chars": 3846,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py\n\"\"\"\nMisc functions, inc"
  },
  {
    "path": "mfvis_nococo/mask2former_video/__init__.py",
    "chars": 376,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nfrom . import modeling\n\n# config\nfrom .config import add_maskformer2_"
  },
  {
    "path": "mfvis_nococo/mask2former_video/config.py",
    "chars": 402,
    "preview": "# -*- coding: utf-8 -*-\n# Copyright (c) Facebook, Inc. and its affiliates.\nfrom detectron2.config import CfgNode as CN\n\n"
  },
  {
    "path": "mfvis_nococo/mask2former_video/data_video/__init__.py",
    "chars": 273,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nfr"
  },
  {
    "path": "mfvis_nococo/mask2former_video/data_video/augmentation.py",
    "chars": 6073,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nim"
  },
  {
    "path": "mfvis_nococo/mask2former_video/data_video/build.py",
    "chars": 9790,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nim"
  },
  {
    "path": "mfvis_nococo/mask2former_video/data_video/dataset_mapper.py",
    "chars": 14498,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nim"
  },
  {
    "path": "mfvis_nococo/mask2former_video/data_video/datasets/__init__.py",
    "chars": 275,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nfr"
  },
  {
    "path": "mfvis_nococo/mask2former_video/data_video/datasets/builtin.py",
    "chars": 2100,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nim"
  },
  {
    "path": "mfvis_nococo/mask2former_video/data_video/datasets/ytvis.py",
    "chars": 15553,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nim"
  },
  {
    "path": "mfvis_nococo/mask2former_video/data_video/datasets/ytvis_api/__init__.py",
    "chars": 120,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi\n"
  },
  {
    "path": "mfvis_nococo/mask2former_video/data_video/datasets/ytvis_api/ytvos.py",
    "chars": 11722,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi\n"
  },
  {
    "path": "mfvis_nococo/mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py",
    "chars": 25658,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi\n"
  },
  {
    "path": "mfvis_nococo/mask2former_video/data_video/ytvis_eval.py",
    "chars": 12533,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC\n\nim"
  },
  {
    "path": "mfvis_nococo/mask2former_video/modeling/__init__.py",
    "chars": 162,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nfrom .transformer_decoder.video_mask2former_transformer_decoder impor"
  },
  {
    "path": "mfvis_nococo/mask2former_video/modeling/criterion.py",
    "chars": 22095,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
  },
  {
    "path": "mfvis_nococo/mask2former_video/modeling/matcher.py",
    "chars": 10647,
    "preview": "# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py\n\"\"\"\nModules to com"
  },
  {
    "path": "mfvis_nococo/mask2former_video/modeling/transformer_decoder/__init__.py",
    "chars": 142,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nfrom .video_mask2former_transformer_decoder import VideoMultiScaleMas"
  },
  {
    "path": "mfvis_nococo/mask2former_video/modeling/transformer_decoder/position_encoding.py",
    "chars": 2682,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# # Modified by Bowen Cheng from: https://github.com/facebookresearch"
  },
  {
    "path": "mfvis_nococo/mask2former_video/modeling/transformer_decoder/video_mask2former_transformer_decoder.py",
    "chars": 18615,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from: https://github.com/facebookresearch/d"
  },
  {
    "path": "mfvis_nococo/mask2former_video/utils/__init__.py",
    "chars": 51,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n"
  },
  {
    "path": "mfvis_nococo/mask2former_video/utils/memory.py",
    "chars": 2649,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n\nimport logging\nfrom contextlib import contextmanager\nfrom functools "
  },
  {
    "path": "mfvis_nococo/mask2former_video/video_maskformer_model.py",
    "chars": 16803,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport logging\nimport math\nfrom typing import Tuple\n\nimport torch\nfro"
  },
  {
    "path": "mfvis_nococo/scripts/eval_8gpu_mask2former_r101_video.sh",
    "chars": 319,
    "preview": "export PYTHONPATH=$PYTHONPATH:`pwd`\n\nID=159\n\nCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus "
  },
  {
    "path": "mfvis_nococo/scripts/train_8gpu_mask2former_r101_video_coco.sh",
    "chars": 249,
    "preview": "export PYTHONPATH=$PYTHONPATH:`pwd`\n\nID=159\n\n\nCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus"
  },
  {
    "path": "mfvis_nococo/scripts/train_8gpu_mask2former_r50_video.sh",
    "chars": 243,
    "preview": "export PYTHONPATH=$PYTHONPATH:`pwd`\n\nID=159\n\n\nCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus"
  },
  {
    "path": "mfvis_nococo/scripts/train_8gpu_mask2former_r50_video_coco.sh",
    "chars": 247,
    "preview": "export PYTHONPATH=$PYTHONPATH:`pwd`\n\nID=159\n\nCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus "
  },
  {
    "path": "mfvis_nococo/scripts/visual_video_r101.sh",
    "chars": 346,
    "preview": "export PYTHONPATH=$PYTHONPATH:`pwd`\n\nCUDA_VISIBLE_DEVICES=0 python3 demo_video/demo.py --config-file configs/youtubevis_"
  },
  {
    "path": "mfvis_nococo/scripts/visual_video_r50.sh",
    "chars": 343,
    "preview": "export PYTHONPATH=$PYTHONPATH:`pwd`\n\nCUDA_VISIBLE_DEVICES=0 python3 demo_video/demo.py --config-file configs/youtubevis_"
  },
  {
    "path": "mfvis_nococo/train_net_video.py",
    "chars": 10343,
    "preview": "\"\"\"\nThis script is a simplified version of the training script in detectron2/tools.\n\"\"\"\ntry:\n    # ignore ShapelyDepreca"
  },
  {
    "path": "requirements.txt",
    "chars": 53,
    "preview": "cython\nscipy\nshapely\ntimm\nh5py\nsubmitit\nscikit-image\n"
  },
  {
    "path": "scripts/eval_8gpu_mask2former_r101_video.sh",
    "chars": 316,
    "preview": "export PYTHONPATH=$PYTHONPATH:`pwd`\n\nID=159\n\n\nCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus"
  },
  {
    "path": "scripts/eval_8gpu_mask2former_r50_video.sh",
    "chars": 314,
    "preview": "export PYTHONPATH=$PYTHONPATH:`pwd`\n\nID=159\n\nCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus "
  },
  {
    "path": "scripts/eval_8gpu_mask2former_swinl_video.sh",
    "chars": 338,
    "preview": "export PYTHONPATH=$PYTHONPATH:`pwd`\n\nID=159\n\n\nCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus"
  },
  {
    "path": "scripts/train_8gpu_mask2former_r101_video.sh",
    "chars": 241,
    "preview": "export PYTHONPATH=$PYTHONPATH:`pwd`\n\nID=159\n\nCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train_net_video.py --num-gpus "
  }
]

// ... and 15 more files (download for full content)

About this extraction

This page contains the full source code of the SysCV/MaskFreeVIS GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 215 files (1.3 MB), approximately 370.7k tokens, and a symbol index with 1000 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!