Repository: facebookresearch/maskrcnn-benchmark Branch: main Commit: 57eec25b7514 Files: 251 Total size: 811.4 KB Directory structure: gitextract_3g1fmxak/ ├── .flake8 ├── .github/ │ └── ISSUE_TEMPLATE/ │ ├── bug-report.md │ ├── feature-request.md │ └── questions-help-support.md ├── .gitignore ├── ABSTRACTIONS.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── INSTALL.md ├── LICENSE ├── MODEL_ZOO.md ├── README.md ├── TROUBLESHOOTING.md ├── configs/ │ ├── caffe2/ │ │ ├── e2e_faster_rcnn_R_101_FPN_1x_caffe2.yaml │ │ ├── e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml │ │ ├── e2e_faster_rcnn_R_50_FPN_1x_caffe2.yaml │ │ ├── e2e_faster_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml │ │ ├── e2e_keypoint_rcnn_R_50_FPN_1x_caffe2.yaml │ │ ├── e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml │ │ ├── e2e_mask_rcnn_R_50_C4_1x_caffe2.yaml │ │ ├── e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml │ │ ├── e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x_caffe2.yaml │ │ └── e2e_mask_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml │ ├── cityscapes/ │ │ ├── README.md │ │ ├── e2e_faster_rcnn_R_50_FPN_1x_cocostyle.yaml │ │ ├── e2e_mask_rcnn_R_50_FPN_1x_binarymask.yaml │ │ ├── e2e_mask_rcnn_R_50_FPN_1x_cocostyle.yaml │ │ └── e2e_mask_rcnn_R_50_FPN_1x_poly.yaml │ ├── dcn/ │ │ ├── README.md │ │ ├── e2e_faster_rcnn_dconv_R_50_FPN_1x.yaml │ │ ├── e2e_faster_rcnn_mdconv_R_50_FPN_1x.yaml │ │ ├── e2e_mask_rcnn_dconv_R_50_FPN_1x.yaml │ │ └── e2e_mask_rcnn_mdconv_R_50_FPN_1x.yaml │ ├── e2e_faster_rcnn_R_101_FPN_1x.yaml │ ├── e2e_faster_rcnn_R_50_C4_1x.yaml │ ├── e2e_faster_rcnn_R_50_FPN_1x.yaml │ ├── e2e_faster_rcnn_X_101_32x8d_FPN_1x.yaml │ ├── e2e_faster_rcnn_fbnet.yaml │ ├── e2e_faster_rcnn_fbnet_600.yaml │ ├── e2e_faster_rcnn_fbnet_chamv1a_600.yaml │ ├── e2e_keypoint_rcnn_R_50_FPN_1x.yaml │ ├── e2e_mask_rcnn_R_101_FPN_1x.yaml │ ├── e2e_mask_rcnn_R_50_C4_1x.yaml │ ├── e2e_mask_rcnn_R_50_FPN_1x.yaml │ ├── e2e_mask_rcnn_R_50_FPN_1x_periodically_testing.yaml │ ├── e2e_mask_rcnn_X_101_32x8d_FPN_1x.yaml │ ├── e2e_mask_rcnn_fbnet.yaml │ ├── e2e_mask_rcnn_fbnet_600.yaml │ ├── e2e_mask_rcnn_fbnet_xirb16d_dsmask.yaml │ ├── e2e_mask_rcnn_fbnet_xirb16d_dsmask_600.yaml │ ├── gn_baselines/ │ │ ├── README.md │ │ ├── e2e_faster_rcnn_R_50_FPN_1x_gn.yaml │ │ ├── e2e_faster_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml │ │ ├── e2e_mask_rcnn_R_50_FPN_1x_gn.yaml │ │ ├── e2e_mask_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml │ │ ├── scratch_e2e_faster_rcnn_R_50_FPN_3x_gn.yaml │ │ ├── scratch_e2e_faster_rcnn_R_50_FPN_Xconv1fc_3x_gn.yaml │ │ ├── scratch_e2e_mask_rcnn_R_50_FPN_3x_gn.yaml │ │ └── scratch_e2e_mask_rcnn_R_50_FPN_Xconv1fc_3x_gn.yaml │ ├── pascal_voc/ │ │ ├── e2e_faster_rcnn_R_50_C4_1x_1_gpu_voc.yaml │ │ ├── e2e_faster_rcnn_R_50_C4_1x_4_gpu_voc.yaml │ │ └── e2e_mask_rcnn_R_50_FPN_1x_cocostyle.yaml │ ├── quick_schedules/ │ │ ├── e2e_faster_rcnn_R_50_C4_quick.yaml │ │ ├── e2e_faster_rcnn_R_50_FPN_quick.yaml │ │ ├── e2e_faster_rcnn_X_101_32x8d_FPN_quick.yaml │ │ ├── e2e_keypoint_rcnn_R_50_FPN_quick.yaml │ │ ├── e2e_mask_rcnn_R_50_C4_quick.yaml │ │ ├── e2e_mask_rcnn_R_50_FPN_quick.yaml │ │ ├── e2e_mask_rcnn_X_101_32x8d_FPN_quick.yaml │ │ ├── rpn_R_50_C4_quick.yaml │ │ └── rpn_R_50_FPN_quick.yaml │ ├── retinanet/ │ │ ├── retinanet_R-101-FPN_1x.yaml │ │ ├── retinanet_R-101-FPN_P5_1x.yaml │ │ ├── retinanet_R-50-FPN_1x.yaml │ │ ├── retinanet_R-50-FPN_1x_quick.yaml │ │ ├── retinanet_R-50-FPN_P5_1x.yaml │ │ └── retinanet_X_101_32x8d_FPN_1x.yaml │ ├── rpn_R_101_FPN_1x.yaml │ ├── rpn_R_50_C4_1x.yaml │ ├── rpn_R_50_FPN_1x.yaml │ ├── rpn_X_101_32x8d_FPN_1x.yaml │ └── test_time_aug/ │ └── e2e_mask_rcnn_R_50_FPN_1x.yaml ├── demo/ │ ├── README.md │ ├── predictor.py │ └── webcam.py ├── docker/ │ ├── Dockerfile │ └── docker-jupyter/ │ ├── Dockerfile │ └── jupyter_notebook_config.py ├── maskrcnn_benchmark/ │ ├── __init__.py │ ├── config/ │ │ ├── __init__.py │ │ ├── defaults.py │ │ └── paths_catalog.py │ ├── csrc/ │ │ ├── ROIAlign.h │ │ ├── ROIPool.h │ │ ├── SigmoidFocalLoss.h │ │ ├── cpu/ │ │ │ ├── ROIAlign_cpu.cpp │ │ │ ├── nms_cpu.cpp │ │ │ └── vision.h │ │ ├── cuda/ │ │ │ ├── ROIAlign_cuda.cu │ │ │ ├── ROIPool_cuda.cu │ │ │ ├── SigmoidFocalLoss_cuda.cu │ │ │ ├── deform_conv_cuda.cu │ │ │ ├── deform_conv_kernel_cuda.cu │ │ │ ├── deform_pool_cuda.cu │ │ │ ├── deform_pool_kernel_cuda.cu │ │ │ ├── nms.cu │ │ │ └── vision.h │ │ ├── deform_conv.h │ │ ├── deform_pool.h │ │ ├── nms.h │ │ └── vision.cpp │ ├── data/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── build.py │ │ ├── collate_batch.py │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── abstract.py │ │ │ ├── cityscapes.py │ │ │ ├── coco.py │ │ │ ├── concat_dataset.py │ │ │ ├── evaluation/ │ │ │ │ ├── __init__.py │ │ │ │ ├── cityscapes/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── cityscapes_eval.py │ │ │ │ │ └── eval_instances.py │ │ │ │ ├── coco/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── abs_to_coco.py │ │ │ │ │ ├── coco_eval.py │ │ │ │ │ └── coco_eval_wrapper.py │ │ │ │ └── voc/ │ │ │ │ ├── __init__.py │ │ │ │ └── voc_eval.py │ │ │ ├── list_dataset.py │ │ │ └── voc.py │ │ ├── samplers/ │ │ │ ├── __init__.py │ │ │ ├── distributed.py │ │ │ ├── grouped_batch_sampler.py │ │ │ └── iteration_based_batch_sampler.py │ │ └── transforms/ │ │ ├── __init__.py │ │ ├── build.py │ │ └── transforms.py │ ├── engine/ │ │ ├── __init__.py │ │ ├── bbox_aug.py │ │ ├── inference.py │ │ └── trainer.py │ ├── layers/ │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── batch_norm.py │ │ ├── dcn/ │ │ │ ├── __init__.py │ │ │ ├── deform_conv_func.py │ │ │ ├── deform_conv_module.py │ │ │ ├── deform_pool_func.py │ │ │ └── deform_pool_module.py │ │ ├── misc.py │ │ ├── nms.py │ │ ├── roi_align.py │ │ ├── roi_pool.py │ │ ├── sigmoid_focal_loss.py │ │ └── smooth_l1_loss.py │ ├── modeling/ │ │ ├── __init__.py │ │ ├── backbone/ │ │ │ ├── __init__.py │ │ │ ├── backbone.py │ │ │ ├── fbnet.py │ │ │ ├── fbnet_builder.py │ │ │ ├── fbnet_modeldef.py │ │ │ ├── fpn.py │ │ │ └── resnet.py │ │ ├── balanced_positive_negative_sampler.py │ │ ├── box_coder.py │ │ ├── detector/ │ │ │ ├── __init__.py │ │ │ ├── detectors.py │ │ │ └── generalized_rcnn.py │ │ ├── make_layers.py │ │ ├── matcher.py │ │ ├── poolers.py │ │ ├── registry.py │ │ ├── roi_heads/ │ │ │ ├── __init__.py │ │ │ ├── box_head/ │ │ │ │ ├── __init__.py │ │ │ │ ├── box_head.py │ │ │ │ ├── inference.py │ │ │ │ ├── loss.py │ │ │ │ ├── roi_box_feature_extractors.py │ │ │ │ └── roi_box_predictors.py │ │ │ ├── keypoint_head/ │ │ │ │ ├── __init__.py │ │ │ │ ├── inference.py │ │ │ │ ├── keypoint_head.py │ │ │ │ ├── loss.py │ │ │ │ ├── roi_keypoint_feature_extractors.py │ │ │ │ └── roi_keypoint_predictors.py │ │ │ ├── mask_head/ │ │ │ │ ├── __init__.py │ │ │ │ ├── inference.py │ │ │ │ ├── loss.py │ │ │ │ ├── mask_head.py │ │ │ │ ├── roi_mask_feature_extractors.py │ │ │ │ └── roi_mask_predictors.py │ │ │ └── roi_heads.py │ │ ├── rpn/ │ │ │ ├── __init__.py │ │ │ ├── anchor_generator.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ ├── retinanet/ │ │ │ │ ├── __init__.py │ │ │ │ ├── inference.py │ │ │ │ ├── loss.py │ │ │ │ └── retinanet.py │ │ │ ├── rpn.py │ │ │ └── utils.py │ │ └── utils.py │ ├── solver/ │ │ ├── __init__.py │ │ ├── build.py │ │ └── lr_scheduler.py │ ├── structures/ │ │ ├── __init__.py │ │ ├── bounding_box.py │ │ ├── boxlist_ops.py │ │ ├── image_list.py │ │ ├── keypoint.py │ │ └── segmentation_mask.py │ └── utils/ │ ├── README.md │ ├── __init__.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── collect_env.py │ ├── comm.py │ ├── cv2_util.py │ ├── env.py │ ├── imports.py │ ├── logger.py │ ├── metric_logger.py │ ├── miscellaneous.py │ ├── model_serialization.py │ ├── model_zoo.py │ ├── registry.py │ └── timer.py ├── requirements.txt ├── setup.py ├── tests/ │ ├── checkpoint.py │ ├── env_tests/ │ │ └── env.py │ ├── test_backbones.py │ ├── test_box_coder.py │ ├── test_configs.py │ ├── test_data_samplers.py │ ├── test_detectors.py │ ├── test_fbnet.py │ ├── test_feature_extractors.py │ ├── test_metric_logger.py │ ├── test_nms.py │ ├── test_predictors.py │ ├── test_rpn_heads.py │ ├── test_segmentation_mask.py │ └── utils.py └── tools/ ├── cityscapes/ │ ├── convert_cityscapes_to_coco.py │ └── instances2dict_with_polygons.py ├── test_net.py └── train_net.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .flake8 ================================================ # This is an example .flake8 config, used when developing *Black* itself. # Keep in sync with setup.cfg which is used for source packages. [flake8] ignore = E203, E266, E501, W503 max-line-length = 80 max-complexity = 18 select = B,C,E,F,W,T4,B9 ================================================ FILE: .github/ISSUE_TEMPLATE/bug-report.md ================================================ --- name: "\U0001F41B Bug Report" about: Submit a bug report to help us improve Mask R-CNN Benchmark --- ## 🐛 Bug ## To Reproduce Steps to reproduce the behavior: 1. 1. 1. ## Expected behavior ## Environment Please copy and paste the output from the [environment collection script from PyTorch](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py) (or fill out the checklist below manually). You can get the script and run it with: ``` wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py # For security purposes, please check the contents of collect_env.py before running it. python collect_env.py ``` - PyTorch Version (e.g., 1.0): - OS (e.g., Linux): - How you installed PyTorch (`conda`, `pip`, source): - Build command you used (if compiling from source): - Python version: - CUDA/cuDNN version: - GPU models and configuration: - Any other relevant information: ## Additional context ================================================ FILE: .github/ISSUE_TEMPLATE/feature-request.md ================================================ --- name: "\U0001F680Feature Request" about: Submit a proposal/request for a new Mask R-CNN Benchmark feature --- ## 🚀 Feature ## Motivation ## Pitch ## Alternatives ## Additional context ================================================ FILE: .github/ISSUE_TEMPLATE/questions-help-support.md ================================================ --- name: "❓Questions/Help/Support" about: Do you need support? --- ## ❓ Questions and Help ================================================ FILE: .gitignore ================================================ # compilation and distribution __pycache__ _ext *.pyc *.so maskrcnn_benchmark.egg-info/ build/ dist/ # pytorch/python/numpy formats *.pth *.pkl *.npy # ipython/jupyter notebooks *.ipynb **/.ipynb_checkpoints/ # Editor temporaries *.swn *.swo *.swp *~ # Pycharm editor settings .idea # vscode editor settings .vscode # MacOS .DS_Store # project dirs /datasets /models /output ================================================ FILE: ABSTRACTIONS.md ================================================ ## Abstractions The main abstractions introduced by `maskrcnn_benchmark` that are useful to have in mind are the following: ### ImageList In PyTorch, the first dimension of the input to the network generally represents the batch dimension, and thus all elements of the same batch have the same height / width. In order to support images with different sizes and aspect ratios in the same batch, we created the `ImageList` class, which holds internally a batch of images (os possibly different sizes). The images are padded with zeros such that they have the same final size and batched over the first dimension. The original sizes of the images before padding are stored in the `image_sizes` attribute, and the batched tensor in `tensors`. We provide a convenience function `to_image_list` that accepts a few different input types, including a list of tensors, and returns an `ImageList` object. ```python from maskrcnn_benchmark.structures.image_list import to_image_list images = [torch.rand(3, 100, 200), torch.rand(3, 150, 170)] batched_images = to_image_list(images) # it is also possible to make the final batched image be a multiple of a number batched_images_32 = to_image_list(images, size_divisible=32) ``` ### BoxList The `BoxList` class holds a set of bounding boxes (represented as a `Nx4` tensor) for a specific image, as well as the size of the image as a `(width, height)` tuple. It also contains a set of methods that allow to perform geometric transformations to the bounding boxes (such as cropping, scaling and flipping). The class accepts bounding boxes from two different input formats: - `xyxy`, where each box is encoded as a `x1`, `y1`, `x2` and `y2` coordinates, and - `xywh`, where each box is encoded as `x1`, `y1`, `w` and `h`. Additionally, each `BoxList` instance can also hold arbitrary additional information for each bounding box, such as labels, visibility, probability scores etc. Here is an example on how to create a `BoxList` from a list of coordinates: ```python from maskrcnn_benchmark.structures.bounding_box import BoxList, FLIP_LEFT_RIGHT width = 100 height = 200 boxes = [ [0, 10, 50, 50], [50, 20, 90, 60], [10, 10, 50, 50] ] # create a BoxList with 3 boxes bbox = BoxList(boxes, image_size=(width, height), mode='xyxy') # perform some box transformations, has similar API as PIL.Image bbox_scaled = bbox.resize((width * 2, height * 3)) bbox_flipped = bbox.transpose(FLIP_LEFT_RIGHT) # add labels for each bbox labels = torch.tensor([0, 10, 1]) bbox.add_field('labels', labels) # bbox also support a few operations, like indexing # here, selects boxes 0 and 2 bbox_subset = bbox[[0, 2]] ``` ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Code of Conduct Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please read the [full text](https://code.fb.com/codeofconduct/) so that you can understand what actions will and will not be tolerated. ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to Mask-RCNN Benchmark We want to make contributing to this project as easy and transparent as possible. ## Our Development Process Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. ## Pull Requests We actively welcome your pull requests. 1. Fork the repo and create your branch from `master`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. 5. Make sure your code lints. 6. If you haven't already, complete the Contributor License Agreement ("CLA"). ## Contributor License Agreement ("CLA") In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Facebook's open source projects. Complete your CLA here: ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure of security bugs. In those cases, please go through the process outlined on that page and do not file a public issue. ## Coding Style * 4 spaces for indentation rather than tabs * 80 character line length * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) ## License By contributing to Mask-RCNN Benchmark, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. ================================================ FILE: INSTALL.md ================================================ ## Installation ### Requirements: - PyTorch 1.0 from a nightly release. It **will not** work with 1.0 nor 1.0.1. Installation instructions can be found in https://pytorch.org/get-started/locally/ - torchvision from master - cocoapi - yacs - matplotlib - GCC >= 4.9 - OpenCV - CUDA >= 9.0 ### Option 1: Step-by-step installation ```bash # first, make sure that your conda is setup properly with the right environment # for that, check that `which conda`, `which pip` and `which python` points to the # right path. From a clean conda env, this is what you need to do conda create --name maskrcnn_benchmark -y conda activate maskrcnn_benchmark # this installs the right pip and dependencies for the fresh python conda install ipython pip # maskrcnn_benchmark and coco api dependencies pip install ninja yacs cython matplotlib tqdm opencv-python # follow PyTorch installation in https://pytorch.org/get-started/locally/ # we give the instructions for CUDA 9.0 conda install -c pytorch pytorch-nightly torchvision cudatoolkit=9.0 export INSTALL_DIR=$PWD # install pycocotools cd $INSTALL_DIR git clone https://github.com/cocodataset/cocoapi.git cd cocoapi/PythonAPI python setup.py build_ext install # install cityscapesScripts cd $INSTALL_DIR git clone https://github.com/mcordts/cityscapesScripts.git cd cityscapesScripts/ python setup.py build_ext install # install apex cd $INSTALL_DIR git clone https://github.com/NVIDIA/apex.git cd apex python setup.py install --cuda_ext --cpp_ext # install PyTorch Detection cd $INSTALL_DIR git clone https://github.com/facebookresearch/maskrcnn-benchmark.git cd maskrcnn-benchmark # the following will install the lib with # symbolic links, so that you can modify # the files if you want and won't need to # re-build it python setup.py build develop unset INSTALL_DIR # or if you are on macOS # MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build develop ``` #### Windows 10 ```bash open a cmd and change to desired installation directory from now on will be refered as INSTALL_DIR conda create --name maskrcnn_benchmark conda activate maskrcnn_benchmark # this installs the right pip and dependencies for the fresh python conda install ipython # maskrcnn_benchmark and coco api dependencies pip install ninja yacs cython matplotlib tqdm opencv-python # follow PyTorch installation in https://pytorch.org/get-started/locally/ # we give the instructions for CUDA 9.0 ## Important : check the cuda version installed on your computer by running the command in the cmd : nvcc -- version conda install -c pytorch pytorch-nightly torchvision cudatoolkit=9.0 git clone https://github.com/cocodataset/cocoapi.git #To prevent installation error do the following after commiting cocooapi : #using file explorer naviagate to cocoapi\PythonAPI\setup.py and change line 14 from: #extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], #to #extra_compile_args={'gcc': ['/Qstd=c99']}, #Based on https://github.com/cocodataset/cocoapi/issues/51 cd cocoapi/PythonAPI python setup.py build_ext install # navigate back to INSTALL_DIR cd .. cd .. # install apex git clone https://github.com/NVIDIA/apex.git cd apex python setup.py install --cuda_ext --cpp_ext # navigate back to INSTALL_DIR cd .. # install PyTorch Detection git clone https://github.com/Idolized22/maskrcnn-benchmark.git cd maskrcnn-benchmark # the following will install the lib with # symbolic links, so that you can modify # the files if you want and won't need to # re-build it python setup.py build develop ``` ### Option 2: Docker Image (Requires CUDA, Linux only) Build image with defaults (`CUDA=9.0`, `CUDNN=7`, `FORCE_CUDA=1`): nvidia-docker build -t maskrcnn-benchmark docker/ Build image with other CUDA and CUDNN versions: nvidia-docker build -t maskrcnn-benchmark --build-arg CUDA=9.2 --build-arg CUDNN=7 docker/ Build image with FORCE_CUDA disabled: nvidia-docker build -t maskrcnn-benchmark --build-arg FORCE_CUDA=0 docker/ Build and run image with built-in jupyter notebook(note that the password is used to log in jupyter notebook): nvidia-docker build -t maskrcnn-benchmark-jupyter docker/docker-jupyter/ nvidia-docker run -td -p 8888:8888 -e PASSWORD= -v : maskrcnn-benchmark-jupyter ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2018 Facebook Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: MODEL_ZOO.md ================================================ ## Model Zoo and Baselines ### Hardware - 8 NVIDIA V100 GPUs ### Software - PyTorch version: 1.0.0a0+dd2c487 - CUDA 9.2 - CUDNN 7.1 - NCCL 2.2.13-1 ### End-to-end Faster and Mask R-CNN baselines All the baselines were trained using the exact same experimental setup as in Detectron. We initialize the detection models with ImageNet weights from Caffe2, the same as used by Detectron. The pre-trained models are available in the link in the model id. backbone | type | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time(hr) | inference time(s/im) | box AP | mask AP | model id -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- R-50-C4 | Fast | 1x | 1 | 5.8 | 0.4036 | 20.2 | 0.17130 | 34.8 | - | [6358800](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_C4_1x.pth) R-50-FPN | Fast | 1x | 2 | 4.4 | 0.3530 | 8.8 | 0.12580 | 36.8 | - | [6358793](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_FPN_1x.pth) R-101-FPN | Fast | 1x | 2 | 7.1 | 0.4591 | 11.5 | 0.143149 | 39.1 | - | [6358804](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_101_FPN_1x.pth) X-101-32x8d-FPN | Fast | 1x | 1 | 7.6 | 0.7007 | 35.0 | 0.209965 | 41.2 | - | [6358717](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_X_101_32x8d_FPN_1x.pth) R-50-C4 | Mask | 1x | 1 | 5.8 | 0.4520 | 22.6 | 0.17796 + 0.028 | 35.6 | 31.5 | [6358801](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_C4_1x.pth) R-50-FPN | Mask | 1x | 2 | 5.2 | 0.4536 | 11.3 | 0.12966 + 0.034 | 37.8 | 34.2 | [6358792](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_FPN_1x.pth) R-101-FPN | Mask | 1x | 2 | 7.9 | 0.5665 | 14.2 | 0.15384 + 0.034 | 40.1 | 36.1 | [6358805](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_101_FPN_1x.pth) X-101-32x8d-FPN | Mask | 1x | 1 | 7.8 | 0.7562 | 37.8 | 0.21739 + 0.034 | 42.2 | 37.8 | [6358718](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_X_101_32x8d_FPN_1x.pth) For person keypoint detection: backbone | type | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time(hr) | inference time(s/im) | box AP | keypoint AP | model id -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- R-50-FPN | Keypoint | 1x | 2 | 5.7 | 0.3771 | 9.4 | 0.10941 | 53.7 | 64.3 | 9981060 ### Light-weight Model baselines We provided pre-trained models for selected FBNet models. * All the models are trained from scratched with BN using the training schedule specified below. * Evaluation is performed on a single NVIDIA V100 GPU with `MODEL.RPN.POST_NMS_TOP_N_TEST` set to `200`. The following inference time is reported: * inference total batch=8: Total inference time including data loading, model inference and pre/post preprocessing using 8 images per batch. * inference model batch=8: Model inference time only and using 8 images per batch. * inference model batch=1: Model inference time only and using 1 image per batch. * inferenee caffe2 batch=1: Model inference time for the model in Caffe2 format using 1 image per batch. The Caffe2 models fused the BN to Conv and purely run on C++/CUDA by using Caffe2 ops for rpn/detection post processing. The pre-trained models are available in the link in the model id. backbone | type | resolution | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time (hr) | inference total batch=8 (s/im) | inference model batch=8 (s/im) | inference model batch=1 (s/im) | inference caffe2 batch=1 (s/im) | box AP | mask AP | model id -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- [R-50-C4](configs/e2e_faster_rcnn_R_50_C4_1x.yaml) (reference) | Fast | 800 | 1x | 1 | 5.8 | 0.4036 | 20.2 | 0.0875 | **0.0793** | 0.0831 | **0.0625** | 34.4 | - | f35857197 [fbnet_chamv1a](configs/e2e_faster_rcnn_fbnet_chamv1a_600.yaml) | Fast | 600 | 0.75x | 12 | 13.6 | 0.5444 | 20.5 | 0.0315 | **0.0260** | 0.0376 | **0.0188** | 33.5 | - | [f100940543](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_fbnet_chamv1a_600.pth) [fbnet_default](configs/e2e_faster_rcnn_fbnet_600.yaml) | Fast | 600 | 0.5x | 16 | 11.1 | 0.4872 | 12.5 | 0.0316 | **0.0250** | 0.0297 | **0.0130** | 28.2 | - | [f101086388](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_fbnet_600.pth) [R-50-C4](configs/e2e_mask_rcnn_R_50_C4_1x.yaml) (reference) | Mask | 800 | 1x | 1 | 5.8 | 0.452 | 22.6 | 0.0918 | **0.0848** | 0.0844 | - | 35.2 | 31.0 | f35858791 [fbnet_xirb16d](configs/e2e_mask_rcnn_fbnet_xirb16d_dsmask_600.yaml) | Mask | 600 | 0.5x | 16 | 13.4 | 1.1732 | 29 | 0.0386 | **0.0319** | 0.0356 | - | 30.7 | 26.9 | [f101086394](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_fbnet_xirb16d_dsmask.pth) [fbnet_default](configs/e2e_mask_rcnn_fbnet_600.yaml) | Mask | 600 | 0.5x | 16 | 13.0 | 0.9036 | 23.0 | 0.0327 | **0.0269** | 0.0385 | - | 29.0 | 26.1 | [f101086385](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_fbnet_600.pth) ## Comparison with Detectron and mmdetection In the following section, we compare our implementation with [Detectron](https://github.com/facebookresearch/Detectron) and [mmdetection](https://github.com/open-mmlab/mmdetection). The same remarks from [mmdetection](https://github.com/open-mmlab/mmdetection/blob/master/MODEL_ZOO.md#training-speed) about different hardware applies here. ### Training speed The numbers here are in seconds / iteration. The lower, the better. type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100) -- | -- | -- | -- Faster R-CNN R-50 C4 | 0.566 | - | 0.4036 Faster R-CNN R-50 FPN | 0.544 | 0.554 | 0.3530 Faster R-CNN R-101 FPN | 0.647 | - | 0.4591 Faster R-CNN X-101-32x8d FPN | 0.799 | - | 0.7007 Mask R-CNN R-50 C4 | 0.620 | - | 0.4520 Mask R-CNN R-50 FPN | 0.889 | 0.690 | 0.4536 Mask R-CNN R-101 FPN | 1.008 | - | 0.5665 Mask R-CNN X-101-32x8d FPN | 0.961 | - | 0.7562 ### Training memory The lower, the better type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100) -- | -- | -- | -- Faster R-CNN R-50 C4 | 6.3 | - | 5.8 Faster R-CNN R-50 FPN | 7.2 | 4.9 | 4.4 Faster R-CNN R-101 FPN | 8.9 | - | 7.1 Faster R-CNN X-101-32x8d FPN | 7.0 | - | 7.6 Mask R-CNN R-50 C4 | 6.6 | - | 5.8 Mask R-CNN R-50 FPN | 8.6 | 5.9 | 5.2 Mask R-CNN R-101 FPN | 10.2 | - | 7.9 Mask R-CNN X-101-32x8d FPN | 7.7 | - | 7.8 ### Accuracy The higher, the better type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100) -- | -- | -- | -- Faster R-CNN R-50 C4 | 34.8 | - | 34.8 Faster R-CNN R-50 FPN | 36.7 | 36.7 | 36.8 Faster R-CNN R-101 FPN | 39.4 | - | 39.1 Faster R-CNN X-101-32x8d FPN | 41.3 | - | 41.2 Mask R-CNN R-50 C4 | 35.8 & 31.4 | - | 35.6 & 31.5 Mask R-CNN R-50 FPN | 37.7 & 33.9 | 37.5 & 34.4 | 37.8 & 34.2 Mask R-CNN R-101 FPN | 40.0 & 35.9 | - | 40.1 & 36.1 Mask R-CNN X-101-32x8d FPN | 42.1 & 37.3 | - | 42.2 & 37.8 ================================================ FILE: README.md ================================================ # Faster R-CNN and Mask R-CNN in PyTorch 1.0 **maskrcnn-benchmark has been deprecated. Please see [detectron2](https://github.com/facebookresearch/detectron2), which includes implementations for all models in maskrcnn-benchmark** This project aims at providing the necessary building blocks for easily creating detection and segmentation models using PyTorch 1.0. ![alt text](demo/demo_e2e_mask_rcnn_X_101_32x8d_FPN_1x.png "from http://cocodataset.org/#explore?id=345434") ## Highlights - **PyTorch 1.0:** RPN, Faster R-CNN and Mask R-CNN implementations that matches or exceeds Detectron accuracies - **Very fast**: up to **2x** faster than [Detectron](https://github.com/facebookresearch/Detectron) and **30%** faster than [mmdetection](https://github.com/open-mmlab/mmdetection) during training. See [MODEL_ZOO.md](MODEL_ZOO.md) for more details. - **Memory efficient:** uses roughly 500MB less GPU memory than mmdetection during training - **Multi-GPU training and inference** - **Mixed precision training:** trains faster with less GPU memory on [NVIDIA tensor cores](https://developer.nvidia.com/tensor-cores). - **Batched inference:** can perform inference using multiple images per batch per GPU - **CPU support for inference:** runs on CPU in inference time. See our [webcam demo](demo) for an example - Provides pre-trained models for almost all reference Mask R-CNN and Faster R-CNN configurations with 1x schedule. ## Webcam and Jupyter notebook demo We provide a simple webcam demo that illustrates how you can use `maskrcnn_benchmark` for inference: ```bash cd demo # by default, it runs on the GPU # for best results, use min-image-size 800 python webcam.py --min-image-size 800 # can also run it on the CPU python webcam.py --min-image-size 300 MODEL.DEVICE cpu # or change the model that you want to use python webcam.py --config-file ../configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml --min-image-size 300 MODEL.DEVICE cpu # in order to see the probability heatmaps, pass --show-mask-heatmaps python webcam.py --min-image-size 300 --show-mask-heatmaps MODEL.DEVICE cpu # for the keypoint demo python webcam.py --config-file ../configs/caffe2/e2e_keypoint_rcnn_R_50_FPN_1x_caffe2.yaml --min-image-size 300 MODEL.DEVICE cpu ``` A notebook with the demo can be found in [demo/Mask_R-CNN_demo.ipynb](demo/Mask_R-CNN_demo.ipynb). ## Installation Check [INSTALL.md](INSTALL.md) for installation instructions. ## Model Zoo and Baselines Pre-trained models, baselines and comparison with Detectron and mmdetection can be found in [MODEL_ZOO.md](MODEL_ZOO.md) ## Inference in a few lines We provide a helper class to simplify writing inference pipelines using pre-trained models. Here is how we would do it. Run this from the `demo` folder: ```python from maskrcnn_benchmark.config import cfg from predictor import COCODemo config_file = "../configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml" # update the config options with the config file cfg.merge_from_file(config_file) # manual override some options cfg.merge_from_list(["MODEL.DEVICE", "cpu"]) coco_demo = COCODemo( cfg, min_image_size=800, confidence_threshold=0.7, ) # load image and then run prediction image = ... predictions = coco_demo.run_on_opencv_image(image) ``` ## Perform training on COCO dataset For the following examples to work, you need to first install `maskrcnn_benchmark`. You will also need to download the COCO dataset. We recommend to symlink the path to the coco dataset to `datasets/` as follows We use `minival` and `valminusminival` sets from [Detectron](https://github.com/facebookresearch/Detectron/blob/master/detectron/datasets/data/README.md#coco-minival-annotations) ```bash # symlink the coco dataset cd ~/github/maskrcnn-benchmark mkdir -p datasets/coco ln -s /path_to_coco_dataset/annotations datasets/coco/annotations ln -s /path_to_coco_dataset/train2014 datasets/coco/train2014 ln -s /path_to_coco_dataset/test2014 datasets/coco/test2014 ln -s /path_to_coco_dataset/val2014 datasets/coco/val2014 # or use COCO 2017 version ln -s /path_to_coco_dataset/annotations datasets/coco/annotations ln -s /path_to_coco_dataset/train2017 datasets/coco/train2017 ln -s /path_to_coco_dataset/test2017 datasets/coco/test2017 ln -s /path_to_coco_dataset/val2017 datasets/coco/val2017 # for pascal voc dataset: ln -s /path_to_VOCdevkit_dir datasets/voc ``` P.S. `COCO_2017_train` = `COCO_2014_train` + `valminusminival` , `COCO_2017_val` = `minival` You can also configure your own paths to the datasets. For that, all you need to do is to modify `maskrcnn_benchmark/config/paths_catalog.py` to point to the location where your dataset is stored. You can also create a new `paths_catalog.py` file which implements the same two classes, and pass it as a config argument `PATHS_CATALOG` during training. ### Single GPU training Most of the configuration files that we provide assume that we are running on 8 GPUs. In order to be able to run it on fewer GPUs, there are a few possibilities: **1. Run the following without modifications** ```bash python /path_to_maskrcnn_benchmark/tools/train_net.py --config-file "/path/to/config/file.yaml" ``` This should work out of the box and is very similar to what we should do for multi-GPU training. But the drawback is that it will use much more GPU memory. The reason is that we set in the configuration files a global batch size that is divided over the number of GPUs. So if we only have a single GPU, this means that the batch size for that GPU will be 8x larger, which might lead to out-of-memory errors. If you have a lot of memory available, this is the easiest solution. **2. Modify the cfg parameters** If you experience out-of-memory errors, you can reduce the global batch size. But this means that you'll also need to change the learning rate, the number of iterations and the learning rate schedule. Here is an example for Mask R-CNN R-50 FPN with the 1x schedule: ```bash python tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025 SOLVER.MAX_ITER 720000 SOLVER.STEPS "(480000, 640000)" TEST.IMS_PER_BATCH 1 MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN 2000 ``` This follows the [scheduling rules from Detectron.](https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14-L30) Note that we have multiplied the number of iterations by 8x (as well as the learning rate schedules), and we have divided the learning rate by 8x. We also changed the batch size during testing, but that is generally not necessary because testing requires much less memory than training. Furthermore, we set `MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN 2000` as the proposals are selected for per the batch rather than per image in the default training. The value is calculated by **1000 x images-per-gpu**. Here we have 2 images per GPU, therefore we set the number as 1000 x 2 = 2000. If we have 8 images per GPU, the value should be set as 8000. Note that this does not apply if `MODEL.RPN.FPN_POST_NMS_PER_BATCH` is set to `False` during training. See [#672](https://github.com/facebookresearch/maskrcnn-benchmark/issues/672) for more details. ### Multi-GPU training We use internally `torch.distributed.launch` in order to launch multi-gpu training. This utility function from PyTorch spawns as many Python processes as the number of GPUs we want to use, and each Python process will only use a single GPU. ```bash export NGPUS=8 python -m torch.distributed.launch --nproc_per_node=$NGPUS /path_to_maskrcnn_benchmark/tools/train_net.py --config-file "path/to/config/file.yaml" MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN images_per_gpu x 1000 ``` Note we should set `MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN` follow the rule in Single-GPU training. ### Mixed precision training We currently use [APEX](https://github.com/NVIDIA/apex) to add [Automatic Mixed Precision](https://developer.nvidia.com/automatic-mixed-precision) support. To enable, just do Single-GPU or Multi-GPU training and set `DTYPE "float16"`. ```bash export NGPUS=8 python -m torch.distributed.launch --nproc_per_node=$NGPUS /path_to_maskrcnn_benchmark/tools/train_net.py --config-file "path/to/config/file.yaml" MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN images_per_gpu x 1000 DTYPE "float16" ``` If you want more verbose logging, set `AMP_VERBOSE True`. See [Mixed Precision Training guide](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) for more details. ## Evaluation You can test your model directly on single or multiple gpus. Here is an example for Mask R-CNN R-50 FPN with the 1x schedule on 8 GPUS: ```bash export NGPUS=8 python -m torch.distributed.launch --nproc_per_node=$NGPUS /path_to_maskrcnn_benchmark/tools/test_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" TEST.IMS_PER_BATCH 16 ``` To calculate mAP for each class, you can simply modify a few lines in [coco_eval.py](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/datasets/evaluation/coco/coco_eval.py). See [#524](https://github.com/facebookresearch/maskrcnn-benchmark/issues/524#issuecomment-475118810) for more details. ## Abstractions For more information on some of the main abstractions in our implementation, see [ABSTRACTIONS.md](ABSTRACTIONS.md). ## Adding your own dataset This implementation adds support for COCO-style datasets. But adding support for training on a new dataset can be done as follows: ```python from maskrcnn_benchmark.structures.bounding_box import BoxList class MyDataset(object): def __init__(self, ...): # as you would do normally def __getitem__(self, idx): # load the image as a PIL Image image = ... # load the bounding boxes as a list of list of boxes # in this case, for illustrative purposes, we use # x1, y1, x2, y2 order. boxes = [[0, 0, 10, 10], [10, 20, 50, 50]] # and labels labels = torch.tensor([10, 20]) # create a BoxList from the boxes boxlist = BoxList(boxes, image.size, mode="xyxy") # add the labels to the boxlist boxlist.add_field("labels", labels) if self.transforms: image, boxlist = self.transforms(image, boxlist) # return the image, the boxlist and the idx in your dataset return image, boxlist, idx def get_img_info(self, idx): # get img_height and img_width. This is used if # we want to split the batches according to the aspect ratio # of the image, as it can be more efficient than loading the # image from disk return {"height": img_height, "width": img_width} ``` That's it. You can also add extra fields to the boxlist, such as segmentation masks (using `structures.segmentation_mask.SegmentationMask`), or even your own instance type. For a full example of how the `COCODataset` is implemented, check [`maskrcnn_benchmark/data/datasets/coco.py`](maskrcnn_benchmark/data/datasets/coco.py). Once you have created your dataset, it needs to be added in a couple of places: - [`maskrcnn_benchmark/data/datasets/__init__.py`](maskrcnn_benchmark/data/datasets/__init__.py): add it to `__all__` - [`maskrcnn_benchmark/config/paths_catalog.py`](maskrcnn_benchmark/config/paths_catalog.py): `DatasetCatalog.DATASETS` and corresponding `if` clause in `DatasetCatalog.get()` ### Testing While the aforementioned example should work for training, we leverage the cocoApi for computing the accuracies during testing. Thus, test datasets should currently follow the cocoApi for now. To enable your dataset for testing, add a corresponding if statement in [`maskrcnn_benchmark/data/datasets/evaluation/__init__.py`](maskrcnn_benchmark/data/datasets/evaluation/__init__.py): ```python if isinstance(dataset, datasets.MyDataset): return coco_evaluation(**args) ``` ## Finetuning from Detectron weights on custom datasets Create a script `tools/trim_detectron_model.py` like [here](https://gist.github.com/wangg12/aea194aa6ab6a4de088f14ee193fd968). You can decide which keys to be removed and which keys to be kept by modifying the script. Then you can simply point the converted model path in the config file by changing `MODEL.WEIGHT`. For further information, please refer to [#15](https://github.com/facebookresearch/maskrcnn-benchmark/issues/15). ## Troubleshooting If you have issues running or compiling this code, we have compiled a list of common issues in [TROUBLESHOOTING.md](TROUBLESHOOTING.md). If your issue is not present there, please feel free to open a new issue. ## Citations Please consider citing this project in your publications if it helps your research. The following is a BibTeX reference. The BibTeX entry requires the `url` LaTeX package. ``` @misc{massa2018mrcnn, author = {Massa, Francisco and Girshick, Ross}, title = {{maskrcnn-benchmark: Fast, modular reference implementation of Instance Segmentation and Object Detection algorithms in PyTorch}}, year = {2018}, howpublished = {\url{https://github.com/facebookresearch/maskrcnn-benchmark}}, note = {Accessed: [Insert date here]} } ``` ## Projects using maskrcnn-benchmark - [RetinaMask: Learning to predict masks improves state-of-the-art single-shot detection for free](https://arxiv.org/abs/1901.03353). Cheng-Yang Fu, Mykhailo Shvets, and Alexander C. Berg. Tech report, arXiv,1901.03353. - [FCOS: Fully Convolutional One-Stage Object Detection](https://arxiv.org/abs/1904.01355). Zhi Tian, Chunhua Shen, Hao Chen and Tong He. Tech report, arXiv,1904.01355. [[code](https://github.com/tianzhi0549/FCOS)] - [MULAN: Multitask Universal Lesion Analysis Network for Joint Lesion Detection, Tagging, and Segmentation](https://arxiv.org/abs/1908.04373). Ke Yan, Youbao Tang, Yifan Peng, Veit Sandfort, Mohammadhadi Bagheri, Zhiyong Lu, and Ronald M. Summers. MICCAI 2019. [[code](https://github.com/rsummers11/CADLab/tree/master/MULAN_universal_lesion_analysis)] - [Is Sampling Heuristics Necessary in Training Deep Object Detectors?](https://arxiv.org/abs/1909.04868) Joya Chen, Dong Liu, Tong Xu, Shilong Zhang, Shiwei Wu, Bin Luo, Xuezheng Peng, Enhong Chen. Tech report, arXiv,1909.04868. [[code](https://github.com/ChenJoya/sampling-free)] ## License maskrcnn-benchmark is released under the MIT license. See [LICENSE](LICENSE) for additional details. ================================================ FILE: TROUBLESHOOTING.md ================================================ # Troubleshooting Here is a compilation if common issues that you might face while compiling / running this code: ## Compilation errors when compiling the library If you encounter build errors like the following: ``` /usr/include/c++/6/type_traits:1558:8: note: provided for ‘template struct std::is_convertible’ struct is_convertible ^~~~~~~~~~~~~~ /usr/include/c++/6/tuple:502:1: error: body of constexpr function ‘static constexpr bool std::_TC<, _Elements>::_NonNestedTuple() [with _SrcTuple = std::tuple&&; bool = true; _Elements = {at::Tensor, at::Tensor, at::Tensor, at::Tensor}]’ not a return-statement } ^ error: command '/usr/local/cuda/bin/nvcc' failed with exit status 1 ``` check your CUDA version and your `gcc` version. ``` nvcc --version gcc --version ``` If you are using CUDA 9.0 and gcc 6.4.0, then refer to https://github.com/facebookresearch/maskrcnn-benchmark/issues/25, which has a summary of the solution. Basically, CUDA 9.0 is not compatible with gcc 6.4.0. ## ImportError: No module named maskrcnn_benchmark.config when running webcam.py This means that `maskrcnn-benchmark` has not been properly installed. Refer to https://github.com/facebookresearch/maskrcnn-benchmark/issues/22 for a few possible issues. Note that we now support Python 2 as well. ## ImportError: Undefined symbol: __cudaPopCallConfiguration error when import _C This probably means that the inconsistent version of NVCC compile and your conda CUDAToolKit package. This is firstly mentioned in https://github.com/facebookresearch/maskrcnn-benchmark/issues/45 . All you need to do is: ``` # Check the NVCC compile version(e.g.) /usr/cuda-9.2/bin/nvcc --version # Check the CUDAToolKit version(e.g.) ~/anaconda3/bin/conda list | grep cuda # If you need to update your CUDAToolKit ~/anaconda3/bin/conda install -c anaconda cudatoolkit==9.2 ``` Both of them should have the **same** version. For example, if NVCC==9.2 and CUDAToolKit==9.2, this will be fine while when NVCC==9.2 but CUDAToolKit==9, it fails. ## Segmentation fault (core dumped) when running the library This probably means that you have compiled the library using GCC < 4.9, which is ABI incompatible with PyTorch. Indeed, during installation, you probably saw a message like ``` Your compiler (g++ 4.8) may be ABI-incompatible with PyTorch! Please use a compiler that is ABI-compatible with GCC 4.9 and above. See https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html. See https://gist.github.com/goldsborough/d466f43e8ffc948ff92de7486c5216d6 for instructions on how to install GCC 4.9 or higher. ``` Follow the instructions on https://gist.github.com/goldsborough/d466f43e8ffc948ff92de7486c5216d6 to install GCC 4.9 or higher, and try recompiling `maskrcnn-benchmark` again, after cleaning the `build` folder with ``` rm -rf build ``` ================================================ FILE: configs/caffe2/e2e_faster_rcnn_R_101_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35857890/e2e_faster_rcnn_R-101-FPN_1x" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35857197/e2e_faster_rcnn_R-50-C4_1x" DATASETS: TEST: ("coco_2014_minival",) ================================================ FILE: configs/caffe2/e2e_faster_rcnn_R_50_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35857345/e2e_faster_rcnn_R-50-FPN_1x" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_faster_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_keypoint_rcnn_R_50_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/37697547/e2e_keypoint_rcnn_R-50-FPN_1x" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" NUM_CLASSES: 2 ROI_KEYPOINT_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "KeypointRCNNFeatureExtractor" PREDICTOR: "KeypointRCNNPredictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 56 SHARE_BOX_FEATURE_EXTRACTOR: False KEYPOINT_ON: True DATASETS: TRAIN: ("keypoints_coco_2014_train", "keypoints_coco_2014_valminusminival",) TEST: ("keypoints_coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35861795/e2e_mask_rcnn_R-101-FPN_1x" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_mask_rcnn_R_50_C4_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35858791/e2e_mask_rcnn_R-50-C4_1x" ROI_MASK_HEAD: PREDICTOR: "MaskRCNNC4Predictor" SHARE_BOX_FEATURE_EXTRACTOR: True MASK_ON: True DATASETS: TEST: ("coco_2014_minival",) ================================================ FILE: configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35858933/e2e_mask_rcnn_R-50-FPN_1x" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/37129812/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x" BACKBONE: CONV_BODY: "R-152-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_mask_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/cityscapes/README.md ================================================ ### Paper 1 [mask-rcnn](https://arxiv.org/pdf/1703.06870.pdf) ### dataset 1 [cityscapesScripts](https://github.com/mcordts/cityscapesScripts) ### Performance (from paper) | case | training data | im/gpu | mask AP[val] | mask AP [test] | mask AP50 [test] | |--------------|:-------------:|:------:|:------------:|:--------------:|-----------------:| | R-50-FPN | fine | 8/8 | 31.5 | 26.2 | 49.9 | | R-50-FPN | fine + COCO | 8/8 | 36.4 | 32.0 | 58.1 | ### Note (from paper) We apply our Mask R-CNN models with the ResNet-FPN-50 backbone; we found the 101-layer counterpart performs similarly due to the small dataset size. We train with image scale (shorter side) randomly sampled from [800, 1024], which reduces overfitting; inference is on a single scale of 1024 pixels. We use a mini-batch size of 1 image per GPU (so 8 on 8 GPUs) and train the model for 24k iterations, starting from a learning rate of 0.01 and reducing it to 0.001 at 18k iterations. It takes ∼4 hours of training on a single 8-GPU machine under this setting. ### Implemetation (for finetuning from coco trained model) Step 1: download trained model on coco dataset from [model zoo](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_FPN_1x.pth) Step 2: do the model surgery on the trained model as below and use it as `pretrained model` for finetuning: ```python def clip_weights_from_pretrain_of_coco_to_cityscapes(f, out_file): """""" # COCO categories for pretty print COCO_CATEGORIES = [ "__background__", "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", ] # Cityscapes of fine categories for pretty print CITYSCAPES_FINE_CATEGORIES = [ "__background__", "person", "rider", "car", "truck", "bus", "train", "motorcycle", "bicycle", ] coco_cats = COCO_CATEGORIES cityscapes_cats = CITYSCAPES_FINE_CATEGORIES coco_cats_to_inds = dict(zip(coco_cats, range(len(coco_cats)))) cityscapes_cats_to_inds = dict( zip(cityscapes_cats, range(len(cityscapes_cats))) ) checkpoint = torch.load(f) m = checkpoint['model'] weight_names = { "cls_score": "module.roi_heads.box.predictor.cls_score.weight", "bbox_pred": "module.roi_heads.box.predictor.bbox_pred.weight", "mask_fcn_logits": "module.roi_heads.mask.predictor.mask_fcn_logits.weight", } bias_names = { "cls_score": "module.roi_heads.box.predictor.cls_score.bias", "bbox_pred": "module.roi_heads.box.predictor.bbox_pred.bias", "mask_fcn_logits": "module.roi_heads.mask.predictor.mask_fcn_logits.bias", } representation_size = m[weight_names["cls_score"]].size(1) cls_score = nn.Linear(representation_size, len(cityscapes_cats)) nn.init.normal_(cls_score.weight, std=0.01) nn.init.constant_(cls_score.bias, 0) representation_size = m[weight_names["bbox_pred"]].size(1) class_agnostic = m[weight_names["bbox_pred"]].size(0) != len(coco_cats) * 4 num_bbox_reg_classes = 2 if class_agnostic else len(cityscapes_cats) bbox_pred = nn.Linear(representation_size, num_bbox_reg_classes * 4) nn.init.normal_(bbox_pred.weight, std=0.001) nn.init.constant_(bbox_pred.bias, 0) dim_reduced = m[weight_names["mask_fcn_logits"]].size(1) mask_fcn_logits = Conv2d(dim_reduced, len(cityscapes_cats), 1, 1, 0) nn.init.constant_(mask_fcn_logits.bias, 0) nn.init.kaiming_normal_( mask_fcn_logits.weight, mode="fan_out", nonlinearity="relu" ) def _copy_weight(src_weight, dst_weight): for ix, cat in enumerate(cityscapes_cats): if cat not in coco_cats: continue jx = coco_cats_to_inds[cat] dst_weight[ix] = src_weight[jx] return dst_weight def _copy_bias(src_bias, dst_bias, class_agnostic=False): if class_agnostic: return dst_bias return _copy_weight(src_bias, dst_bias) m[weight_names["cls_score"]] = _copy_weight( m[weight_names["cls_score"]], cls_score.weight ) m[weight_names["bbox_pred"]] = _copy_weight( m[weight_names["bbox_pred"]], bbox_pred.weight ) m[weight_names["mask_fcn_logits"]] = _copy_weight( m[weight_names["mask_fcn_logits"]], mask_fcn_logits.weight ) m[bias_names["cls_score"]] = _copy_bias( m[bias_names["cls_score"]], cls_score.bias ) m[bias_names["bbox_pred"]] = _copy_bias( m[bias_names["bbox_pred"]], bbox_pred.bias, class_agnostic ) m[bias_names["mask_fcn_logits"]] = _copy_bias( m[bias_names["mask_fcn_logits"]], mask_fcn_logits.bias ) print("f: {}\nout_file: {}".format(f, out_file)) torch.save(m, out_file) ``` Step 3: modify the `input&weight&solver` configuration in the `yaml` file, like this: ``` MODEL: WEIGHT: "xxx.pth" # the model u save from above code INPUT: MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024, 1024) MAX_SIZE_TRAIN: 2048 MIN_SIZE_TEST: 1024 MAX_SIZE_TEST: 2048 SOLVER: BASE_LR: 0.01 IMS_PER_BATCH: 8 WEIGHT_DECAY: 0.0001 STEPS: (3000,) MAX_ITER: 4000 ``` Step 4: train the model. ================================================ FILE: configs/cityscapes/e2e_faster_rcnn_R_50_FPN_1x_cocostyle.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" NUM_CLASSES: 9 DATASETS: TRAIN: ("cityscapes_fine_instanceonly_seg_train_cocostyle",) TEST: ("cityscapes_fine_instanceonly_seg_val_cocostyle",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (18000,) MAX_ITER: 24000 ================================================ FILE: configs/cityscapes/e2e_mask_rcnn_R_50_FPN_1x_binarymask.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" NUM_CLASSES: 9 ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("cityscapes_mask_instance_train",) TEST: ("cityscapes_mask_instance_val",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (18000,) MAX_ITER: 24000 OUTPUT_DIR: "runs/cityscapes_mask" ================================================ FILE: configs/cityscapes/e2e_mask_rcnn_R_50_FPN_1x_cocostyle.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" NUM_CLASSES: 9 ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("cityscapes_fine_instanceonly_seg_train_cocostyle",) TEST: ("cityscapes_fine_instanceonly_seg_val_cocostyle",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (18000,) MAX_ITER: 24000 ================================================ FILE: configs/cityscapes/e2e_mask_rcnn_R_50_FPN_1x_poly.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" NUM_CLASSES: 11 ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True INPUT: MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024, 1024) MAX_SIZE_TRAIN: 2048 MIN_SIZE_TEST: 1024 MAX_SIZE_TEST: 2048 DATASETS: TRAIN: ("cityscapes_poly_instance_train",) TEST: ("cityscapes_poly_instance_val",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: IMS_PER_BATCH: 8 BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (18000,) MAX_ITER: 24000 OUTPUT_DIR: "runs/cityscapes_poly" ================================================ FILE: configs/dcn/README.md ================================================ ### Reference 1 [Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/pdf/1811.11168.pdf) 2 third-party: [mmdetection](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn) ### Performance | case | bbox AP | mask AP | |----------------------------:|--------:|:-------:| | R-50-FPN-dcn (implement) | 39.8 | - | | R-50-FPN-dcn (mmdetection) | 40.0 | - | | R-50-FPN-mdcn (implement) | 40.0 | - | | R-50-FPN-mdcn (mmdetection) | 40.3 | - | | R-50-FPN-dcn (implement) | 40.8 | 36.8 | | R-50-FPN-dcn (mmdetection) | 41.1 | 37.2 | | R-50-FPN-dcn (implement) | 40.7 | 36.7 | | R-50-FPN-dcn (mmdetection) | 41.4 | 37.4 | ### Note see [dcn-v2](https://github.com/open-mmlab/mmdetection/blob/master/MODEL_ZOO.md#deformable-convolution-v2) in `mmdetection` for more details. ### Usage add these three lines ``` MODEL: RESNETS: # corresponding to C2,C3,C4,C5 STAGE_WITH_DCN: (False, True, True, True) WITH_MODULATED_DCN: True DEFORMABLE_GROUPS: 1 ``` ================================================ FILE: configs/dcn/e2e_faster_rcnn_dconv_R_50_FPN_1x.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STAGE_WITH_DCN: (False, True, True, True) WITH_MODULATED_DCN: False DEFORMABLE_GROUPS: 1 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/dcn/e2e_faster_rcnn_mdconv_R_50_FPN_1x.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STAGE_WITH_DCN: (False, True, True, True) WITH_MODULATED_DCN: True DEFORMABLE_GROUPS: 1 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/dcn/e2e_mask_rcnn_dconv_R_50_FPN_1x.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STAGE_WITH_DCN: (False, True, True, True) WITH_MODULATED_DCN: False DEFORMABLE_GROUPS: 1 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/dcn/e2e_mask_rcnn_mdconv_R_50_FPN_1x.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STAGE_WITH_DCN: (False, True, True, True) WITH_MODULATED_DCN: True DEFORMABLE_GROUPS: 1 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/e2e_faster_rcnn_R_101_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/e2e_faster_rcnn_R_50_C4_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN: PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TEST: 1000 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (120000, 160000) MAX_ITER: 180000 IMS_PER_BATCH: 8 ================================================ FILE: configs/e2e_faster_rcnn_R_50_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/e2e_faster_rcnn_X_101_32x8d_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" BACKBONE: CONV_BODY: "R-101-FPN" RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (120000, 160000) MAX_ITER: 180000 IMS_PER_BATCH: 8 ================================================ FILE: configs/e2e_faster_rcnn_fbnet.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" BACKBONE: CONV_BODY: FBNet FBNET: ARCH: "default" BN_TYPE: "bn" WIDTH_DIVISOR: 8 DW_CONV_SKIP_BN: True DW_CONV_SKIP_RELU: True RPN: ANCHOR_SIZES: (16, 32, 64, 128, 256) ANCHOR_STRIDE: (16, ) BATCH_SIZE_PER_IMAGE: 256 PRE_NMS_TOP_N_TRAIN: 6000 PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TRAIN: 2000 POST_NMS_TOP_N_TEST: 100 RPN_HEAD: FBNet.rpn_head ROI_HEADS: BATCH_SIZE_PER_IMAGE: 512 ROI_BOX_HEAD: POOLER_RESOLUTION: 6 FEATURE_EXTRACTOR: FBNet.roi_head NUM_CLASSES: 81 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.06 WARMUP_FACTOR: 0.1 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 128 # for 8GPUs # TEST: # IMS_PER_BATCH: 8 INPUT: MIN_SIZE_TRAIN: (320, ) MAX_SIZE_TRAIN: 640 MIN_SIZE_TEST: 320 MAX_SIZE_TEST: 640 PIXEL_MEAN: [103.53, 116.28, 123.675] PIXEL_STD: [57.375, 57.12, 58.395] ================================================ FILE: configs/e2e_faster_rcnn_fbnet_600.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" BACKBONE: CONV_BODY: FBNet FBNET: ARCH: "default" BN_TYPE: "bn" WIDTH_DIVISOR: 8 DW_CONV_SKIP_BN: True DW_CONV_SKIP_RELU: True RPN: ANCHOR_SIZES: (32, 64, 128, 256, 512) ANCHOR_STRIDE: (16, ) BATCH_SIZE_PER_IMAGE: 256 PRE_NMS_TOP_N_TRAIN: 6000 PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TRAIN: 2000 POST_NMS_TOP_N_TEST: 200 RPN_HEAD: FBNet.rpn_head ROI_HEADS: BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 6 FEATURE_EXTRACTOR: FBNet.roi_head NUM_CLASSES: 81 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.06 WARMUP_FACTOR: 0.1 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 128 # for 8GPUs # TEST: # IMS_PER_BATCH: 8 INPUT: MIN_SIZE_TRAIN: (600, ) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 600 MAX_SIZE_TEST: 1000 PIXEL_MEAN: [103.53, 116.28, 123.675] PIXEL_STD: [57.375, 57.12, 58.395] ================================================ FILE: configs/e2e_faster_rcnn_fbnet_chamv1a_600.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" BACKBONE: CONV_BODY: FBNet FBNET: ARCH: "cham_v1a" BN_TYPE: "bn" WIDTH_DIVISOR: 8 DW_CONV_SKIP_BN: True DW_CONV_SKIP_RELU: True RPN: ANCHOR_SIZES: (32, 64, 128, 256, 512) ANCHOR_STRIDE: (16, ) BATCH_SIZE_PER_IMAGE: 256 PRE_NMS_TOP_N_TRAIN: 6000 PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TRAIN: 2000 POST_NMS_TOP_N_TEST: 200 RPN_HEAD: FBNet.rpn_head ROI_HEADS: BATCH_SIZE_PER_IMAGE: 128 ROI_BOX_HEAD: POOLER_RESOLUTION: 6 FEATURE_EXTRACTOR: FBNet.roi_head NUM_CLASSES: 81 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.045 WARMUP_FACTOR: 0.1 WEIGHT_DECAY: 0.0001 STEPS: (90000, 120000) MAX_ITER: 135000 IMS_PER_BATCH: 96 # for 8GPUs # TEST: # IMS_PER_BATCH: 8 INPUT: MIN_SIZE_TRAIN: (600, ) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 600 MAX_SIZE_TEST: 1000 PIXEL_MEAN: [103.53, 116.28, 123.675] PIXEL_STD: [57.375, 57.12, 58.395] ================================================ FILE: configs/e2e_keypoint_rcnn_R_50_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" NUM_CLASSES: 2 ROI_KEYPOINT_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "KeypointRCNNFeatureExtractor" PREDICTOR: "KeypointRCNNPredictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 56 SHARE_BOX_FEATURE_EXTRACTOR: False KEYPOINT_ON: True DATASETS: TRAIN: ("keypoints_coco_2014_train", "keypoints_coco_2014_valminusminival",) TEST: ("keypoints_coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/e2e_mask_rcnn_R_101_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/e2e_mask_rcnn_R_50_C4_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN: PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TEST: 1000 ROI_MASK_HEAD: PREDICTOR: "MaskRCNNC4Predictor" SHARE_BOX_FEATURE_EXTRACTOR: True MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (120000, 160000) MAX_ITER: 180000 IMS_PER_BATCH: 8 ================================================ FILE: configs/e2e_mask_rcnn_R_50_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/e2e_mask_rcnn_R_50_FPN_1x_periodically_testing.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 TEST_PERIOD: 2500 ================================================ FILE: configs/e2e_mask_rcnn_X_101_32x8d_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (120000, 160000) MAX_ITER: 180000 IMS_PER_BATCH: 8 ================================================ FILE: configs/e2e_mask_rcnn_fbnet.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" BACKBONE: CONV_BODY: FBNet FBNET: ARCH: "default" BN_TYPE: "bn" WIDTH_DIVISOR: 8 DW_CONV_SKIP_BN: True DW_CONV_SKIP_RELU: True DET_HEAD_LAST_SCALE: 0.0 RPN: ANCHOR_SIZES: (16, 32, 64, 128, 256) ANCHOR_STRIDE: (16, ) BATCH_SIZE_PER_IMAGE: 256 PRE_NMS_TOP_N_TRAIN: 6000 PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TRAIN: 2000 POST_NMS_TOP_N_TEST: 100 RPN_HEAD: FBNet.rpn_head ROI_HEADS: BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 6 FEATURE_EXTRACTOR: FBNet.roi_head NUM_CLASSES: 81 ROI_MASK_HEAD: POOLER_RESOLUTION: 6 FEATURE_EXTRACTOR: FBNet.roi_head_mask PREDICTOR: "MaskRCNNConv1x1Predictor" RESOLUTION: 12 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.06 WARMUP_FACTOR: 0.1 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 128 # for 8GPUs # TEST: # IMS_PER_BATCH: 8 INPUT: MIN_SIZE_TRAIN: (320, ) MAX_SIZE_TRAIN: 640 MIN_SIZE_TEST: 320 MAX_SIZE_TEST: 640 PIXEL_MEAN: [103.53, 116.28, 123.675] PIXEL_STD: [57.375, 57.12, 58.395] ================================================ FILE: configs/e2e_mask_rcnn_fbnet_600.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" BACKBONE: CONV_BODY: FBNet FBNET: ARCH: "default" BN_TYPE: "bn" WIDTH_DIVISOR: 8 DW_CONV_SKIP_BN: True DW_CONV_SKIP_RELU: True DET_HEAD_LAST_SCALE: 0.0 RPN: ANCHOR_SIZES: (32, 64, 128, 256, 512) ANCHOR_STRIDE: (16, ) BATCH_SIZE_PER_IMAGE: 256 PRE_NMS_TOP_N_TRAIN: 6000 PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TRAIN: 2000 POST_NMS_TOP_N_TEST: 200 RPN_HEAD: FBNet.rpn_head ROI_HEADS: BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 6 FEATURE_EXTRACTOR: FBNet.roi_head NUM_CLASSES: 81 ROI_MASK_HEAD: POOLER_RESOLUTION: 6 FEATURE_EXTRACTOR: FBNet.roi_head_mask PREDICTOR: "MaskRCNNConv1x1Predictor" RESOLUTION: 12 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.06 WARMUP_FACTOR: 0.1 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 128 # for 8GPUs # TEST: # IMS_PER_BATCH: 8 INPUT: MIN_SIZE_TRAIN: (600, ) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 600 MAX_SIZE_TEST: 1000 PIXEL_MEAN: [103.53, 116.28, 123.675] PIXEL_STD: [57.375, 57.12, 58.395] ================================================ FILE: configs/e2e_mask_rcnn_fbnet_xirb16d_dsmask.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" BACKBONE: CONV_BODY: FBNet FBNET: ARCH: "xirb16d_dsmask" BN_TYPE: "bn" WIDTH_DIVISOR: 8 DW_CONV_SKIP_BN: True DW_CONV_SKIP_RELU: True DET_HEAD_LAST_SCALE: -1.0 RPN: ANCHOR_SIZES: (16, 32, 64, 128, 256) ANCHOR_STRIDE: (16, ) BATCH_SIZE_PER_IMAGE: 256 PRE_NMS_TOP_N_TRAIN: 6000 PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TRAIN: 2000 POST_NMS_TOP_N_TEST: 100 RPN_HEAD: FBNet.rpn_head ROI_HEADS: BATCH_SIZE_PER_IMAGE: 512 ROI_BOX_HEAD: POOLER_RESOLUTION: 6 FEATURE_EXTRACTOR: FBNet.roi_head NUM_CLASSES: 81 ROI_MASK_HEAD: POOLER_RESOLUTION: 6 FEATURE_EXTRACTOR: FBNet.roi_head_mask PREDICTOR: "MaskRCNNConv1x1Predictor" RESOLUTION: 12 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.06 WARMUP_FACTOR: 0.1 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 128 # for 8GPUs # TEST: # IMS_PER_BATCH: 8 INPUT: MIN_SIZE_TRAIN: (320, ) MAX_SIZE_TRAIN: 640 MIN_SIZE_TEST: 320 MAX_SIZE_TEST: 640 PIXEL_MEAN: [103.53, 116.28, 123.675] PIXEL_STD: [57.375, 57.12, 58.395] ================================================ FILE: configs/e2e_mask_rcnn_fbnet_xirb16d_dsmask_600.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" BACKBONE: CONV_BODY: FBNet FBNET: ARCH: "xirb16d_dsmask" BN_TYPE: "bn" WIDTH_DIVISOR: 8 DW_CONV_SKIP_BN: True DW_CONV_SKIP_RELU: True DET_HEAD_LAST_SCALE: 0.0 RPN: ANCHOR_SIZES: (32, 64, 128, 256, 512) ANCHOR_STRIDE: (16, ) BATCH_SIZE_PER_IMAGE: 256 PRE_NMS_TOP_N_TRAIN: 6000 PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TRAIN: 2000 POST_NMS_TOP_N_TEST: 200 RPN_HEAD: FBNet.rpn_head ROI_HEADS: BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 6 FEATURE_EXTRACTOR: FBNet.roi_head NUM_CLASSES: 81 ROI_MASK_HEAD: POOLER_RESOLUTION: 6 FEATURE_EXTRACTOR: FBNet.roi_head_mask PREDICTOR: "MaskRCNNConv1x1Predictor" RESOLUTION: 12 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.06 WARMUP_FACTOR: 0.1 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 128 # for 8GPUs # TEST: # IMS_PER_BATCH: 8 INPUT: MIN_SIZE_TRAIN: (600, ) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 600 MAX_SIZE_TEST: 1000 PIXEL_MEAN: [103.53, 116.28, 123.675] PIXEL_STD: [57.375, 57.12, 58.395] ================================================ FILE: configs/gn_baselines/README.md ================================================ ### Group Normalization 1 [Group Normalization](https://arxiv.org/abs/1803.08494) 2 [Rethinking ImageNet Pre-training](https://arxiv.org/abs/1811.08883) 3 [official code](https://github.com/facebookresearch/Detectron/blob/master/projects/GN/README.md) ### Performance | case | Type | lr schd | im/gpu | bbox AP | mask AP | |----------------------------|:------------:|:---------:|:-------:|:-------:|:-------:| | R-50-FPN, GN (paper) | finetune | 2x | 2 | 40.3 | 35.7 | | R-50-FPN, GN (implement) | finetune | 2x | 2 | 40.2 | 36.0 | | R-50-FPN, GN (paper) | from scratch | 3x | 2 | 39.5 | 35.2 | | R-50-FPN, GN (implement) | from scratch | 3x | 2 | 38.9 | 35.1 | ================================================ FILE: configs/gn_baselines/e2e_faster_rcnn_R_50_FPN_1x_gn.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50-GN" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: # use GN for backbone BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False TRANS_FUNC: "BottleneckWithGN" STEM_FUNC: "StemWithGN" FPN: USE_GN: True # use GN for FPN RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 512 POSITIVE_FRACTION: 0.25 ROI_BOX_HEAD: USE_GN: True # use GN for bbox head POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/gn_baselines/e2e_faster_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50-GN" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: # use GN for backbone BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False TRANS_FUNC: "BottleneckWithGN" STEM_FUNC: "StemWithGN" FPN: USE_GN: True # use GN for FPN RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 512 POSITIVE_FRACTION: 0.25 ROI_BOX_HEAD: USE_GN: True # use GN for bbox head POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 CONV_HEAD_DIM: 256 NUM_STACKED_CONVS: 4 FEATURE_EXTRACTOR: "FPNXconv1fcFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/gn_baselines/e2e_mask_rcnn_R_50_FPN_1x_gn.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50-GN" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: # use GN for backbone BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False TRANS_FUNC: "BottleneckWithGN" STEM_FUNC: "StemWithGN" FPN: USE_GN: True # use GN for FPN RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 512 POSITIVE_FRACTION: 0.25 ROI_BOX_HEAD: USE_GN: True # use GN for bbox head POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: USE_GN: True # use GN for mask head POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) CONV_LAYERS: (256, 256, 256, 256) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/gn_baselines/e2e_mask_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50-GN" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: # use GN for backbone BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False TRANS_FUNC: "BottleneckWithGN" STEM_FUNC: "StemWithGN" FPN: USE_GN: True # use GN for FPN RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 512 POSITIVE_FRACTION: 0.25 ROI_BOX_HEAD: USE_GN: True # use GN for bbox head POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 CONV_HEAD_DIM: 256 NUM_STACKED_CONVS: 4 FEATURE_EXTRACTOR: "FPNXconv1fcFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: USE_GN: True # use GN for mask head POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) CONV_LAYERS: (256, 256, 256, 256) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/gn_baselines/scratch_e2e_faster_rcnn_R_50_FPN_3x_gn.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "" # no pretrained model BACKBONE: CONV_BODY: "R-50-FPN" FREEZE_CONV_BODY_AT: 0 # finetune all layers RESNETS: # use GN for backbone BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False TRANS_FUNC: "BottleneckWithGN" STEM_FUNC: "StemWithGN" FPN: USE_GN: True # use GN for FPN RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 512 POSITIVE_FRACTION: 0.25 ROI_BOX_HEAD: USE_GN: True # use GN for bbox head POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (210000, 250000) MAX_ITER: 270000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/gn_baselines/scratch_e2e_faster_rcnn_R_50_FPN_Xconv1fc_3x_gn.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "" # no pretrained model BACKBONE: CONV_BODY: "R-50-FPN" FREEZE_CONV_BODY_AT: 0 # finetune all layers RESNETS: # use GN for backbone BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False TRANS_FUNC: "BottleneckWithGN" STEM_FUNC: "StemWithGN" FPN: USE_GN: True # use GN for FPN RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 512 POSITIVE_FRACTION: 0.25 ROI_BOX_HEAD: USE_GN: True # use GN for bbox head POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 CONV_HEAD_DIM: 256 NUM_STACKED_CONVS: 4 FEATURE_EXTRACTOR: "FPNXconv1fcFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (210000, 250000) MAX_ITER: 270000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/gn_baselines/scratch_e2e_mask_rcnn_R_50_FPN_3x_gn.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "" # no pretrained model BACKBONE: CONV_BODY: "R-50-FPN" FREEZE_CONV_BODY_AT: 0 # finetune all layers RESNETS: # use GN for backbone BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False TRANS_FUNC: "BottleneckWithGN" STEM_FUNC: "StemWithGN" FPN: USE_GN: True # use GN for FPN RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 512 POSITIVE_FRACTION: 0.25 ROI_BOX_HEAD: USE_GN: True # use GN for bbox head POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: USE_GN: True # use GN for mask head POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) CONV_LAYERS: (256, 256, 256, 256) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (210000, 250000) MAX_ITER: 270000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/gn_baselines/scratch_e2e_mask_rcnn_R_50_FPN_Xconv1fc_3x_gn.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "" # no pretrained model BACKBONE: CONV_BODY: "R-50-FPN" FREEZE_CONV_BODY_AT: 0 # finetune all layers RESNETS: # use GN for backbone BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False TRANS_FUNC: "BottleneckWithGN" STEM_FUNC: "StemWithGN" FPN: USE_GN: True # use GN for FPN RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 512 POSITIVE_FRACTION: 0.25 ROI_BOX_HEAD: USE_GN: True # use GN for bbox head POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 CONV_HEAD_DIM: 256 NUM_STACKED_CONVS: 4 FEATURE_EXTRACTOR: "FPNXconv1fcFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: USE_GN: True # use GN for mask head POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) CONV_LAYERS: (256, 256, 256, 256) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (210000, 250000) MAX_ITER: 270000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/pascal_voc/e2e_faster_rcnn_R_50_C4_1x_1_gpu_voc.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN: PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TEST: 300 ANCHOR_SIZES: (128, 256, 512) ROI_BOX_HEAD: NUM_CLASSES: 21 DATASETS: TRAIN: ("voc_2007_train", "voc_2007_val") TEST: ("voc_2007_test",) SOLVER: BASE_LR: 0.001 WEIGHT_DECAY: 0.0001 STEPS: (50000, ) MAX_ITER: 70000 IMS_PER_BATCH: 1 TEST: IMS_PER_BATCH: 1 ================================================ FILE: configs/pascal_voc/e2e_faster_rcnn_R_50_C4_1x_4_gpu_voc.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN: PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TEST: 300 ANCHOR_SIZES: (128, 256, 512) ROI_BOX_HEAD: NUM_CLASSES: 21 DATASETS: TRAIN: ("voc_2007_train", "voc_2007_val") TEST: ("voc_2007_test",) SOLVER: BASE_LR: 0.004 WEIGHT_DECAY: 0.0001 STEPS: (12500, ) MAX_ITER: 17500 IMS_PER_BATCH: 4 TEST: IMS_PER_BATCH: 4 ================================================ FILE: configs/pascal_voc/e2e_mask_rcnn_R_50_FPN_1x_cocostyle.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" NUM_CLASSES: 21 ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("voc_2012_train_cocostyle",) TEST: ("voc_2012_val_cocostyle",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (18000,) MAX_ITER: 24000 ================================================ FILE: configs/quick_schedules/e2e_faster_rcnn_R_50_C4_quick.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN: PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: BATCH_SIZE_PER_IMAGE: 256 DATASETS: TRAIN: ("coco_2014_minival",) TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (600,) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1000 SOLVER: BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (1500,) MAX_ITER: 2000 IMS_PER_BATCH: 2 TEST: IMS_PER_BATCH: 2 ================================================ FILE: configs/quick_schedules/e2e_faster_rcnn_R_50_FPN_quick.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_minival",) TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (600,) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1000 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (1500,) MAX_ITER: 2000 IMS_PER_BATCH: 4 TEST: IMS_PER_BATCH: 2 ================================================ FILE: configs/quick_schedules/e2e_faster_rcnn_X_101_32x8d_FPN_quick.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_minival",) TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (600,) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1000 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (1500,) MAX_ITER: 2000 IMS_PER_BATCH: 2 TEST: IMS_PER_BATCH: 2 ================================================ FILE: configs/quick_schedules/e2e_keypoint_rcnn_R_50_FPN_quick.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" NUM_CLASSES: 2 ROI_KEYPOINT_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "KeypointRCNNFeatureExtractor" PREDICTOR: "KeypointRCNNPredictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 56 SHARE_BOX_FEATURE_EXTRACTOR: False KEYPOINT_ON: True DATASETS: TRAIN: ("keypoints_coco_2014_minival",) TEST: ("keypoints_coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1000 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (1500,) MAX_ITER: 2000 IMS_PER_BATCH: 4 TEST: IMS_PER_BATCH: 2 ================================================ FILE: configs/quick_schedules/e2e_mask_rcnn_R_50_C4_quick.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN: PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: BATCH_SIZE_PER_IMAGE: 256 ROI_MASK_HEAD: PREDICTOR: "MaskRCNNC4Predictor" SHARE_BOX_FEATURE_EXTRACTOR: True MASK_ON: True DATASETS: TRAIN: ("coco_2014_minival",) TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (600,) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1000 SOLVER: BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (1500,) MAX_ITER: 2000 IMS_PER_BATCH: 4 TEST: IMS_PER_BATCH: 2 ================================================ FILE: configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_quick.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_minival",) TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (600,) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1000 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (1500,) MAX_ITER: 2000 IMS_PER_BATCH: 4 TEST: IMS_PER_BATCH: 2 ================================================ FILE: configs/quick_schedules/e2e_mask_rcnn_X_101_32x8d_FPN_quick.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_minival",) TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (600,) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1000 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (1500,) MAX_ITER: 2000 IMS_PER_BATCH: 2 TEST: IMS_PER_BATCH: 2 ================================================ FILE: configs/quick_schedules/rpn_R_50_C4_quick.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN_ONLY: True RPN: PRE_NMS_TOP_N_TEST: 12000 POST_NMS_TOP_N_TEST: 2000 DATASETS: TRAIN: ("coco_2014_minival",) TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (600,) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1000 SOLVER: BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (1500,) MAX_ITER: 2000 IMS_PER_BATCH: 4 TEST: IMS_PER_BATCH: 2 ================================================ FILE: configs/quick_schedules/rpn_R_50_FPN_quick.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN_ONLY: True BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 2000 FPN_POST_NMS_TOP_N_TEST: 2000 DATASETS: TRAIN: ("coco_2014_minival",) TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (600,) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1000 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (1500,) MAX_ITER: 2000 IMS_PER_BATCH: 4 TEST: IMS_PER_BATCH: 2 ================================================ FILE: configs/retinanet/retinanet_R-101-FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" RPN_ONLY: True RETINANET_ON: True BACKBONE: CONV_BODY: "R-101-FPN-RETINANET" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" RETINANET: SCALES_PER_OCTAVE: 3 STRADDLE_THRESH: -1 FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (800, ) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 4 gpus BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (120000, 160000) MAX_ITER: 180000 IMS_PER_BATCH: 8 ================================================ FILE: configs/retinanet/retinanet_R-101-FPN_P5_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" RPN_ONLY: True RETINANET_ON: True BACKBONE: CONV_BODY: "R-101-FPN-RETINANET" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" RETINANET: SCALES_PER_OCTAVE: 3 STRADDLE_THRESH: -1 USE_C5: False FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (800, ) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 4 gpus BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (120000, 160000) MAX_ITER: 180000 IMS_PER_BATCH: 8 ================================================ FILE: configs/retinanet/retinanet_R-50-FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN_ONLY: True RETINANET_ON: True BACKBONE: CONV_BODY: "R-50-FPN-RETINANET" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" RETINANET: SCALES_PER_OCTAVE: 3 STRADDLE_THRESH: -1 FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 4 gpus BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (120000, 160000) MAX_ITER: 180000 IMS_PER_BATCH: 8 ================================================ FILE: configs/retinanet/retinanet_R-50-FPN_1x_quick.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN_ONLY: True RETINANET_ON: True BACKBONE: CONV_BODY: "R-50-FPN-RETINANET" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" RETINANET: SCALES_PER_OCTAVE: 3 STRADDLE_THRESH: -1 FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 DATASETS: TRAIN: ("coco_2014_minival",) TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (600,) MAX_SIZE_TRAIN: 1000 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1000 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (3500,) MAX_ITER: 4000 IMS_PER_BATCH: 4 ================================================ FILE: configs/retinanet/retinanet_R-50-FPN_P5_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN_ONLY: True RETINANET_ON: True BACKBONE: CONV_BODY: "R-50-FPN-RETINANET" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" RETINANET: SCALES_PER_OCTAVE: 3 STRADDLE_THRESH: -1 USE_C5: False FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 4 gpus BASE_LR: 0.005 WEIGHT_DECAY: 0.0001 STEPS: (120000, 160000) MAX_ITER: 180000 IMS_PER_BATCH: 8 ================================================ FILE: configs/retinanet/retinanet_X_101_32x8d_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" RPN_ONLY: True RETINANET_ON: True BACKBONE: CONV_BODY: "R-101-FPN-RETINANET" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 256 ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" RETINANET: SCALES_PER_OCTAVE: 3 STRADDLE_THRESH: -1 FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (800, ) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 4 gpus BASE_LR: 0.0025 WEIGHT_DECAY: 0.0001 STEPS: (240000, 320000) MAX_ITER: 360000 IMS_PER_BATCH: 4 ================================================ FILE: configs/rpn_R_101_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" RPN_ONLY: True BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 2000 FPN_POST_NMS_TOP_N_TEST: 2000 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/rpn_R_50_C4_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN_ONLY: True RPN: PRE_NMS_TOP_N_TEST: 12000 POST_NMS_TOP_N_TEST: 2000 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/rpn_R_50_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN_ONLY: True BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 2000 FPN_POST_NMS_TOP_N_TEST: 2000 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/rpn_X_101_32x8d_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" RPN_ONLY: True BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 2000 FPN_POST_NMS_TOP_N_TEST: 2000 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/test_time_aug/e2e_mask_rcnn_R_50_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 TEST: BBOX_AUG: ENABLED: True H_FLIP: True SCALES: (400, 500, 600, 700, 900, 1000, 1100, 1200) MAX_SIZE: 2000 SCALE_H_FLIP: True ================================================ FILE: demo/README.md ================================================ ## Webcam and Jupyter notebook demo This folder contains a simple webcam demo that illustrates how you can use `maskrcnn_benchmark` for inference. ### With your preferred environment You can start it by running it from this folder, using one of the following commands: ```bash # by default, it runs on the GPU # for best results, use min-image-size 800 python webcam.py --min-image-size 800 # can also run it on the CPU python webcam.py --min-image-size 300 MODEL.DEVICE cpu # or change the model that you want to use python webcam.py --config-file ../configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml --min-image-size 300 MODEL.DEVICE cpu # in order to see the probability heatmaps, pass --show-mask-heatmaps python webcam.py --min-image-size 300 --show-mask-heatmaps MODEL.DEVICE cpu ``` ### With Docker Build the image with the tag `maskrcnn-benchmark` (check [INSTALL.md](../INSTALL.md) for instructions) Adjust permissions of the X server host (be careful with this step, refer to [here](http://wiki.ros.org/docker/Tutorials/GUI) for alternatives) ```bash xhost + ``` Then run a container with the demo: ``` docker run --rm -it \ -e DISPLAY=${DISPLAY} \ --privileged \ -v /tmp/.X11-unix:/tmp/.X11-unix \ --device=/dev/video0:/dev/video0 \ --ipc=host maskrcnn-benchmark \ python demo/webcam.py --min-image-size 300 \ --config-file configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml ``` **DISCLAIMER:** *This was tested for an Ubuntu 16.04 machine, the volume mapping may vary depending on your platform* ================================================ FILE: demo/predictor.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import cv2 import torch from torchvision import transforms as T from torchvision.transforms import functional as F from maskrcnn_benchmark.modeling.detector import build_detection_model from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer from maskrcnn_benchmark.structures.image_list import to_image_list from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker from maskrcnn_benchmark import layers as L from maskrcnn_benchmark.utils import cv2_util class Resize(object): def __init__(self, min_size, max_size): self.min_size = min_size self.max_size = max_size # modified from torchvision to add support for max size def get_size(self, image_size): w, h = image_size size = self.min_size max_size = self.max_size if max_size is not None: min_original_size = float(min((w, h))) max_original_size = float(max((w, h))) if max_original_size / min_original_size * size > max_size: size = int(round(max_size * min_original_size / max_original_size)) if (w <= h and w == size) or (h <= w and h == size): return (h, w) if w < h: ow = size oh = int(size * h / w) else: oh = size ow = int(size * w / h) return (oh, ow) def __call__(self, image): size = self.get_size(image.size) image = F.resize(image, size) return image class COCODemo(object): # COCO categories for pretty print CATEGORIES = [ "__background", "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", ] def __init__( self, cfg, confidence_threshold=0.7, show_mask_heatmaps=False, masks_per_dim=2, min_image_size=224, weight_loading = None ): self.cfg = cfg.clone() self.model = build_detection_model(cfg) self.model.eval() self.device = torch.device(cfg.MODEL.DEVICE) self.model.to(self.device) self.min_image_size = min_image_size save_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, self.model, save_dir=save_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) if weight_loading: print('Loading weight from {}.'.format(weight_loading)) _ = checkpointer._load_model(torch.load(weight_loading)) self.transforms = self.build_transform() mask_threshold = -1 if show_mask_heatmaps else 0.5 self.masker = Masker(threshold=mask_threshold, padding=1) # used to make colors for each class self.palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1]) self.cpu_device = torch.device("cpu") self.confidence_threshold = confidence_threshold self.show_mask_heatmaps = show_mask_heatmaps self.masks_per_dim = masks_per_dim def build_transform(self): """ Creates a basic transformation that was used to train the models """ cfg = self.cfg # we are loading images with OpenCV, so we don't need to convert them # to BGR, they are already! So all we need to do is to normalize # by 255 if we want to convert to BGR255 format, or flip the channels # if we want it to be in RGB in [0-1] range. if cfg.INPUT.TO_BGR255: to_bgr_transform = T.Lambda(lambda x: x * 255) else: to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]]) normalize_transform = T.Normalize( mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD ) min_size = cfg.INPUT.MIN_SIZE_TEST max_size = cfg.INPUT.MAX_SIZE_TEST transform = T.Compose( [ T.ToPILImage(), Resize(min_size, max_size), T.ToTensor(), to_bgr_transform, normalize_transform, ] ) return transform def run_on_opencv_image(self, image): """ Arguments: image (np.ndarray): an image as returned by OpenCV Returns: prediction (BoxList): the detected objects. Additional information of the detection properties can be found in the fields of the BoxList via `prediction.fields()` """ predictions = self.compute_prediction(image) top_predictions = self.select_top_predictions(predictions) result = image.copy() if self.show_mask_heatmaps: return self.create_mask_montage(result, top_predictions) result = self.overlay_boxes(result, top_predictions) if self.cfg.MODEL.MASK_ON: result = self.overlay_mask(result, top_predictions) if self.cfg.MODEL.KEYPOINT_ON: result = self.overlay_keypoints(result, top_predictions) result = self.overlay_class_names(result, top_predictions) return result def compute_prediction(self, original_image): """ Arguments: original_image (np.ndarray): an image as returned by OpenCV Returns: prediction (BoxList): the detected objects. Additional information of the detection properties can be found in the fields of the BoxList via `prediction.fields()` """ # apply pre-processing to image image = self.transforms(original_image) # convert to an ImageList, padded so that it is divisible by # cfg.DATALOADER.SIZE_DIVISIBILITY image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY) image_list = image_list.to(self.device) # compute predictions with torch.no_grad(): predictions = self.model(image_list) predictions = [o.to(self.cpu_device) for o in predictions] # always single image is passed at a time prediction = predictions[0] # reshape prediction (a BoxList) into the original image size height, width = original_image.shape[:-1] prediction = prediction.resize((width, height)) if prediction.has_field("mask"): # if we have masks, paste the masks in the right position # in the image, as defined by the bounding boxes masks = prediction.get_field("mask") # always single image is passed at a time masks = self.masker([masks], [prediction])[0] prediction.add_field("mask", masks) return prediction def select_top_predictions(self, predictions): """ Select only predictions which have a `score` > self.confidence_threshold, and returns the predictions in descending order of score Arguments: predictions (BoxList): the result of the computation by the model. It should contain the field `scores`. Returns: prediction (BoxList): the detected objects. Additional information of the detection properties can be found in the fields of the BoxList via `prediction.fields()` """ scores = predictions.get_field("scores") keep = torch.nonzero(scores > self.confidence_threshold).squeeze(1) predictions = predictions[keep] scores = predictions.get_field("scores") _, idx = scores.sort(0, descending=True) return predictions[idx] def compute_colors_for_labels(self, labels): """ Simple function that adds fixed colors depending on the class """ colors = labels[:, None] * self.palette colors = (colors % 255).numpy().astype("uint8") return colors def overlay_boxes(self, image, predictions): """ Adds the predicted boxes on top of the image Arguments: image (np.ndarray): an image as returned by OpenCV predictions (BoxList): the result of the computation by the model. It should contain the field `labels`. """ labels = predictions.get_field("labels") boxes = predictions.bbox colors = self.compute_colors_for_labels(labels).tolist() for box, color in zip(boxes, colors): box = box.to(torch.int64) top_left, bottom_right = box[:2].tolist(), box[2:].tolist() image = cv2.rectangle( image, tuple(top_left), tuple(bottom_right), tuple(color), 1 ) return image def overlay_mask(self, image, predictions): """ Adds the instances contours for each predicted object. Each label has a different color. Arguments: image (np.ndarray): an image as returned by OpenCV predictions (BoxList): the result of the computation by the model. It should contain the field `mask` and `labels`. """ masks = predictions.get_field("mask").numpy() labels = predictions.get_field("labels") colors = self.compute_colors_for_labels(labels).tolist() for mask, color in zip(masks, colors): thresh = mask[0, :, :, None].astype(np.uint8) contours, hierarchy = cv2_util.findContours( thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE ) image = cv2.drawContours(image, contours, -1, color, 3) composite = image return composite def overlay_keypoints(self, image, predictions): keypoints = predictions.get_field("keypoints") kps = keypoints.keypoints scores = keypoints.get_field("logits") kps = torch.cat((kps[:, :, 0:2], scores[:, :, None]), dim=2).numpy() for region in kps: image = vis_keypoints(image, region.transpose((1, 0))) return image def create_mask_montage(self, image, predictions): """ Create a montage showing the probability heatmaps for each one one of the detected objects Arguments: image (np.ndarray): an image as returned by OpenCV predictions (BoxList): the result of the computation by the model. It should contain the field `mask`. """ masks = predictions.get_field("mask") masks_per_dim = self.masks_per_dim masks = L.interpolate( masks.float(), scale_factor=1 / masks_per_dim ).byte() height, width = masks.shape[-2:] max_masks = masks_per_dim ** 2 masks = masks[:max_masks] # handle case where we have less detections than max_masks if len(masks) < max_masks: masks_padded = torch.zeros(max_masks, 1, height, width, dtype=torch.uint8) masks_padded[: len(masks)] = masks masks = masks_padded masks = masks.reshape(masks_per_dim, masks_per_dim, height, width) result = torch.zeros( (masks_per_dim * height, masks_per_dim * width), dtype=torch.uint8 ) for y in range(masks_per_dim): start_y = y * height end_y = (y + 1) * height for x in range(masks_per_dim): start_x = x * width end_x = (x + 1) * width result[start_y:end_y, start_x:end_x] = masks[y, x] return cv2.applyColorMap(result.numpy(), cv2.COLORMAP_JET) def overlay_class_names(self, image, predictions): """ Adds detected class names and scores in the positions defined by the top-left corner of the predicted bounding box Arguments: image (np.ndarray): an image as returned by OpenCV predictions (BoxList): the result of the computation by the model. It should contain the field `scores` and `labels`. """ scores = predictions.get_field("scores").tolist() labels = predictions.get_field("labels").tolist() labels = [self.CATEGORIES[i] for i in labels] boxes = predictions.bbox template = "{}: {:.2f}" for box, score, label in zip(boxes, scores, labels): x, y = box[:2] s = template.format(label, score) cv2.putText( image, s, (x, y), cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1 ) return image import numpy as np import matplotlib.pyplot as plt from maskrcnn_benchmark.structures.keypoint import PersonKeypoints def vis_keypoints(img, kps, kp_thresh=2, alpha=0.7): """Visualizes keypoints (adapted from vis_one_image). kps has shape (4, #keypoints) where 4 rows are (x, y, logit, prob). """ dataset_keypoints = PersonKeypoints.NAMES kp_lines = PersonKeypoints.CONNECTIONS # Convert from plt 0-1 RGBA colors to 0-255 BGR colors for opencv. cmap = plt.get_cmap('rainbow') colors = [cmap(i) for i in np.linspace(0, 1, len(kp_lines) + 2)] colors = [(c[2] * 255, c[1] * 255, c[0] * 255) for c in colors] # Perform the drawing on a copy of the image, to allow for blending. kp_mask = np.copy(img) # Draw mid shoulder / mid hip first for better visualization. mid_shoulder = ( kps[:2, dataset_keypoints.index('right_shoulder')] + kps[:2, dataset_keypoints.index('left_shoulder')]) / 2.0 sc_mid_shoulder = np.minimum( kps[2, dataset_keypoints.index('right_shoulder')], kps[2, dataset_keypoints.index('left_shoulder')]) mid_hip = ( kps[:2, dataset_keypoints.index('right_hip')] + kps[:2, dataset_keypoints.index('left_hip')]) / 2.0 sc_mid_hip = np.minimum( kps[2, dataset_keypoints.index('right_hip')], kps[2, dataset_keypoints.index('left_hip')]) nose_idx = dataset_keypoints.index('nose') if sc_mid_shoulder > kp_thresh and kps[2, nose_idx] > kp_thresh: cv2.line( kp_mask, tuple(mid_shoulder), tuple(kps[:2, nose_idx]), color=colors[len(kp_lines)], thickness=2, lineType=cv2.LINE_AA) if sc_mid_shoulder > kp_thresh and sc_mid_hip > kp_thresh: cv2.line( kp_mask, tuple(mid_shoulder), tuple(mid_hip), color=colors[len(kp_lines) + 1], thickness=2, lineType=cv2.LINE_AA) # Draw the keypoints. for l in range(len(kp_lines)): i1 = kp_lines[l][0] i2 = kp_lines[l][1] p1 = kps[0, i1], kps[1, i1] p2 = kps[0, i2], kps[1, i2] if kps[2, i1] > kp_thresh and kps[2, i2] > kp_thresh: cv2.line( kp_mask, p1, p2, color=colors[l], thickness=2, lineType=cv2.LINE_AA) if kps[2, i1] > kp_thresh: cv2.circle( kp_mask, p1, radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) if kps[2, i2] > kp_thresh: cv2.circle( kp_mask, p2, radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) # Blend the keypoints. return cv2.addWeighted(img, 1.0 - alpha, kp_mask, alpha, 0) ================================================ FILE: demo/webcam.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import argparse import cv2 from maskrcnn_benchmark.config import cfg from predictor import COCODemo import time def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Webcam Demo") parser.add_argument( "--config-file", default="../configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument( "--confidence-threshold", type=float, default=0.7, help="Minimum score for the prediction to be shown", ) parser.add_argument( "--min-image-size", type=int, default=224, help="Smallest size of the image to feed to the model. " "Model was trained with 800, which gives best results", ) parser.add_argument( "--show-mask-heatmaps", dest="show_mask_heatmaps", help="Show a heatmap probability for the top masks-per-dim masks", action="store_true", ) parser.add_argument( "--masks-per-dim", type=int, default=2, help="Number of heatmaps per dimension to show", ) parser.add_argument( "opts", help="Modify model config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() # load config from file and command-line arguments cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() # prepare object that handles inference plus adds predictions on top of image coco_demo = COCODemo( cfg, confidence_threshold=args.confidence_threshold, show_mask_heatmaps=args.show_mask_heatmaps, masks_per_dim=args.masks_per_dim, min_image_size=args.min_image_size, ) cam = cv2.VideoCapture(0) while True: start_time = time.time() ret_val, img = cam.read() composite = coco_demo.run_on_opencv_image(img) print("Time: {:.2f} s / img".format(time.time() - start_time)) cv2.imshow("COCO detections", composite) if cv2.waitKey(1) == 27: break # esc to quit cv2.destroyAllWindows() if __name__ == "__main__": main() ================================================ FILE: docker/Dockerfile ================================================ ARG CUDA="9.0" ARG CUDNN="7" FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu16.04 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections # install basics RUN apt-get update -y \ && apt-get install -y apt-utils git curl ca-certificates bzip2 cmake tree htop bmon iotop g++ \ && apt-get install -y libglib2.0-0 libsm6 libxext6 libxrender-dev # Install Miniconda RUN curl -so /miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ && chmod +x /miniconda.sh \ && /miniconda.sh -b -p /miniconda \ && rm /miniconda.sh ENV PATH=/miniconda/bin:$PATH # Create a Python 3.6 environment RUN /miniconda/bin/conda install -y conda-build \ && /miniconda/bin/conda create -y --name py36 python=3.6.7 \ && /miniconda/bin/conda clean -ya ENV CONDA_DEFAULT_ENV=py36 ENV CONDA_PREFIX=/miniconda/envs/$CONDA_DEFAULT_ENV ENV PATH=$CONDA_PREFIX/bin:$PATH ENV CONDA_AUTO_UPDATE_CONDA=false RUN conda install -y ipython RUN pip install requests ninja yacs cython matplotlib opencv-python tqdm # Install PyTorch 1.0 Nightly ARG CUDA RUN conda install pytorch-nightly cudatoolkit=${CUDA} -c pytorch \ && conda clean -ya # Install TorchVision master RUN git clone https://github.com/pytorch/vision.git \ && cd vision \ && python setup.py install # install pycocotools RUN git clone https://github.com/cocodataset/cocoapi.git \ && cd cocoapi/PythonAPI \ && python setup.py build_ext install # install apex RUN git clone https://github.com/NVIDIA/apex.git \ && cd apex \ && python setup.py install --cuda_ext --cpp_ext # install PyTorch Detection ARG FORCE_CUDA="1" ENV FORCE_CUDA=${FORCE_CUDA} RUN git clone https://github.com/facebookresearch/maskrcnn-benchmark.git \ && cd maskrcnn-benchmark \ && python setup.py build develop WORKDIR /maskrcnn-benchmark ================================================ FILE: docker/docker-jupyter/Dockerfile ================================================ ARG CUDA="9.0" ARG CUDNN="7" FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu16.04 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections # install basics RUN apt-get update -y \ && apt-get install -y apt-utils git curl ca-certificates bzip2 cmake tree htop bmon iotop g++ # Install Miniconda RUN curl -so /miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ && chmod +x /miniconda.sh \ && /miniconda.sh -b -p /miniconda \ && rm /miniconda.sh ENV PATH=/miniconda/bin:$PATH # Create a Python 3.6 environment RUN /miniconda/bin/conda install -y conda-build \ && /miniconda/bin/conda create -y --name py36 python=3.6.7 \ && /miniconda/bin/conda clean -ya ENV CONDA_DEFAULT_ENV=py36 ENV CONDA_PREFIX=/miniconda/envs/$CONDA_DEFAULT_ENV ENV PATH=$CONDA_PREFIX/bin:$PATH ENV CONDA_AUTO_UPDATE_CONDA=false RUN conda install -y ipython RUN pip install requests ninja yacs cython matplotlib jupyter tqdm # Install PyTorch Nightly ARG CUDA RUN conda install -y pytorch-nightly cudatoolkit=${CUDA} -c pytorch # Install OpenCV RUN conda install -y opencv -c menpo \ && conda clean -ya WORKDIR /root USER root RUN mkdir /notebooks WORKDIR /notebooks # Install TorchVision master RUN git clone https://github.com/pytorch/vision.git \ && cd vision \ && python setup.py install # install pycocotools RUN git clone https://github.com/cocodataset/cocoapi.git \ && cd cocoapi/PythonAPI \ && python setup.py build_ext install # install apex RUN git clone https://github.com/NVIDIA/apex.git \ && cd apex \ && python setup.py install --cuda_ext --cpp_ext # install PyTorch Detection ARG FORCE_CUDA="1" ENV FORCE_CUDA=${FORCE_CUDA} RUN git clone https://github.com/facebookresearch/maskrcnn-benchmark.git \ && cd maskrcnn-benchmark \ && python setup.py build develop RUN jupyter notebook --generate-config ENV CONFIG_PATH="/root/.jupyter/jupyter_notebook_config.py" COPY "jupyter_notebook_config.py" ${CONFIG_PATH} ENTRYPOINT ["sh", "-c", "jupyter notebook --allow-root -y --no-browser --ip=0.0.0.0 --config=${CONFIG_PATH}"] ================================================ FILE: docker/docker-jupyter/jupyter_notebook_config.py ================================================ import os from IPython.lib import passwd # c = c # pylint:disable=undefined-variable c = get_config() c.NotebookApp.ip = '0.0.0.0' c.NotebookApp.port = int(os.getenv('PORT', 8888)) c.NotebookApp.open_browser = False # sets a password if PASSWORD is set in the environment if 'PASSWORD' in os.environ: password = os.environ['PASSWORD'] if password: c.NotebookApp.password = passwd(password) else: c.NotebookApp.password = '' c.NotebookApp.token = '' del os.environ['PASSWORD'] ================================================ FILE: maskrcnn_benchmark/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. ================================================ FILE: maskrcnn_benchmark/config/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .defaults import _C as cfg ================================================ FILE: maskrcnn_benchmark/config/defaults.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import os from yacs.config import CfgNode as CN # ----------------------------------------------------------------------------- # Convention about Training / Test specific parameters # ----------------------------------------------------------------------------- # Whenever an argument can be either used for training or for testing, the # corresponding name will be post-fixed by a _TRAIN for a training parameter, # or _TEST for a test-specific parameter. # For example, the maximum image side during training will be # INPUT.MAX_SIZE_TRAIN, while for testing it will be # INPUT.MAX_SIZE_TEST # ----------------------------------------------------------------------------- # Config definition # ----------------------------------------------------------------------------- _C = CN() _C.MODEL = CN() _C.MODEL.RPN_ONLY = False _C.MODEL.MASK_ON = False _C.MODEL.RETINANET_ON = False _C.MODEL.KEYPOINT_ON = False _C.MODEL.DEVICE = "cuda" _C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN" _C.MODEL.CLS_AGNOSTIC_BBOX_REG = False # If the WEIGHT starts with a catalog://, like :R-50, the code will look for # the path in paths_catalog. Else, it will use it as the specified absolute # path _C.MODEL.WEIGHT = "" # ----------------------------------------------------------------------------- # INPUT # ----------------------------------------------------------------------------- _C.INPUT = CN() # Size of the smallest side of the image during training _C.INPUT.MIN_SIZE_TRAIN = (800,) # (800,) # Maximum size of the side of the image during training _C.INPUT.MAX_SIZE_TRAIN = 1333 # Size of the smallest side of the image during testing _C.INPUT.MIN_SIZE_TEST = 800 # Maximum size of the side of the image during testing _C.INPUT.MAX_SIZE_TEST = 1333 # Values to be used for image normalization _C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717] # Values to be used for image normalization _C.INPUT.PIXEL_STD = [1., 1., 1.] # Convert image to BGR format (for Caffe2 models), in range 0-255 _C.INPUT.TO_BGR255 = True # Image ColorJitter _C.INPUT.BRIGHTNESS = 0.0 _C.INPUT.CONTRAST = 0.0 _C.INPUT.SATURATION = 0.0 _C.INPUT.HUE = 0.0 # Flips _C.INPUT.HORIZONTAL_FLIP_PROB_TRAIN = 0.5 _C.INPUT.VERTICAL_FLIP_PROB_TRAIN = 0.0 # ----------------------------------------------------------------------------- # Dataset # ----------------------------------------------------------------------------- _C.DATASETS = CN() # List of the dataset names for training, as present in paths_catalog.py _C.DATASETS.TRAIN = () # List of the dataset names for testing, as present in paths_catalog.py _C.DATASETS.TEST = () # ----------------------------------------------------------------------------- # DataLoader # ----------------------------------------------------------------------------- _C.DATALOADER = CN() # Number of data loading threads _C.DATALOADER.NUM_WORKERS = 4 # If > 0, this enforces that each collated batch should have a size divisible # by SIZE_DIVISIBILITY _C.DATALOADER.SIZE_DIVISIBILITY = 0 # If True, each batch should contain only images for which the aspect ratio # is compatible. This groups portrait images together, and landscape images # are not batched with portrait images. _C.DATALOADER.ASPECT_RATIO_GROUPING = True # ---------------------------------------------------------------------------- # # Backbone options # ---------------------------------------------------------------------------- # _C.MODEL.BACKBONE = CN() # The backbone conv body to use # The string must match a function that is imported in modeling.model_builder # (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN # backbone) _C.MODEL.BACKBONE.CONV_BODY = "R-50-C4" # Add StopGrad at a specified stage so the bottom layers are frozen _C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2 # ---------------------------------------------------------------------------- # # FPN options # ---------------------------------------------------------------------------- # _C.MODEL.FPN = CN() _C.MODEL.FPN.USE_GN = False _C.MODEL.FPN.USE_RELU = False # ---------------------------------------------------------------------------- # # Group Norm options # ---------------------------------------------------------------------------- # _C.MODEL.GROUP_NORM = CN() # Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS) _C.MODEL.GROUP_NORM.DIM_PER_GP = -1 # Number of groups in GroupNorm (-1 if using DIM_PER_GP) _C.MODEL.GROUP_NORM.NUM_GROUPS = 32 # GroupNorm's small constant in the denominator _C.MODEL.GROUP_NORM.EPSILON = 1e-5 # ---------------------------------------------------------------------------- # # RPN options # ---------------------------------------------------------------------------- # _C.MODEL.RPN = CN() _C.MODEL.RPN.USE_FPN = False # Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input _C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512) # Stride of the feature map that RPN is attached. # For FPN, number of strides should match number of scales _C.MODEL.RPN.ANCHOR_STRIDE = (16,) # RPN anchor aspect ratios _C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0) # Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels # Set to -1 or a large value, e.g. 100000, to disable pruning anchors _C.MODEL.RPN.STRADDLE_THRESH = 0 # Minimum overlap required between an anchor and ground-truth box for the # (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD # ==> positive RPN example) _C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7 # Maximum overlap allowed between an anchor and ground-truth box for the # (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD # ==> negative RPN example) _C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3 # Total number of RPN examples per image _C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256 # Target fraction of foreground (positive) examples per RPN minibatch _C.MODEL.RPN.POSITIVE_FRACTION = 0.5 # Number of top scoring RPN proposals to keep before applying NMS # When FPN is used, this is *per FPN level* (not total) _C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000 _C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000 # Number of top scoring RPN proposals to keep after applying NMS _C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000 _C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000 # NMS threshold used on RPN proposals _C.MODEL.RPN.NMS_THRESH = 0.7 # Proposal height and width both need to be greater than RPN_MIN_SIZE # (a the scale used during training or inference) _C.MODEL.RPN.MIN_SIZE = 0 # Number of top scoring RPN proposals to keep after combining proposals from # all FPN levels _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000 _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000 # Apply the post NMS per batch (default) or per image during training # (default is True to be consistent with Detectron, see Issue #672) _C.MODEL.RPN.FPN_POST_NMS_PER_BATCH = True # Custom rpn head, empty to use default conv or separable conv _C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead" # ---------------------------------------------------------------------------- # # ROI HEADS options # ---------------------------------------------------------------------------- # _C.MODEL.ROI_HEADS = CN() _C.MODEL.ROI_HEADS.USE_FPN = False # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD) _C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5 # Overlap threshold for an RoI to be considered background # (class = 0 if overlap in [0, BG_IOU_THRESHOLD)) _C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5 # Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets # These are empirically chosen to approximately lead to unit variance targets _C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10., 10., 5., 5.) # RoI minibatch size *per image* (number of regions of interest [ROIs]) # Total number of RoIs per training minibatch = # TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH # E.g., a common configuration is: 512 * 2 * 8 = 8192 _C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 # Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0) _C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25 # Only used on test mode # Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to # balance obtaining high recall with not having too many low precision # detections that will slow down inference post processing steps (like NMS) _C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05 # Overlap threshold used for non-maximum suppression (suppress boxes with # IoU >= this threshold) _C.MODEL.ROI_HEADS.NMS = 0.5 # Maximum number of detections to return per image (100 is based on the limit # established for the COCO dataset) _C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100 _C.MODEL.ROI_BOX_HEAD = CN() _C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" _C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor" _C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14 _C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0 _C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,) _C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81 # Hidden layer dimension when using an MLP for the RoI box head _C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024 # GN _C.MODEL.ROI_BOX_HEAD.USE_GN = False # Dilation _C.MODEL.ROI_BOX_HEAD.DILATION = 1 _C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256 _C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4 _C.MODEL.ROI_MASK_HEAD = CN() _C.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" _C.MODEL.ROI_MASK_HEAD.PREDICTOR = "MaskRCNNC4Predictor" _C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14 _C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0 _C.MODEL.ROI_MASK_HEAD.POOLER_SCALES = (1.0 / 16,) _C.MODEL.ROI_MASK_HEAD.MLP_HEAD_DIM = 1024 _C.MODEL.ROI_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256) _C.MODEL.ROI_MASK_HEAD.RESOLUTION = 14 _C.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True # Whether or not resize and translate masks to the input image. _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5 # Dilation _C.MODEL.ROI_MASK_HEAD.DILATION = 1 # GN _C.MODEL.ROI_MASK_HEAD.USE_GN = False _C.MODEL.ROI_KEYPOINT_HEAD = CN() _C.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR = "KeypointRCNNFeatureExtractor" _C.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR = "KeypointRCNNPredictor" _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14 _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0 _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES = (1.0 / 16,) _C.MODEL.ROI_KEYPOINT_HEAD.MLP_HEAD_DIM = 1024 _C.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS = tuple(512 for _ in range(8)) _C.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION = 14 _C.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES = 17 _C.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True # ---------------------------------------------------------------------------- # # ResNe[X]t options (ResNets = {ResNet, ResNeXt} # Note that parts of a resnet may be used for both the backbone and the head # These options apply to both # ---------------------------------------------------------------------------- # _C.MODEL.RESNETS = CN() # Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt _C.MODEL.RESNETS.NUM_GROUPS = 1 # Baseline width of each group _C.MODEL.RESNETS.WIDTH_PER_GROUP = 64 # Place the stride 2 conv on the 1x1 filter # Use True only for the original MSRA ResNet; use False for C2 and Torch models _C.MODEL.RESNETS.STRIDE_IN_1X1 = True # Residual transformation function _C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm" # ResNet's stem function (conv1 and pool1) _C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm" # Apply dilation in stage "res5" _C.MODEL.RESNETS.RES5_DILATION = 1 _C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4 _C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 _C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 _C.MODEL.RESNETS.STAGE_WITH_DCN = (False, False, False, False) _C.MODEL.RESNETS.WITH_MODULATED_DCN = False _C.MODEL.RESNETS.DEFORMABLE_GROUPS = 1 # ---------------------------------------------------------------------------- # # RetinaNet Options (Follow the Detectron version) # ---------------------------------------------------------------------------- # _C.MODEL.RETINANET = CN() # This is the number of foreground classes and background. _C.MODEL.RETINANET.NUM_CLASSES = 81 # Anchor aspect ratios to use _C.MODEL.RETINANET.ANCHOR_SIZES = (32, 64, 128, 256, 512) _C.MODEL.RETINANET.ASPECT_RATIOS = (0.5, 1.0, 2.0) _C.MODEL.RETINANET.ANCHOR_STRIDES = (8, 16, 32, 64, 128) _C.MODEL.RETINANET.STRADDLE_THRESH = 0 # Anchor scales per octave _C.MODEL.RETINANET.OCTAVE = 2.0 _C.MODEL.RETINANET.SCALES_PER_OCTAVE = 3 # Use C5 or P5 to generate P6 _C.MODEL.RETINANET.USE_C5 = True # Convolutions to use in the cls and bbox tower # NOTE: this doesn't include the last conv for logits _C.MODEL.RETINANET.NUM_CONVS = 4 # Weight for bbox_regression loss _C.MODEL.RETINANET.BBOX_REG_WEIGHT = 4.0 # Smooth L1 loss beta for bbox regression _C.MODEL.RETINANET.BBOX_REG_BETA = 0.11 # During inference, #locs to select based on cls score before NMS is performed # per FPN level _C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000 # IoU overlap ratio for labeling an anchor as positive # Anchors with >= iou overlap are labeled positive _C.MODEL.RETINANET.FG_IOU_THRESHOLD = 0.5 # IoU overlap ratio for labeling an anchor as negative # Anchors with < iou overlap are labeled negative _C.MODEL.RETINANET.BG_IOU_THRESHOLD = 0.4 # Focal loss parameter: alpha _C.MODEL.RETINANET.LOSS_ALPHA = 0.25 # Focal loss parameter: gamma _C.MODEL.RETINANET.LOSS_GAMMA = 2.0 # Prior prob for the positives at the beginning of training. This is used to set # the bias init for the logits layer _C.MODEL.RETINANET.PRIOR_PROB = 0.01 # Inference cls score threshold, anchors with score > INFERENCE_TH are # considered for inference _C.MODEL.RETINANET.INFERENCE_TH = 0.05 # NMS threshold used in RetinaNet _C.MODEL.RETINANET.NMS_TH = 0.4 # ---------------------------------------------------------------------------- # # FBNet options # ---------------------------------------------------------------------------- # _C.MODEL.FBNET = CN() _C.MODEL.FBNET.ARCH = "default" # custom arch _C.MODEL.FBNET.ARCH_DEF = "" _C.MODEL.FBNET.BN_TYPE = "bn" _C.MODEL.FBNET.SCALE_FACTOR = 1.0 # the output channels will be divisible by WIDTH_DIVISOR _C.MODEL.FBNET.WIDTH_DIVISOR = 1 _C.MODEL.FBNET.DW_CONV_SKIP_BN = True _C.MODEL.FBNET.DW_CONV_SKIP_RELU = True # > 0 scale, == 0 skip, < 0 same dimension _C.MODEL.FBNET.DET_HEAD_LAST_SCALE = 1.0 _C.MODEL.FBNET.DET_HEAD_BLOCKS = [] # overwrite the stride for the head, 0 to use original value _C.MODEL.FBNET.DET_HEAD_STRIDE = 0 # > 0 scale, == 0 skip, < 0 same dimension _C.MODEL.FBNET.KPTS_HEAD_LAST_SCALE = 0.0 _C.MODEL.FBNET.KPTS_HEAD_BLOCKS = [] # overwrite the stride for the head, 0 to use original value _C.MODEL.FBNET.KPTS_HEAD_STRIDE = 0 # > 0 scale, == 0 skip, < 0 same dimension _C.MODEL.FBNET.MASK_HEAD_LAST_SCALE = 0.0 _C.MODEL.FBNET.MASK_HEAD_BLOCKS = [] # overwrite the stride for the head, 0 to use original value _C.MODEL.FBNET.MASK_HEAD_STRIDE = 0 # 0 to use all blocks defined in arch_def _C.MODEL.FBNET.RPN_HEAD_BLOCKS = 0 _C.MODEL.FBNET.RPN_BN_TYPE = "" # ---------------------------------------------------------------------------- # # Solver # ---------------------------------------------------------------------------- # _C.SOLVER = CN() _C.SOLVER.MAX_ITER = 40000 _C.SOLVER.BASE_LR = 0.001 _C.SOLVER.BIAS_LR_FACTOR = 2 _C.SOLVER.MOMENTUM = 0.9 _C.SOLVER.WEIGHT_DECAY = 0.0005 _C.SOLVER.WEIGHT_DECAY_BIAS = 0 _C.SOLVER.GAMMA = 0.1 _C.SOLVER.STEPS = (30000,) _C.SOLVER.WARMUP_FACTOR = 1.0 / 3 _C.SOLVER.WARMUP_ITERS = 500 _C.SOLVER.WARMUP_METHOD = "linear" _C.SOLVER.CHECKPOINT_PERIOD = 2500 _C.SOLVER.TEST_PERIOD = 0 # Number of images per batch # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will # see 2 images per batch _C.SOLVER.IMS_PER_BATCH = 16 # ---------------------------------------------------------------------------- # # Specific test options # ---------------------------------------------------------------------------- # _C.TEST = CN() _C.TEST.EXPECTED_RESULTS = [] _C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4 # Number of images per batch # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will # see 2 images per batch _C.TEST.IMS_PER_BATCH = 8 # Number of detections per image _C.TEST.DETECTIONS_PER_IMG = 100 # ---------------------------------------------------------------------------- # # Test-time augmentations for bounding box detection # See configs/test_time_aug/e2e_mask_rcnn_R-50-FPN_1x.yaml for an example # ---------------------------------------------------------------------------- # _C.TEST.BBOX_AUG = CN() # Enable test-time augmentation for bounding box detection if True _C.TEST.BBOX_AUG.ENABLED = False # Horizontal flip at the original scale (id transform) _C.TEST.BBOX_AUG.H_FLIP = False # Each scale is the pixel size of an image's shortest side _C.TEST.BBOX_AUG.SCALES = () # Max pixel size of the longer side _C.TEST.BBOX_AUG.MAX_SIZE = 4000 # Horizontal flip at each scale _C.TEST.BBOX_AUG.SCALE_H_FLIP = False # ---------------------------------------------------------------------------- # # Misc options # ---------------------------------------------------------------------------- # _C.OUTPUT_DIR = "." _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py") # ---------------------------------------------------------------------------- # # Precision options # ---------------------------------------------------------------------------- # # Precision of input, allowable: (float32, float16) _C.DTYPE = "float32" # Enable verbosity in apex.amp _C.AMP_VERBOSE = False ================================================ FILE: maskrcnn_benchmark/config/paths_catalog.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """Centralized catalog of paths.""" import os from copy import deepcopy class DatasetCatalog(object): DATA_DIR = "datasets" DATASETS = { "coco_2017_train": { "img_dir": "coco/train2017", "ann_file": "coco/annotations/instances_train2017.json" }, "coco_2017_val": { "img_dir": "coco/val2017", "ann_file": "coco/annotations/instances_val2017.json" }, "coco_2014_train": { "img_dir": "coco/train2014", "ann_file": "coco/annotations/instances_train2014.json" }, "coco_2014_val": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_val2014.json" }, "coco_2014_minival": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_minival2014.json" }, "coco_2014_valminusminival": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_valminusminival2014.json" }, "keypoints_coco_2014_train": { "img_dir": "coco/train2014", "ann_file": "coco/annotations/person_keypoints_train2014.json", }, "keypoints_coco_2014_val": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/person_keypoints_val2014.json" }, "keypoints_coco_2014_minival": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/person_keypoints_minival2014.json", }, "keypoints_coco_2014_valminusminival": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/person_keypoints_valminusminival2014.json", }, "voc_2007_train": { "data_dir": "voc/VOC2007", "split": "train" }, "voc_2007_train_cocostyle": { "img_dir": "voc/VOC2007/JPEGImages", "ann_file": "voc/VOC2007/Annotations/pascal_train2007.json" }, "voc_2007_val": { "data_dir": "voc/VOC2007", "split": "val" }, "voc_2007_val_cocostyle": { "img_dir": "voc/VOC2007/JPEGImages", "ann_file": "voc/VOC2007/Annotations/pascal_val2007.json" }, "voc_2007_test": { "data_dir": "voc/VOC2007", "split": "test" }, "voc_2007_test_cocostyle": { "img_dir": "voc/VOC2007/JPEGImages", "ann_file": "voc/VOC2007/Annotations/pascal_test2007.json" }, "voc_2012_train": { "data_dir": "voc/VOC2012", "split": "train" }, "voc_2012_train_cocostyle": { "img_dir": "voc/VOC2012/JPEGImages", "ann_file": "voc/VOC2012/Annotations/pascal_train2012.json" }, "voc_2012_val": { "data_dir": "voc/VOC2012", "split": "val" }, "voc_2012_val_cocostyle": { "img_dir": "voc/VOC2012/JPEGImages", "ann_file": "voc/VOC2012/Annotations/pascal_val2012.json" }, "voc_2012_test": { "data_dir": "voc/VOC2012", "split": "test" # PASCAL VOC2012 doesn't made the test annotations available, so there's no json annotation }, ############################################## # These ones are deprecated, should be removed "cityscapes_fine_instanceonly_seg_train_cocostyle": { "img_dir": "cityscapes/images", "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_train.json" }, "cityscapes_fine_instanceonly_seg_val_cocostyle": { "img_dir": "cityscapes/images", "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_val.json" }, "cityscapes_fine_instanceonly_seg_test_cocostyle": { "img_dir": "cityscapes/images", "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_test.json" }, ############################################## "cityscapes_poly_instance_train": { "img_dir": "cityscapes/leftImg8bit/", "ann_dir": "cityscapes/gtFine/", "split": "train", "mode": "poly", }, "cityscapes_poly_instance_val": { "img_dir": "cityscapes/leftImg8bit", "ann_dir": "cityscapes/gtFine", "split": "val", "mode": "poly", }, "cityscapes_poly_instance_minival": { "img_dir": "cityscapes/leftImg8bit", "ann_dir": "cityscapes/gtFine", "split": "val", "mode": "poly", "mini": 10, }, "cityscapes_mask_instance_train": { "img_dir": "cityscapes/leftImg8bit/", "ann_dir": "cityscapes/gtFine/", "split": "train", "mode": "mask", }, "cityscapes_mask_instance_val": { "img_dir": "cityscapes/leftImg8bit", "ann_dir": "cityscapes/gtFine", "split": "val", "mode": "mask", }, "cityscapes_mask_instance_minival": { "img_dir": "cityscapes/leftImg8bit", "ann_dir": "cityscapes/gtFine", "split": "val", "mode": "mask", "mini": 10, }, } @staticmethod def get(name): if "coco" in name: data_dir = DatasetCatalog.DATA_DIR attrs = DatasetCatalog.DATASETS[name] args = dict( root=os.path.join(data_dir, attrs["img_dir"]), ann_file=os.path.join(data_dir, attrs["ann_file"]), ) return dict( factory="COCODataset", args=args, ) elif "voc" in name: data_dir = DatasetCatalog.DATA_DIR attrs = DatasetCatalog.DATASETS[name] args = dict( data_dir=os.path.join(data_dir, attrs["data_dir"]), split=attrs["split"], ) return dict( factory="PascalVOCDataset", args=args, ) elif "cityscapes" in name: data_dir = DatasetCatalog.DATA_DIR attrs = deepcopy(DatasetCatalog.DATASETS[name]) attrs["img_dir"] = os.path.join(data_dir, attrs["img_dir"]) attrs["ann_dir"] = os.path.join(data_dir, attrs["ann_dir"]) return dict(factory="CityScapesDataset", args=attrs) raise RuntimeError("Dataset not available: {}".format(name)) class ModelCatalog(object): S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron" C2_IMAGENET_MODELS = { "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl", "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl", "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl", "MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl", "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl", } C2_DETECTRON_SUFFIX = "output/train/{}coco_2014_train%3A{}coco_2014_valminusminival/generalized_rcnn/model_final.pkl" C2_DETECTRON_MODELS = { "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW", "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I", "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7", "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ", "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB", "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC", "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT", "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI", "37129812/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x": "09_35_36.8pzTQKYK", # keypoints "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "08_42_54.kdzV35ao" } @staticmethod def get(name): if name.startswith("Caffe2Detectron/COCO"): return ModelCatalog.get_c2_detectron_12_2017_baselines(name) if name.startswith("ImageNetPretrained"): return ModelCatalog.get_c2_imagenet_pretrained(name) raise RuntimeError("model not present in the catalog {}".format(name)) @staticmethod def get_c2_imagenet_pretrained(name): prefix = ModelCatalog.S3_C2_DETECTRON_URL name = name[len("ImageNetPretrained/"):] name = ModelCatalog.C2_IMAGENET_MODELS[name] url = "/".join([prefix, name]) return url @staticmethod def get_c2_detectron_12_2017_baselines(name): # Detectron C2 models are stored following the structure # prefix//2012_2017_baselines/.yaml./suffix # we use as identifiers in the catalog Caffe2Detectron/COCO// prefix = ModelCatalog.S3_C2_DETECTRON_URL dataset_tag = "keypoints_" if "keypoint" in name else "" suffix = ModelCatalog.C2_DETECTRON_SUFFIX.format(dataset_tag, dataset_tag) # remove identification prefix name = name[len("Caffe2Detectron/COCO/"):] # split in and model_id, model_name = name.split("/") # parsing to make it match the url address from the Caffe2 models model_name = "{}.yaml".format(model_name) signature = ModelCatalog.C2_DETECTRON_MODELS[name] unique_name = ".".join([model_name, signature]) url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix]) return url ================================================ FILE: maskrcnn_benchmark/csrc/ROIAlign.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif // Interface for Python at::Tensor ROIAlign_forward(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); #else AT_ERROR("Not compiled with GPU support"); #endif } return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); } at::Tensor ROIAlign_backward(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio) { if (grad.type().is_cuda()) { #ifdef WITH_CUDA return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: maskrcnn_benchmark/csrc/ROIPool.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif std::tuple ROIPool_forward(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } at::Tensor ROIPool_backward(const at::Tensor& grad, const at::Tensor& input, const at::Tensor& rois, const at::Tensor& argmax, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width) { if (grad.type().is_cuda()) { #ifdef WITH_CUDA return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: maskrcnn_benchmark/csrc/SigmoidFocalLoss.h ================================================ #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif // Interface for Python at::Tensor SigmoidFocalLoss_forward( const at::Tensor& logits, const at::Tensor& targets, const int num_classes, const float gamma, const float alpha) { if (logits.type().is_cuda()) { #ifdef WITH_CUDA return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } at::Tensor SigmoidFocalLoss_backward( const at::Tensor& logits, const at::Tensor& targets, const at::Tensor& d_losses, const int num_classes, const float gamma, const float alpha) { if (logits.type().is_cuda()) { #ifdef WITH_CUDA return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include "cpu/vision.h" // implementation taken from Caffe2 template struct PreCalc { int pos1; int pos2; int pos3; int pos4; T w1; T w2; T w3; T w4; }; template void pre_calc_for_bilinear_interpolate( const int height, const int width, const int pooled_height, const int pooled_width, const int iy_upper, const int ix_upper, T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, int roi_bin_grid_h, int roi_bin_grid_w, std::vector>& pre_calc) { int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { for (int iy = 0; iy < iy_upper; iy++) { const T yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < ix_upper; ix++) { const T xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T x = xx; T y = yy; // deal with: inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { // empty PreCalc pc; pc.pos1 = 0; pc.pos2 = 0; pc.pos3 = 0; pc.pos4 = 0; pc.w1 = 0; pc.w2 = 0; pc.w3 = 0; pc.w4 = 0; pre_calc[pre_calc_index] = pc; pre_calc_index += 1; continue; } if (y <= 0) { y = 0; } if (x <= 0) { x = 0; } int y_low = (int)y; int x_low = (int)x; int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T)x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; // save weights and indices PreCalc pc; pc.pos1 = y_low * width + x_low; pc.pos2 = y_low * width + x_high; pc.pos3 = y_high * width + x_low; pc.pos4 = y_high * width + x_high; pc.w1 = w1; pc.w2 = w2; pc.w3 = w3; pc.w4 = w4; pre_calc[pre_calc_index] = pc; pre_calc_index += 1; } } } } } template void ROIAlignForward_cpu_kernel( const int nthreads, const T* bottom_data, const T& spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio, const T* bottom_rois, //int roi_cols, T* top_data) { //AT_ASSERT(roi_cols == 4 || roi_cols == 5); int roi_cols = 5; int n_rois = nthreads / channels / pooled_width / pooled_height; // (n, c, ph, pw) is an element in the pooled output // can be parallelized using omp // #pragma omp parallel for num_threads(32) for (int n = 0; n < n_rois; n++) { int index_n = n * channels * pooled_width * pooled_height; // roi could have 4 or 5 columns const T* offset_bottom_rois = bottom_rois + n * roi_cols; int roi_batch_ind = 0; if (roi_cols == 5) { roi_batch_ind = offset_bottom_rois[0]; offset_bottom_rois++; } // Do not using rounding; this implementation detail is critical T roi_start_w = offset_bottom_rois[0] * spatial_scale; T roi_start_h = offset_bottom_rois[1] * spatial_scale; T roi_end_w = offset_bottom_rois[2] * spatial_scale; T roi_end_h = offset_bottom_rois[3] * spatial_scale; // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale); // Force malformed ROIs to be 1x1 T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 // we want to precalculate indices and weights shared by all channels, // this is the key point of optimization std::vector> pre_calc( roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); pre_calc_for_bilinear_interpolate( height, width, pooled_height, pooled_width, roi_bin_grid_h, roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, roi_bin_grid_h, roi_bin_grid_w, pre_calc); for (int c = 0; c < channels; c++) { int index_n_c = index_n + c * pooled_width * pooled_height; const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { int index = index_n_c + ph * pooled_width + pw; T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { PreCalc pc = pre_calc[pre_calc_index]; output_val += pc.w1 * offset_bottom_data[pc.pos1] + pc.w2 * offset_bottom_data[pc.pos2] + pc.w3 * offset_bottom_data[pc.pos3] + pc.w4 * offset_bottom_data[pc.pos4]; pre_calc_index += 1; } } output_val /= count; top_data[index] = output_val; } // for pw } // for ph } // for c } // for n } at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio) { AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor"); AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor"); auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); auto width = input.size(3); auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); auto output_size = num_rois * pooled_height * pooled_width * channels; if (output.numel() == 0) { return output; } AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { ROIAlignForward_cpu_kernel( output_size, input.data(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, rois.data(), output.data()); }); return output; } ================================================ FILE: maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include "cpu/vision.h" template at::Tensor nms_cpu_kernel(const at::Tensor& dets, const at::Tensor& scores, const float threshold) { AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); if (dets.numel() == 0) { return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); } auto x1_t = dets.select(1, 0).contiguous(); auto y1_t = dets.select(1, 1).contiguous(); auto x2_t = dets.select(1, 2).contiguous(); auto y2_t = dets.select(1, 3).contiguous(); at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto ndets = dets.size(0); at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); auto suppressed = suppressed_t.data(); auto order = order_t.data(); auto x1 = x1_t.data(); auto y1 = y1_t.data(); auto x2 = x2_t.data(); auto y2 = y2_t.data(); auto areas = areas_t.data(); for (int64_t _i = 0; _i < ndets; _i++) { auto i = order[_i]; if (suppressed[i] == 1) continue; auto ix1 = x1[i]; auto iy1 = y1[i]; auto ix2 = x2[i]; auto iy2 = y2[i]; auto iarea = areas[i]; for (int64_t _j = _i + 1; _j < ndets; _j++) { auto j = order[_j]; if (suppressed[j] == 1) continue; auto xx1 = std::max(ix1, x1[j]); auto yy1 = std::max(iy1, y1[j]); auto xx2 = std::min(ix2, x2[j]); auto yy2 = std::min(iy2, y2[j]); auto w = std::max(static_cast(0), xx2 - xx1 + 1); auto h = std::max(static_cast(0), yy2 - yy1 + 1); auto inter = w * h; auto ovr = inter / (iarea + areas[j] - inter); if (ovr >= threshold) suppressed[j] = 1; } } return at::nonzero(suppressed_t == 0).squeeze(1); } at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold) { at::Tensor result; AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { result = nms_cpu_kernel(dets, scores, threshold); }); return result; } ================================================ FILE: maskrcnn_benchmark/csrc/cpu/vision.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio); at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold); ================================================ FILE: maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include #include #include #include #include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) template __device__ T bilinear_interpolate(const T* bottom_data, const int height, const int width, T y, T x, const int index /* index for debug only*/) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { //empty return 0; } if (y <= 0) y = 0; if (x <= 0) x = 0; int y_low = (int) y; int x_low = (int) x; int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T) y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T) x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; // do bilinear interpolation T v1 = bottom_data[y_low * width + x_low]; T v2 = bottom_data[y_low * width + x_high]; T v3 = bottom_data[y_high * width + x_low]; T v4 = bottom_data[y_high * width + x_high]; T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __global__ void RoIAlignForward(const int nthreads, const T* bottom_data, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio, const T* bottom_rois, T* top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; // Do not using rounding; this implementation detail is critical T roi_start_w = offset_bottom_rois[1] * spatial_scale; T roi_start_h = offset_bottom_rois[2] * spatial_scale; T roi_end_w = offset_bottom_rois[3] * spatial_scale; T roi_end_h = offset_bottom_rois[4] * spatial_scale; // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); // Force malformed ROIs to be 1x1 T roi_width = max(roi_end_w - roi_start_w, (T)1.); T roi_height = max(roi_end_h - roi_start_h, (T)1.); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix ++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index); output_val += val; } } output_val /= count; top_data[index] = output_val; } } template __device__ void bilinear_interpolate_gradient( const int height, const int width, T y, T x, T & w1, T & w2, T & w3, T & w4, int & x_low, int & x_high, int & y_low, int & y_high, const int index /* index for debug only*/) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { //empty w1 = w2 = w3 = w4 = 0.; x_low = x_high = y_low = y_high = -1; return; } if (y <= 0) y = 0; if (x <= 0) x = 0; y_low = (int) y; x_low = (int) x; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T) y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T) x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; // reference in forward // T v1 = bottom_data[y_low * width + x_low]; // T v2 = bottom_data[y_low * width + x_high]; // T v3 = bottom_data[y_high * width + x_low]; // T v4 = bottom_data[y_high * width + x_high]; // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; return; } template __global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff, const int num_rois, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio, T* bottom_diff, const T* bottom_rois) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; // Do not using rounding; this implementation detail is critical T roi_start_w = offset_bottom_rois[1] * spatial_scale; T roi_start_h = offset_bottom_rois[2] * spatial_scale; T roi_end_w = offset_bottom_rois[3] * spatial_scale; T roi_end_h = offset_bottom_rois[4] * spatial_scale; // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); // Force malformed ROIs to be 1x1 T roi_width = max(roi_end_w - roi_start_w, (T)1.); T roi_height = max(roi_end_h - roi_start_h, (T)1.); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; int top_offset = (n * channels + c) * pooled_height * pooled_width; const T* offset_top_diff = top_diff + top_offset; const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix ++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); T g1 = top_diff_this_bin * w1 / count; T g2 = top_diff_this_bin * w2 / count; T g3 = top_diff_this_bin * w3 / count; T g4 = top_diff_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast(g1)); atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast(g2)); atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast(g3)); atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast(g4)); } // if } // ix } // iy } // CUDA_1D_KERNEL_LOOP } // RoIAlignBackward at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio) { AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); auto width = input.size(3); auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); auto output_size = num_rois * pooled_height * pooled_width * channels; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); dim3 block(512); if (output.numel() == 0) { THCudaCheck(cudaGetLastError()); return output; } AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { RoIAlignForward<<>>( output_size, input.contiguous().data(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, rois.contiguous().data(), output.data()); }); THCudaCheck(cudaGetLastError()); return output; } // TODO remove the dependency on input and use instead its sizes -> save memory at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio) { AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); auto num_rois = rois.size(0); auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L)); dim3 block(512); // handle possibly empty gradients if (grad.numel() == 0) { THCudaCheck(cudaGetLastError()); return grad_input; } AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] { RoIAlignBackwardFeature<<>>( grad.numel(), grad.contiguous().data(), num_rois, spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, grad_input.data(), rois.contiguous().data()); }); THCudaCheck(cudaGetLastError()); return grad_input; } ================================================ FILE: maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include #include #include #include #include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) template __global__ void RoIPoolFForward(const int nthreads, const T* bottom_data, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const T* bottom_rois, T* top_data, int* argmax_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; int roi_start_w = round(offset_bottom_rois[1] * spatial_scale); int roi_start_h = round(offset_bottom_rois[2] * spatial_scale); int roi_end_w = round(offset_bottom_rois[3] * spatial_scale); int roi_end_h = round(offset_bottom_rois[4] * spatial_scale); // Force malformed ROIs to be 1x1 int roi_width = max(roi_end_w - roi_start_w + 1, 1); int roi_height = max(roi_end_h - roi_start_h + 1, 1); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); // Add roi offsets and clip to input boundaries hstart = min(max(hstart + roi_start_h, 0), height); hend = min(max(hend + roi_start_h, 0), height); wstart = min(max(wstart + roi_start_w, 0), width); wend = min(max(wend + roi_start_w, 0), width); bool is_empty = (hend <= hstart) || (wend <= wstart); // Define an empty pooling region to be zero T maxval = is_empty ? 0 : -FLT_MAX; // If nothing is pooled, argmax = -1 causes nothing to be backprop'd int maxidx = -1; const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { int bottom_index = h * width + w; if (offset_bottom_data[bottom_index] > maxval) { maxval = offset_bottom_data[bottom_index]; maxidx = bottom_index; } } } top_data[index] = maxval; argmax_data[index] = maxidx; } } template __global__ void RoIPoolFBackward(const int nthreads, const T* top_diff, const int* argmax_data, const int num_rois, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, T* bottom_diff, const T* bottom_rois) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; int bottom_offset = (roi_batch_ind * channels + c) * height * width; int top_offset = (n * channels + c) * pooled_height * pooled_width; const T* offset_top_diff = top_diff + top_offset; T* offset_bottom_diff = bottom_diff + bottom_offset; const int* offset_argmax_data = argmax_data + top_offset; int argmax = offset_argmax_data[ph * pooled_width + pw]; if (argmax != -1) { atomicAdd( offset_bottom_diff + argmax, static_cast(offset_top_diff[ph * pooled_width + pw])); } } } std::tuple ROIPool_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width) { AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); auto width = input.size(3); auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); auto output_size = num_rois * pooled_height * pooled_width * channels; auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt)); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); dim3 block(512); if (output.numel() == 0) { THCudaCheck(cudaGetLastError()); return std::make_tuple(output, argmax); } AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] { RoIPoolFForward<<>>( output_size, input.contiguous().data(), spatial_scale, channels, height, width, pooled_height, pooled_width, rois.contiguous().data(), output.data(), argmax.data()); }); THCudaCheck(cudaGetLastError()); return std::make_tuple(output, argmax); } // TODO remove the dependency on input and use instead its sizes -> save memory at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, const at::Tensor& input, const at::Tensor& rois, const at::Tensor& argmax, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width) { AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); // TODO add more checks auto num_rois = rois.size(0); auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L)); dim3 block(512); // handle possibly empty gradients if (grad.numel() == 0) { THCudaCheck(cudaGetLastError()); return grad_input; } AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] { RoIPoolFBackward<<>>( grad.numel(), grad.contiguous().data(), argmax.data(), num_rois, spatial_scale, channels, height, width, pooled_height, pooled_width, grad_input.data(), rois.contiguous().data()); }); THCudaCheck(cudaGetLastError()); return grad_input; } ================================================ FILE: maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu // Cheng-Yang Fu // cyfu@cs.unc.edu #include #include #include #include #include #include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) template __global__ void SigmoidFocalLossForward(const int nthreads, const T* logits, const int* targets, const int num_classes, const float gamma, const float alpha, const int num, T* losses) { CUDA_1D_KERNEL_LOOP(i, nthreads) { int n = i / num_classes; int d = i % num_classes; // current class[0~79]; int t = targets[n]; // target class [1~80]; // Decide it is positive or negative case. T c1 = (t == (d+1)); T c2 = (t>=0 & t != (d+1)); T zn = (1.0 - alpha); T zp = (alpha); // p = 1. / 1. + expf(-x); p = sigmoid(x) T p = 1. / (1. + expf(-logits[i])); // (1-p)**gamma * log(p) where T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN)); // p**gamma * log(1-p) T term2 = powf(p, gamma) * (-1. * logits[i] * (logits[i] >= 0) - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))); losses[i] = 0.0; losses[i] += -c1 * term1 * zp; losses[i] += -c2 * term2 * zn; } // CUDA_1D_KERNEL_LOOP } // SigmoidFocalLossForward template __global__ void SigmoidFocalLossBackward(const int nthreads, const T* logits, const int* targets, const T* d_losses, const int num_classes, const float gamma, const float alpha, const int num, T* d_logits) { CUDA_1D_KERNEL_LOOP(i, nthreads) { int n = i / num_classes; int d = i % num_classes; // current class[0~79]; int t = targets[n]; // target class [1~80], 0 is background; // Decide it is positive or negative case. T c1 = (t == (d+1)); T c2 = (t>=0 & t != (d+1)); T zn = (1.0 - alpha); T zp = (alpha); // p = 1. / 1. + expf(-x); p = sigmoid(x) T p = 1. / (1. + expf(-logits[i])); // (1-p)**g * (1 - p - g*p*log(p) T term1 = powf((1. - p), gamma) * (1. - p - (p * gamma * logf(max(p, FLT_MIN)))); // (p**g) * (g*(1-p)*log(1-p) - p) T term2 = powf(p, gamma) * ((-1. * logits[i] * (logits[i] >= 0) - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * (1. - p) * gamma - p); d_logits[i] = 0.0; d_logits[i] += -c1 * term1 * zp; d_logits[i] += -c2 * term2 * zn; d_logits[i] = d_logits[i] * d_losses[i]; } // CUDA_1D_KERNEL_LOOP } // SigmoidFocalLossBackward at::Tensor SigmoidFocalLoss_forward_cuda( const at::Tensor& logits, const at::Tensor& targets, const int num_classes, const float gamma, const float alpha) { AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); const int num_samples = logits.size(0); auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); auto losses_size = num_samples * logits.size(1); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)losses_size, 512L), 4096L)); dim3 block(512); if (losses.numel() == 0) { THCudaCheck(cudaGetLastError()); return losses; } AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] { SigmoidFocalLossForward<<>>( losses_size, logits.contiguous().data(), targets.contiguous().data(), num_classes, gamma, alpha, num_samples, losses.data()); }); THCudaCheck(cudaGetLastError()); return losses; } at::Tensor SigmoidFocalLoss_backward_cuda( const at::Tensor& logits, const at::Tensor& targets, const at::Tensor& d_losses, const int num_classes, const float gamma, const float alpha) { AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor"); AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); const int num_samples = logits.size(0); AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes"); auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); auto d_logits_size = num_samples * logits.size(1); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)d_logits_size, 512L), 4096L)); dim3 block(512); if (d_logits.numel() == 0) { THCudaCheck(cudaGetLastError()); return d_logits; } AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] { SigmoidFocalLossBackward<<>>( d_logits_size, logits.contiguous().data(), targets.contiguous().data(), d_losses.contiguous().data(), num_classes, gamma, alpha, num_samples, d_logits.data()); }); THCudaCheck(cudaGetLastError()); return d_logits; } ================================================ FILE: maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu ================================================ // modify from // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c #include #include #include #include #include #include #include void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor data_col); void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_im); void deformable_col2im_coord( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_offset); void modulated_deformable_im2col_cuda( const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor data_col); void modulated_deformable_col2im_cuda( const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_im); void modulated_deformable_col2im_coord_cuda( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_offset, at::Tensor grad_mask); void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, at::Tensor weight, int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, int group, int deformable_group) { AT_CHECK(weight.ndimension() == 4, "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " "but got: %s", weight.ndimension()); AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); AT_CHECK(kW > 0 && kH > 0, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); AT_CHECK((weight.size(2) == kH && weight.size(3) == kW), "kernel size should be consistent with weight, ", "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, kW, weight.size(2), weight.size(3)); AT_CHECK(dW > 0 && dH > 0, "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); AT_CHECK( dilationW > 0 && dilationH > 0, "dilation should be greater than 0, but got dilationH: %d dilationW: %d", dilationH, dilationW); int ndim = input.ndimension(); int dimf = 0; int dimh = 1; int dimw = 2; if (ndim == 4) { dimf++; dimh++; dimw++; } AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", ndim); long nInputPlane = weight.size(1) * group; long inputHeight = input.size(dimh); long inputWidth = input.size(dimw); long nOutputPlane = weight.size(0); long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; AT_CHECK(nInputPlane % deformable_group == 0, "input channels must divide deformable group size"); if (outputWidth < 1 || outputHeight < 1) AT_ERROR( "Given input size: (%ld x %ld x %ld). " "Calculated output size: (%ld x %ld x %ld). Output size is too small", nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, outputWidth); AT_CHECK(input.size(1) == nInputPlane, "invalid number of input planes, expected: %d, but got: %d", nInputPlane, input.size(1)); AT_CHECK((inputHeight >= kH && inputWidth >= kW), "input image is smaller than kernel"); AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), "invalid spatial size of offset, expected height: %d width: %d, but " "got height: %d width: %d", outputHeight, outputWidth, offset.size(2), offset.size(3)); AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), "invalid number of channels of offset"); if (gradOutput != NULL) { AT_CHECK(gradOutput->size(dimf) == nOutputPlane, "invalid number of gradOutput planes, expected: %d, but got: %d", nOutputPlane, gradOutput->size(dimf)); AT_CHECK((gradOutput->size(dimh) == outputHeight && gradOutput->size(dimw) == outputWidth), "invalid size of gradOutput, expected height: %d width: %d , but " "got height: %d width: %d", outputHeight, outputWidth, gradOutput->size(dimh), gradOutput->size(dimw)); } } int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, at::Tensor offset, at::Tensor output, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { // todo: resize columns to include im2col: done // todo: add im2col_step as input // todo: add new output buffer and transpose it to output (or directly // transpose output) todo: possibly change data indexing because of // parallel_imgs shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW, dilationH, dilationW, group, deformable_group); input = input.contiguous(); offset = offset.contiguous(); weight = weight.contiguous(); int batch = 1; if (input.ndimension() == 3) { // Force batch batch = 0; input.unsqueeze_(0); offset.unsqueeze_(0); } // todo: assert batchsize dividable by im2col_step long batchSize = input.size(0); long nInputPlane = input.size(1); long inputHeight = input.size(2); long inputWidth = input.size(3); long nOutputPlane = weight.size(0); long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth}); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.options()); if (ones.ndimension() != 2 || ones.size(0) * ones.size(1) < outputHeight * outputWidth) { ones = at::ones({outputHeight, outputWidth}, input.options()); } input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); offset = offset.view({batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); at::Tensor output_buffer = at::zeros({batchSize / im2col_step, nOutputPlane, im2col_step * outputHeight, outputWidth}, output.options()); output_buffer = output_buffer.view( {output_buffer.size(0), group, output_buffer.size(1) / group, output_buffer.size(2), output_buffer.size(3)}); for (int elt = 0; elt < batchSize / im2col_step; elt++) { deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, columns); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); for (int g = 0; g < group; g++) { output_buffer[elt][g] = output_buffer[elt][g] .flatten(1) .addmm_(weight[g].flatten(1), columns[g]) .view_as(output_buffer[elt][g]); } } output_buffer = output_buffer.view( {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), output_buffer.size(3), output_buffer.size(4)}); output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, outputHeight, outputWidth}); output_buffer.transpose_(1, 2); output.copy_(output_buffer); output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); offset = offset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); if (batch == 0) { output = output.view({nOutputPlane, outputHeight, outputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth}); offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); } return 1; } int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradInput, at::Tensor gradOffset, at::Tensor weight, at::Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW, dilationH, dilationW, group, deformable_group); input = input.contiguous(); offset = offset.contiguous(); gradOutput = gradOutput.contiguous(); weight = weight.contiguous(); int batch = 1; if (input.ndimension() == 3) { // Force batch batch = 0; input = input.view({1, input.size(0), input.size(1), input.size(2)}); offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); gradOutput = gradOutput.view( {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); } long batchSize = input.size(0); long nInputPlane = input.size(1); long inputHeight = input.size(2); long inputWidth = input.size(3); long nOutputPlane = weight.size(0); long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.options()); // change order of grad output gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth}); gradOutput.transpose_(1, 2); gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); offset = offset.view({batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); for (int elt = 0; elt < batchSize / im2col_step; elt++) { // divide into groups columns = columns.view({group, columns.size(0) / group, columns.size(1)}); weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); gradOutput = gradOutput.view( {gradOutput.size(0), group, gradOutput.size(1) / group, gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)}); for (int g = 0; g < group; g++) { columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), gradOutput[elt][g].flatten(1), 0.0f, 1.0f); } columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); gradOutput = gradOutput.view( {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, gradOffset[elt]); deformable_col2im(columns, offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, gradInput[elt]); } gradOutput.transpose_(1, 2); gradOutput = gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); gradOffset = gradOffset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); offset = offset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); if (batch == 0) { gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth}); gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); gradOffset = gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); } return 1; } int deform_conv_backward_parameters_cuda( at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradWeight, // at::Tensor gradBias, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, float scale, int im2col_step) { // todo: transpose and reshape outGrad // todo: reshape columns // todo: add im2col_step as input shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH, padW, dilationH, dilationW, group, deformable_group); input = input.contiguous(); offset = offset.contiguous(); gradOutput = gradOutput.contiguous(); int batch = 1; if (input.ndimension() == 3) { // Force batch batch = 0; input = input.view( at::IntList({1, input.size(0), input.size(1), input.size(2)})); gradOutput = gradOutput.view( {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); } long batchSize = input.size(0); long nInputPlane = input.size(1); long inputHeight = input.size(2); long inputWidth = input.size(3); long nOutputPlane = gradWeight.size(0); long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.options()); gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth}); gradOutput.transpose_(1, 2); at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); gradOutputBuffer = gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, outputHeight, outputWidth}); gradOutputBuffer.copy_(gradOutput); gradOutputBuffer = gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step * outputHeight, outputWidth}); gradOutput.transpose_(1, 2); gradOutput = gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); offset = offset.view({batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); for (int elt = 0; elt < batchSize / im2col_step; elt++) { deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, columns); // divide into group gradOutputBuffer = gradOutputBuffer.view( {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, gradOutputBuffer.size(2), gradOutputBuffer.size(3)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); gradWeight = gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1), gradWeight.size(2), gradWeight.size(3)}); for (int g = 0; g < group; g++) { gradWeight[g] = gradWeight[g] .flatten(1) .addmm_(gradOutputBuffer[elt][g].flatten(1), columns[g].transpose(1, 0), 1.0, scale) .view_as(gradWeight[g]); } gradOutputBuffer = gradOutputBuffer.view( {gradOutputBuffer.size(0), gradOutputBuffer.size(1) * gradOutputBuffer.size(2), gradOutputBuffer.size(3), gradOutputBuffer.size(4)}); columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), gradWeight.size(2), gradWeight.size(3), gradWeight.size(4)}); } input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); offset = offset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); if (batch == 0) { gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth}); } return 1; } void modulated_deform_conv_cuda_forward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, int kernel_h, int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int deformable_group, const bool with_bias) { AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_out = weight.size(0); const int channels_kernel = weight.size(1); const int kernel_h_ = weight.size(2); const int kernel_w_ = weight.size(3); if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); if (channels != channels_kernel * group) AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel * group); const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; if (ones.ndimension() != 2 || ones.size(0) * ones.size(1) < height_out * width_out) { // Resize plane and fill with ones... ones = at::ones({height_out, width_out}, input.options()); } // resize output output = output.view({batch, channels_out, height_out, width_out}).zero_(); // resize temporary columns columns = at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); output = output.view({output.size(0), group, output.size(1) / group, output.size(2), output.size(3)}); for (int b = 0; b < batch; b++) { modulated_deformable_im2col_cuda( input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns); // divide into group weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); for (int g = 0; g < group; g++) { output[b][g] = output[b][g] .flatten(1) .addmm_(weight[g].flatten(1), columns[g]) .view_as(output[b][g]); } weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), weight.size(3), weight.size(4)}); columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); } output = output.view({output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)}); if (with_bias) { output += bias.view({1, bias.size(0), 1, 1}); } } void modulated_deform_conv_cuda_backward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor columns, at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, const bool with_bias) { AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_kernel = weight.size(1); const int kernel_h_ = weight.size(2); const int kernel_w_ = weight.size(3); if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); if (channels != channels_kernel * group) AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel * group); const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; if (ones.ndimension() != 2 || ones.size(0) * ones.size(1) < height_out * width_out) { // Resize plane and fill with ones... ones = at::ones({height_out, width_out}, input.options()); } grad_input = grad_input.view({batch, channels, height, width}); columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out}, input.options()); grad_output = grad_output.view({grad_output.size(0), group, grad_output.size(1) / group, grad_output.size(2), grad_output.size(3)}); for (int b = 0; b < batch; b++) { // divide int group columns = columns.view({group, columns.size(0) / group, columns.size(1)}); weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); for (int g = 0; g < group; g++) { columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), grad_output[b][g].flatten(1), 0.0f, 1.0f); } columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), weight.size(3), weight.size(4)}); // gradient w.r.t. input coordinate data modulated_deformable_col2im_coord_cuda( columns, input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], grad_mask[b]); // gradient w.r.t. input data modulated_deformable_col2im_cuda( columns, offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, grad_input[b]); // gradient w.r.t. weight, dWeight should accumulate across the batch and // group modulated_deformable_im2col_cuda( input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); grad_weight = grad_weight.view({group, grad_weight.size(0) / group, grad_weight.size(1), grad_weight.size(2), grad_weight.size(3)}); if (with_bias) grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); for (int g = 0; g < group; g++) { grad_weight[g] = grad_weight[g] .flatten(1) .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) .view_as(grad_weight[g]); if (with_bias) { grad_bias[g] = grad_bias[g] .view({-1, 1}) .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) .view(-1); } } columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), grad_weight.size(2), grad_weight.size(3), grad_weight.size(4)}); if (with_bias) grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); } grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), grad_output.size(2), grad_output.size(3), grad_output.size(4)}); } ================================================ FILE: maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu ================================================ /*! ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** * * COPYRIGHT * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. * * LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * CONTRIBUTION AGREEMENT * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. * ***************** END Caffe Copyright Notice and Disclaimer ******************** * * Copyright (c) 2018 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file modulated_deformable_im2col.cuh * \brief Function definitions of converting an image to * column matrix based on kernel, padding, dilation, and offset. * These functions are mainly used in deformable convolution operators. * \ref: https://arxiv.org/abs/1703.06211 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng */ // modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu #include #include #include #include #include using namespace at; #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; const int kMaxGridNum = 65535; inline int GET_BLOCKS(const int N) { return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); } /* const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; }*/ template __device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width, const int height, const int width, scalar_t h, scalar_t w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; scalar_t lh = h - h_low; scalar_t lw = w - w_low; scalar_t hh = 1 - lh, hw = 1 - lw; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = bottom_data[h_low * data_width + w_high]; scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = bottom_data[h_high * data_width + w_low]; scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) v4 = bottom_data[h_high * data_width + w_high]; scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, const int h, const int w, const int height, const int width) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (h == argmax_h_low && w == argmax_w_low) weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); if (h == argmax_h_low && w == argmax_w_high) weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); if (h == argmax_h_high && w == argmax_w_low) weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); if (h == argmax_h_high && w == argmax_w_high) weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); return weight; } template __device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, const int height, const int width, const scalar_t *im_data, const int data_width, const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (bp_dir == 0) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } else if (bp_dir == 1) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } return weight; } template __global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int num_channels, const int deformable_group, const int height_col, const int width_col, scalar_t *data_col) { CUDA_KERNEL_LOOP(index, n) { // index index of output matrix const int w_col = index % width_col; const int h_col = (index / width_col) % height_col; const int b_col = (index / width_col / height_col) % batch_size; const int c_im = (index / width_col / height_col) / batch_size; const int c_col = c_im * kernel_h * kernel_w; // compute deformable group index const int deformable_group_index = c_im / channel_per_deformable_group; const int h_in = h_col * stride_h - pad_h; const int w_in = w_col * stride_w - pad_w; scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; scalar_t val = static_cast(0); const scalar_t h_im = h_in + i * dilation_h + offset_h; const scalar_t w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { //const scalar_t map_h = i * dilation_h + offset_h; //const scalar_t map_w = j * dilation_w + offset_w; //const int cur_height = height - h_in; //const int cur_width = width - w_in; //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val; data_col_ptr += batch_size * height_col * width_col; } } } } void deformable_im2col( const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor data_col) { // num_axes should be smaller than block size // todo: check parallel_imgs is correctly passed in int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * height_col * width_col * parallel_imgs; int channel_per_deformable_group = channels / deformable_group; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_im.type(), "deformable_im2col_gpu", ([&] { const scalar_t *data_im_ = data_im.data(); const scalar_t *data_offset_ = data_offset.data(); scalar_t *data_col_ = data_col.data(); deformable_im2col_gpu_kernel<<>>( num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, channels, deformable_group, height_col, width_col, data_col_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); } } template __global__ void deformable_col2im_gpu_kernel( const int n, const scalar_t *data_col, const scalar_t *data_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int deformable_group, const int height_col, const int width_col, scalar_t *grad_im) { CUDA_KERNEL_LOOP(index, n) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; const scalar_t cur_top_grad = data_col[index]; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; scalar_t weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } } } } } void deformable_col2im( const at::Tensor data_col, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_im) { // todo: make sure parallel_imgs is passed in correctly int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; int channel_per_deformable_group = channels / deformable_group; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.type(), "deformable_col2im_gpu", ([&] { const scalar_t *data_col_ = data_col.data(); const scalar_t *data_offset_ = data_offset.data(); scalar_t *grad_im_ = grad_im.data(); deformable_col2im_gpu_kernel<<>>( num_kernels, data_col_, data_offset_, channels, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, deformable_group, height_col, width_col, grad_im_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); } } template __global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col, const scalar_t *data_im, const scalar_t *data_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int offset_channels, const int deformable_group, const int height_col, const int width_col, scalar_t *grad_offset) { CUDA_KERNEL_LOOP(index, n) { scalar_t val = 0; int w = index % width_col; int h = (index / width_col) % height_col; int c = (index / width_col / height_col) % offset_channels; int b = (index / width_col / height_col) / offset_channels; // compute the start and end of the output const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; const int bp_dir = offset_c % 2; int j = (col_pos / width_col / height_col / batch_size) % kernel_w; int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; int w_out = col_pos % width_col; int h_out = (col_pos / width_col) % height_col; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; scalar_t inv_h = h_in + i * dilation_h + offset_h; scalar_t inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } const scalar_t weight = get_coordinate_weight( inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, width, bp_dir); val += weight * data_col_ptr[col_pos]; cnt += 1; } grad_offset[index] = val; } } void deformable_col2im_coord( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_offset) { int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs; int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.type(), "deformable_col2im_coord_gpu", ([&] { const scalar_t *data_col_ = data_col.data(); const scalar_t *data_im_ = data_im.data(); const scalar_t *data_offset_ = data_offset.data(); scalar_t *grad_offset_ = grad_offset.data(); deformable_col2im_coord_gpu_kernel<<>>( num_kernels, data_col_, data_im_, data_offset_, channels, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group, height_col, width_col, grad_offset_); })); } template __device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width, const int height, const int width, scalar_t h, scalar_t w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; scalar_t lh = h - h_low; scalar_t lw = w - w_low; scalar_t hh = 1 - lh, hw = 1 - lw; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = bottom_data[h_low * data_width + w_high]; scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = bottom_data[h_high * data_width + w_low]; scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) v4 = bottom_data[h_high * data_width + w_high]; scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, const int h, const int w, const int height, const int width) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (h == argmax_h_low && w == argmax_w_low) weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); if (h == argmax_h_low && w == argmax_w_high) weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); if (h == argmax_h_high && w == argmax_w_low) weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); if (h == argmax_h_high && w == argmax_w_high) weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); return weight; } template __device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, const int height, const int width, const scalar_t *im_data, const int data_width, const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (bp_dir == 0) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } else if (bp_dir == 1) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } return weight; } template __global__ void modulated_deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int num_channels, const int deformable_group, const int height_col, const int width_col, scalar_t *data_col) { CUDA_KERNEL_LOOP(index, n) { // index index of output matrix const int w_col = index % width_col; const int h_col = (index / width_col) % height_col; const int b_col = (index / width_col / height_col) % batch_size; const int c_im = (index / width_col / height_col) / batch_size; const int c_col = c_im * kernel_h * kernel_w; // compute deformable group index const int deformable_group_index = c_im / channel_per_deformable_group; const int h_in = h_col * stride_h - pad_h; const int w_in = w_col * stride_w - pad_w; scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; scalar_t val = static_cast(0); const scalar_t h_im = h_in + i * dilation_h + offset_h; const scalar_t w_im = w_in + j * dilation_w + offset_w; //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { //const float map_h = i * dilation_h + offset_h; //const float map_w = j * dilation_w + offset_w; //const int cur_height = height - h_in; //const int cur_width = width - w_in; //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val * mask; data_col_ptr += batch_size * height_col * width_col; //data_col_ptr += height_col * width_col; } } } } template __global__ void modulated_deformable_col2im_gpu_kernel(const int n, const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int deformable_group, const int height_col, const int width_col, scalar_t *grad_im) { CUDA_KERNEL_LOOP(index, n) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; const scalar_t cur_top_grad = data_col[index] * mask; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } } } } } template __global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int offset_channels, const int deformable_group, const int height_col, const int width_col, scalar_t *grad_offset, scalar_t *grad_mask) { CUDA_KERNEL_LOOP(index, n) { scalar_t val = 0, mval = 0; int w = index % width_col; int h = (index / width_col) % height_col; int c = (index / width_col / height_col) % offset_channels; int b = (index / width_col / height_col) / offset_channels; // compute the start and end of the output const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; const int bp_dir = offset_c % 2; int j = (col_pos / width_col / height_col / batch_size) % kernel_w; int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; int w_out = col_pos % width_col; int h_out = (col_pos / width_col) % height_col; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; scalar_t inv_h = h_in + i * dilation_h + offset_h; scalar_t inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); } const scalar_t weight = dmcn_get_coordinate_weight( inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, width, bp_dir); val += weight * data_col_ptr[col_pos] * mask; cnt += 1; } // KERNEL_ASSIGN(grad_offset[index], offset_req, val); grad_offset[index] = val; if (offset_c % 2 == 0) // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; } } void modulated_deformable_im2col_cuda( const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor data_col) { // num_axes should be smaller than block size const int channel_per_deformable_group = channels / deformable_group; const int num_kernels = channels * batch_size * height_col * width_col; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_im.type(), "modulated_deformable_im2col_gpu", ([&] { const scalar_t *data_im_ = data_im.data(); const scalar_t *data_offset_ = data_offset.data(); const scalar_t *data_mask_ = data_mask.data(); scalar_t *data_col_ = data_col.data(); modulated_deformable_im2col_gpu_kernel<<>>( num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, channels, deformable_group, height_col, width_col, data_col_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); } } void modulated_deformable_col2im_cuda( const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_im) { const int channel_per_deformable_group = channels / deformable_group; const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.type(), "modulated_deformable_col2im_gpu", ([&] { const scalar_t *data_col_ = data_col.data(); const scalar_t *data_offset_ = data_offset.data(); const scalar_t *data_mask_ = data_mask.data(); scalar_t *grad_im_ = grad_im.data(); modulated_deformable_col2im_gpu_kernel<<>>( num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, deformable_group, height_col, width_col, grad_im_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); } } void modulated_deformable_col2im_coord_cuda( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_offset, at::Tensor grad_mask) { const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] { const scalar_t *data_col_ = data_col.data(); const scalar_t *data_im_ = data_im.data(); const scalar_t *data_offset_ = data_offset.data(); const scalar_t *data_mask_ = data_mask.data(); scalar_t *grad_offset_ = grad_offset.data(); scalar_t *grad_mask_ = grad_mask.data(); modulated_deformable_col2im_coord_gpu_kernel<<>>( num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, grad_offset_, grad_mask_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); } } ================================================ FILE: maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu ================================================ // modify from // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c // based on // author: Charles Shang // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu #include #include #include #include #include #include #include void DeformablePSROIPoolForward( const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, at::Tensor out, at::Tensor top_count, const int batch, const int channels, const int height, const int width, const int num_bbox, const int channels_trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std); void DeformablePSROIPoolBackwardAcc( const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, at::Tensor trans_grad, const int batch, const int channels, const int height, const int width, const int num_bbox, const int channels_trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std); void deform_psroi_pooling_cuda_forward( at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, at::Tensor top_count, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_trans = no_trans ? 2 : trans.size(1); const int num_bbox = bbox.size(0); if (num_bbox != out.size(0)) AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", out.size(0), num_bbox); DeformablePSROIPoolForward( input, bbox, trans, out, top_count, batch, channels, height, width, num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std); } void deform_psroi_pooling_cuda_backward( at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_trans = no_trans ? 2 : trans.size(1); const int num_bbox = bbox.size(0); if (num_bbox != out_grad.size(0)) AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", out_grad.size(0), num_bbox); DeformablePSROIPoolBackwardAcc( out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, channels, height, width, num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std); } ================================================ FILE: maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu ================================================ /*! * Copyright (c) 2017 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file deformable_psroi_pooling.cu * \brief * \author Yi Li, Guodong Zhang, Jifeng Dai */ /***************** Adapted by Charles Shang *********************/ // modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu #include #include #include #include #include using namespace at; #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } template __device__ scalar_t bilinear_interp( const scalar_t *data, const scalar_t x, const scalar_t y, const int width, const int height) { int x1 = floor(x); int x2 = ceil(x); int y1 = floor(y); int y2 = ceil(y); scalar_t dist_x = (scalar_t)(x - x1); scalar_t dist_y = (scalar_t)(y - y1); scalar_t value11 = data[y1 * width + x1]; scalar_t value12 = data[y2 * width + x1]; scalar_t value21 = data[y1 * width + x2]; scalar_t value22 = data[y2 * width + x2]; scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22; return value; } template __global__ void DeformablePSROIPoolForwardKernel( const int count, const scalar_t *bottom_data, const scalar_t spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const scalar_t *bottom_rois, const scalar_t *bottom_trans, const int no_trans, const scalar_t trans_std, const int sample_per_part, const int output_dim, const int group_size, const int part_size, const int num_classes, const int channels_each_class, scalar_t *top_data, scalar_t *top_count) { CUDA_KERNEL_LOOP(index, count) { // The output is in order (n, ctop, ph, pw) int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int ctop = (index / pooled_width / pooled_height) % output_dim; int n = index / pooled_width / pooled_height / output_dim; // [start, end) interval for spatial sampling const scalar_t *offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; // Force too small ROIs to be 1x1 scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); int part_h = floor((scalar_t)(ph) / pooled_height * part_size); int part_w = floor((scalar_t)(pw) / pooled_width * part_size); int class_id = ctop / channels_each_class; scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; wstart += trans_x * roi_width; scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; hstart += trans_y * roi_height; scalar_t sum = 0; int count = 0; int gw = floor((scalar_t)(pw)*group_size / pooled_width); int gh = floor((scalar_t)(ph)*group_size / pooled_height); gw = min(max(gw, 0), group_size - 1); gh = min(max(gh, 0), group_size - 1); const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; for (int ih = 0; ih < sample_per_part; ih++) { for (int iw = 0; iw < sample_per_part; iw++) { scalar_t w = wstart + iw * sub_bin_size_w; scalar_t h = hstart + ih * sub_bin_size_h; // bilinear interpolation if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { continue; } w = min(max(w, 0.), width - 1.); h = min(max(h, 0.), height - 1.); int c = (ctop * group_size + gh) * group_size + gw; scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); sum += val; count++; } } top_data[index] = count == 0 ? (scalar_t)(0) : sum / count; top_count[index] = count; } } template __global__ void DeformablePSROIPoolBackwardAccKernel( const int count, const scalar_t *top_diff, const scalar_t *top_count, const int num_rois, const scalar_t spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int output_dim, scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff, const scalar_t *bottom_data, const scalar_t *bottom_rois, const scalar_t *bottom_trans, const int no_trans, const scalar_t trans_std, const int sample_per_part, const int group_size, const int part_size, const int num_classes, const int channels_each_class) { CUDA_KERNEL_LOOP(index, count) { // The output is in order (n, ctop, ph, pw) int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int ctop = (index / pooled_width / pooled_height) % output_dim; int n = index / pooled_width / pooled_height / output_dim; // [start, end) interval for spatial sampling const scalar_t *offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; // Force too small ROIs to be 1x1 scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); int part_h = floor((scalar_t)(ph) / pooled_height * part_size); int part_w = floor((scalar_t)(pw) / pooled_width * part_size); int class_id = ctop / channels_each_class; scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; wstart += trans_x * roi_width; scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; hstart += trans_y * roi_height; if (top_count[index] <= 0) { continue; } scalar_t diff_val = top_diff[index] / top_count[index]; const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; int gw = floor((scalar_t)(pw)*group_size / pooled_width); int gh = floor((scalar_t)(ph)*group_size / pooled_height); gw = min(max(gw, 0), group_size - 1); gh = min(max(gh, 0), group_size - 1); for (int ih = 0; ih < sample_per_part; ih++) { for (int iw = 0; iw < sample_per_part; iw++) { scalar_t w = wstart + iw * sub_bin_size_w; scalar_t h = hstart + ih * sub_bin_size_h; // bilinear interpolation if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { continue; } w = min(max(w, 0.), width - 1.); h = min(max(h, 0.), height - 1.); int c = (ctop * group_size + gh) * group_size + gw; // backward on feature int x0 = floor(w); int x1 = ceil(w); int y0 = floor(h); int y1 = ceil(h); scalar_t dist_x = w - x0, dist_y = h - y0; scalar_t q00 = (1 - dist_x) * (1 - dist_y); scalar_t q01 = (1 - dist_x) * dist_y; scalar_t q10 = dist_x * (1 - dist_y); scalar_t q11 = dist_x * dist_y; int bottom_index_base = c * height * width; atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); if (no_trans) { continue; } scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; diff_x *= roi_width; scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; diff_y *= roi_height; atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); } } } } void DeformablePSROIPoolForward(const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, at::Tensor out, at::Tensor top_count, const int batch, const int channels, const int height, const int width, const int num_bbox, const int channels_trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { const int pooled_height = pooled_size; const int pooled_width = pooled_size; const int count = num_bbox * output_dim * pooled_height * pooled_width; const int num_classes = no_trans ? 1 : channels_trans / 2; const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data.type(), "deformable_psroi_pool_forward", ([&] { const scalar_t *bottom_data = data.data(); const scalar_t *bottom_rois = bbox.data(); const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); scalar_t *top_data = out.data(); scalar_t *top_count_data = top_count.data(); DeformablePSROIPoolForwardKernel<<>>( count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim, group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); } } void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, at::Tensor trans_grad, const int batch, const int channels, const int height, const int width, const int num_bbox, const int channels_trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { // LOG(INFO) << "DeformablePSROIPoolBackward"; const int num_rois = num_bbox; const int pooled_height = pooled_size; const int pooled_width = pooled_size; const int count = num_bbox * output_dim * pooled_height * pooled_width; const int num_classes = no_trans ? 1 : channels_trans / 2; const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; AT_DISPATCH_FLOATING_TYPES_AND_HALF( out_grad.type(), "deformable_psroi_pool_backward_acc", ([&] { const scalar_t *top_diff = out_grad.data(); const scalar_t *bottom_data = data.data(); const scalar_t *bottom_rois = bbox.data(); const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); scalar_t *bottom_data_diff = in_grad.data(); scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data(); const scalar_t *top_count_data = top_count.data(); DeformablePSROIPoolBackwardAccKernel<<>>( count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, group_size, part_size, num_classes, channels_each_class); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); } } ================================================ FILE: maskrcnn_benchmark/csrc/cuda/nms.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include #include #include #include #include #include int const threadsPerBlock = sizeof(unsigned long long) * 8; __device__ inline float devIoU(float const * const a, float const * const b) { float left = max(a[0], b[0]), right = min(a[2], b[2]); float top = max(a[1], b[1]), bottom = min(a[3], b[3]); float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); float interS = width * height; float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); return interS / (Sa + Sb - interS); } __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, const float *dev_boxes, unsigned long long *dev_mask) { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); __shared__ float block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const float *cur_box = dev_boxes + cur_box_idx * 5; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { t |= 1ULL << i; } } const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } // boxes is a N x 5 tensor at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { using scalar_t = float; AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); auto scores = boxes.select(1, 4); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto boxes_sorted = boxes.index_select(0, order_t); int boxes_num = boxes.size(0); const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); scalar_t* boxes_dev = boxes_sorted.data(); THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState unsigned long long* mask_dev = NULL; //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, // boxes_num * col_blocks * sizeof(unsigned long long))); mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), THCCeilDiv(boxes_num, threadsPerBlock)); dim3 threads(threadsPerBlock); nms_kernel<<>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev); std::vector mask_host(boxes_num * col_blocks); THCudaCheck(cudaMemcpy(&mask_host[0], mask_dev, sizeof(unsigned long long) * boxes_num * col_blocks, cudaMemcpyDeviceToHost)); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); int64_t* keep_out = keep.data(); int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { int nblock = i / threadsPerBlock; int inblock = i % threadsPerBlock; if (!(remv[nblock] & (1ULL << inblock))) { keep_out[num_to_keep++] = i; unsigned long long *p = &mask_host[0] + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv[j] |= p[j]; } } } THCudaFree(state, mask_dev); // TODO improve this part return std::get<0>(order_t.index({ keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( order_t.device(), keep.scalar_type()) }).sort(0, false)); } ================================================ FILE: maskrcnn_benchmark/csrc/cuda/vision.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include at::Tensor SigmoidFocalLoss_forward_cuda( const at::Tensor& logits, const at::Tensor& targets, const int num_classes, const float gamma, const float alpha); at::Tensor SigmoidFocalLoss_backward_cuda( const at::Tensor& logits, const at::Tensor& targets, const at::Tensor& d_losses, const int num_classes, const float gamma, const float alpha); at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio); at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio); std::tuple ROIPool_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width); at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, const at::Tensor& input, const at::Tensor& rois, const at::Tensor& argmax, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width); at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, at::Tensor offset, at::Tensor output, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step); int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradInput, at::Tensor gradOffset, at::Tensor weight, at::Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step); int deform_conv_backward_parameters_cuda( at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradWeight, // at::Tensor gradBias, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, float scale, int im2col_step); void modulated_deform_conv_cuda_forward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, int kernel_h, int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int deformable_group, const bool with_bias); void modulated_deform_conv_cuda_backward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor columns, at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, const bool with_bias); void deform_psroi_pooling_cuda_forward( at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, at::Tensor top_count, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std); void deform_psroi_pooling_cuda_backward( at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std); at::Tensor compute_flow_cuda(const at::Tensor& boxes, const int height, const int width); ================================================ FILE: maskrcnn_benchmark/csrc/deform_conv.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif // Interface for Python int deform_conv_forward( at::Tensor input, at::Tensor weight, at::Tensor offset, at::Tensor output, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return deform_conv_forward_cuda( input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, im2col_step ); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } int deform_conv_backward_input( at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradInput, at::Tensor gradOffset, at::Tensor weight, at::Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return deform_conv_backward_input_cuda( input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, im2col_step ); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } int deform_conv_backward_parameters( at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradWeight, // at::Tensor gradBias, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, float scale, int im2col_step) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return deform_conv_backward_parameters_cuda( input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, scale, im2col_step ); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } void modulated_deform_conv_forward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, int kernel_h, int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int deformable_group, const bool with_bias) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return modulated_deform_conv_cuda_forward( input, weight, bias, ones, offset, mask, output, columns, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, deformable_group, with_bias ); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } void modulated_deform_conv_backward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor columns, at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, const bool with_bias) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return modulated_deform_conv_cuda_backward( input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight, grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, deformable_group, with_bias ); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: maskrcnn_benchmark/csrc/deform_pool.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif // Interface for Python void deform_psroi_pooling_forward( at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, at::Tensor top_count, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return deform_psroi_pooling_cuda_forward( input, bbox, trans, out, top_count, no_trans, spatial_scale, output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std ); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } void deform_psroi_pooling_backward( at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return deform_psroi_pooling_cuda_backward( out_grad, input, bbox, trans, top_count, input_grad, trans_grad, no_trans, spatial_scale, output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std ); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: maskrcnn_benchmark/csrc/nms.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif at::Tensor nms(const at::Tensor& dets, const at::Tensor& scores, const float threshold) { if (dets.type().is_cuda()) { #ifdef WITH_CUDA // TODO raise error if not compiled with CUDA if (dets.numel() == 0) return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); auto b = at::cat({dets, scores.unsqueeze(1)}, 1); return nms_cuda(b, threshold); #else AT_ERROR("Not compiled with GPU support"); #endif } at::Tensor result = nms_cpu(dets, scores, threshold); return result; } ================================================ FILE: maskrcnn_benchmark/csrc/vision.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include "nms.h" #include "ROIAlign.h" #include "ROIPool.h" #include "SigmoidFocalLoss.h" #include "deform_conv.h" #include "deform_pool.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("nms", &nms, "non-maximum suppression"); m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); // dcn-v2 m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward"); m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input"); m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, "deform_conv_backward_parameters"); m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward, "modulated_deform_conv_forward"); m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward"); m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, "deform_psroi_pooling_forward"); m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, "deform_psroi_pooling_backward"); } ================================================ FILE: maskrcnn_benchmark/data/README.md ================================================ # Setting Up Datasets This file describes how to perform training on other datasets. Only Pascal VOC dataset can be loaded from its original format and be outputted to Pascal style results currently. We expect the annotations from other datasets be converted to COCO json format, and the output will be in COCO-style. (i.e. AP, AP50, AP75, APs, APm, APl for bbox and segm) ## Creating Symlinks for PASCAL VOC We assume that your symlinked `datasets/voc/VOC` directory has the following structure: ``` VOC |_ JPEGImages | |_ .jpg | |_ ... | |_ .jpg |_ Annotations | |_ pascal_train.json (optional) | |_ pascal_val.json (optional) | |_ pascal_test.json (optional) | |_ .xml | |_ ... | |_ .xml |_ VOCdevkit ``` Create symlinks for `voc/VOC`: ``` cd ~/github/maskrcnn-benchmark mkdir -p datasets/voc/VOC ln -s /path/to/VOC /datasets/voc/VOC ``` Example configuration files for PASCAL VOC could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/pascal_voc/). ### PASCAL VOC Annotations in COCO Format To output COCO-style evaluation result, PASCAL VOC annotations in COCO json format is required and could be downloaded from [here](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip) via http://cocodataset.org/#external. ## Creating Symlinks for Cityscapes: We assume that your symlinked `datasets/cityscapes` directory has the following structure: ``` cityscapes |_ images | |_ .jpg | |_ ... | |_ .jpg |_ annotations | |_ instanceonly_gtFile_train.json | |_ ... |_ raw |_ gtFine |_ ... |_ README.md ``` Create symlinks for `cityscapes`: ``` cd ~/github/maskrcnn-benchmark mkdir -p datasets/cityscapes ln -s /path/to/cityscapes datasets/data/cityscapes ``` ### Steps to convert Cityscapes Annotations to COCO Format 1. Download gtFine_trainvaltest.zip from https://www.cityscapes-dataset.com/downloads/ (login required) 2. Extract it to /path/to/gtFine_trainvaltest ``` cityscapes |_ gtFine_trainvaltest.zip |_ gtFine_trainvaltest |_ gtFine ``` 3. Run the below commands to convert the annotations ``` cd ~/github git clone https://github.com/mcordts/cityscapesScripts.git cd cityscapesScripts cp ~/github/maskrcnn-benchmark/tools/cityscapes/instances2dict_with_polygons.py cityscapesscripts/evaluation python setup.py install cd ~/github/maskrcnn-benchmark python tools/cityscapes/convert_cityscapes_to_coco.py --datadir /path/to/cityscapes --outdir /path/to/cityscapes/annotations ``` Example configuration files for Cityscapes could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/cityscapes/). ================================================ FILE: maskrcnn_benchmark/data/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .build import make_data_loader ================================================ FILE: maskrcnn_benchmark/data/build.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import bisect import copy import logging import torch.utils.data from maskrcnn_benchmark.utils.comm import get_world_size from maskrcnn_benchmark.utils.imports import import_file from maskrcnn_benchmark.utils.miscellaneous import save_labels from . import datasets as D from . import samplers from .collate_batch import BatchCollator, BBoxAugCollator from .transforms import build_transforms def build_dataset(dataset_list, transforms, dataset_catalog, is_train=True): """ Arguments: dataset_list (list[str]): Contains the names of the datasets, i.e., coco_2014_train, coco_2014_val, etc transforms (callable): transforms to apply to each (image, target) sample dataset_catalog (DatasetCatalog): contains the information on how to construct a dataset. is_train (bool): whether to setup the dataset for training or testing """ if not isinstance(dataset_list, (list, tuple)): raise RuntimeError( "dataset_list should be a list of strings, got {}".format(dataset_list) ) datasets = [] for dataset_name in dataset_list: data = dataset_catalog.get(dataset_name) factory = getattr(D, data["factory"]) args = data["args"] # for COCODataset, we want to remove images without annotations # during training if data["factory"] == "COCODataset": args["remove_images_without_annotations"] = is_train if data["factory"] == "PascalVOCDataset": args["use_difficult"] = not is_train args["transforms"] = transforms # make dataset from factory dataset = factory(**args) datasets.append(dataset) # for testing, return a list of datasets if not is_train: return datasets # for training, concatenate all datasets into a single one dataset = datasets[0] if len(datasets) > 1: dataset = D.ConcatDataset(datasets) return [dataset] def make_data_sampler(dataset, shuffle, distributed): if distributed: return samplers.DistributedSampler(dataset, shuffle=shuffle) if shuffle: sampler = torch.utils.data.sampler.RandomSampler(dataset) else: sampler = torch.utils.data.sampler.SequentialSampler(dataset) return sampler def _quantize(x, bins): bins = copy.copy(bins) bins = sorted(bins) quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) return quantized def _compute_aspect_ratios(dataset): aspect_ratios = [] for i in range(len(dataset)): img_info = dataset.get_img_info(i) aspect_ratio = float(img_info["height"]) / float(img_info["width"]) aspect_ratios.append(aspect_ratio) return aspect_ratios def make_batch_data_sampler( dataset, sampler, aspect_grouping, images_per_batch, num_iters=None, start_iter=0 ): if aspect_grouping: if not isinstance(aspect_grouping, (list, tuple)): aspect_grouping = [aspect_grouping] aspect_ratios = _compute_aspect_ratios(dataset) group_ids = _quantize(aspect_ratios, aspect_grouping) batch_sampler = samplers.GroupedBatchSampler( sampler, group_ids, images_per_batch, drop_uneven=False ) else: batch_sampler = torch.utils.data.sampler.BatchSampler( sampler, images_per_batch, drop_last=False ) if num_iters is not None: batch_sampler = samplers.IterationBasedBatchSampler( batch_sampler, num_iters, start_iter ) return batch_sampler def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, is_for_period=False): num_gpus = get_world_size() if is_train: images_per_batch = cfg.SOLVER.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format( images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = True num_iters = cfg.SOLVER.MAX_ITER else: images_per_batch = cfg.TEST.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format( images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = False if not is_distributed else True num_iters = None start_iter = 0 if images_per_gpu > 1: logger = logging.getLogger(__name__) logger.warning( "When using more than one image per GPU you may encounter " "an out-of-memory (OOM) error if your GPU does not have " "sufficient memory. If this happens, you can reduce " "SOLVER.IMS_PER_BATCH (for training) or " "TEST.IMS_PER_BATCH (for inference). For training, you must " "also adjust the learning rate and schedule length according " "to the linear scaling rule. See for example: " "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" ) # group images which have similar aspect ratio. In this case, we only # group in two cases: those with width / height > 1, and the other way around, # but the code supports more general grouping strategy aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] paths_catalog = import_file( "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True ) DatasetCatalog = paths_catalog.DatasetCatalog dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST # If bbox aug is enabled in testing, simply set transforms to None and we will apply transforms later transforms = None if not is_train and cfg.TEST.BBOX_AUG.ENABLED else build_transforms(cfg, is_train) datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train or is_for_period) if is_train: # save category_id to label name mapping save_labels(datasets, cfg.OUTPUT_DIR) data_loaders = [] for dataset in datasets: sampler = make_data_sampler(dataset, shuffle, is_distributed) batch_sampler = make_batch_data_sampler( dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter ) collator = BBoxAugCollator() if not is_train and cfg.TEST.BBOX_AUG.ENABLED else \ BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) num_workers = cfg.DATALOADER.NUM_WORKERS data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=collator, ) data_loaders.append(data_loader) if is_train or is_for_period: # during training, a single (possibly concatenated) data_loader is returned assert len(data_loaders) == 1 return data_loaders[0] return data_loaders ================================================ FILE: maskrcnn_benchmark/data/collate_batch.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from maskrcnn_benchmark.structures.image_list import to_image_list class BatchCollator(object): """ From a list of samples from the dataset, returns the batched images and targets. This should be passed to the DataLoader """ def __init__(self, size_divisible=0): self.size_divisible = size_divisible def __call__(self, batch): transposed_batch = list(zip(*batch)) images = to_image_list(transposed_batch[0], self.size_divisible) targets = transposed_batch[1] img_ids = transposed_batch[2] return images, targets, img_ids class BBoxAugCollator(object): """ From a list of samples from the dataset, returns the images and targets. Images should be converted to batched images in `im_detect_bbox_aug` """ def __call__(self, batch): return list(zip(*batch)) ================================================ FILE: maskrcnn_benchmark/data/datasets/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .coco import COCODataset from .voc import PascalVOCDataset from .concat_dataset import ConcatDataset from .abstract import AbstractDataset from .cityscapes import CityScapesDataset __all__ = [ "COCODataset", "ConcatDataset", "PascalVOCDataset", "AbstractDataset", "CityScapesDataset", ] ================================================ FILE: maskrcnn_benchmark/data/datasets/abstract.py ================================================ import torch class AbstractDataset(torch.utils.data.Dataset): """ Serves as a common interface to reduce boilerplate and help dataset customization A generic Dataset for the maskrcnn_benchmark must have the following non-trivial fields / methods implemented: CLASSES - list/tuple: A list of strings representing the classes. It must have "__background__" as its 0th element for correct id mapping. __getitem__ - function(idx): This has to return three things: img, target, idx. img is the input image, which has to be load as a PIL Image object implementing the target requires the most effort, since it must have multiple fields: the size, bounding boxes, labels (contiguous), and masks (either COCO-style Polygons, RLE or torch BinaryMask). Usually the target is a BoxList instance with extra fields. Lastly, idx is simply the input argument of the function. also the following is required: __len__ - function(): return the size of the dataset get_img_info - function(idx): return metadata, at least width and height of the input image """ def __init__(self, *args, **kwargs): self.name_to_id = None self.id_to_name = None def __getitem__(self, idx): raise NotImplementedError def initMaps(self): """ Can be called optionally to initialize the id<->category name mapping Initialize default mapping between: class <==> index class: this is a string that represents the class index: positive int, used directly by the ROI heads. NOTE: make sure that the background is always indexed by 0. "__background__" <==> 0 if initialized by hand, double check that the indexing is correct. """ assert isinstance(self.CLASSES, (list, tuple)) assert self.CLASSES[0] == "__background__" cls = self.CLASSES self.name_to_id = dict(zip(cls, range(len(cls)))) self.id_to_name = dict(zip(range(len(cls)), cls)) def get_img_info(self, index): raise NotImplementedError def __len__(self): raise NotImplementedError ================================================ FILE: maskrcnn_benchmark/data/datasets/cityscapes.py ================================================ import os import glob import json from PIL import Image import numpy as np import torch import torchvision from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask from .abstract import AbstractDataset from cityscapesscripts.helpers import csHelpers class CityScapesDataset(AbstractDataset): def __init__( self, img_dir, ann_dir, split, mode="mask", transforms=None, min_area=0, mini=None, ): """ Arguments: img_dir: /path/to/leftImg8bit/ has to contain {train,val,test} ann_dir: /path/to/gtFine/ has to contain {train,val,test} split: "train" or "val" or "test" mode: "poly" or "mask", which annotation format to use transforms: apply transformations to input/annotation min_area: exclude intances below a specific area (bbox area) mini: limit the size of the dataset, so len(dataset) == mini for debugging purposes """ assert split in ["train", "val", "test"] img_dir = os.path.abspath(os.path.join(img_dir, split)) ann_dir = os.path.abspath(os.path.join(ann_dir, split)) assert os.path.exists(img_dir), img_dir assert os.path.exists(ann_dir), ann_dr self.ann_dir = ann_dir self.split = split self.CLASSES = ["__background__"] self.CLASSES += [l.name for l in csHelpers.labels if l.hasInstances] # Adds name_to_id and id_to_name mapping self.initMaps() # This is required for parsing binary masks self.cityscapesID_to_ind = { l.id: self.name_to_id[l.name] for l in csHelpers.labels if l.hasInstances } self.transforms = transforms self.min_area = int(min_area) img_pattern = os.path.join(img_dir, "*", "*_leftImg8bit.png") img_paths = sorted(glob.glob(img_pattern)) if mode == "mask": ann_pattern = os.path.join(ann_dir, "*", "*_instanceIds.png") elif mode == "poly": ann_pattern = os.path.join(ann_dir, "*", "*_polygons.json") else: raise NotImplementedError("Mode is not implemented yet: %s" % mode) self.mode = mode ann_paths = sorted(glob.glob(ann_pattern)) if mini is not None: # Keep the mini dataset diverse by setting the stride img_paths = img_paths[:: len(img_paths) // mini + 1] ann_paths = ann_paths[:: len(ann_paths) // mini + 1] assert len(img_paths) == len(ann_paths) self.img_paths = img_paths self.ann_paths = ann_paths def __getitem__(self, idx): img_path = self.img_paths[idx] ann_path = self.ann_paths[idx] if self.mode == "mask": ann = torch.from_numpy(np.asarray(Image.open(ann_path))) # masks are represented with tensors boxes, segmentations, labels = self._processBinayMasks(ann) else: with open(ann_path, "r") as ann_file: ann = json.load(ann_file) # masks are represented with polygons boxes, segmentations, labels = self._processPolygons(ann) boxes, segmentations, labels = self._filterGT(boxes, segmentations, labels) if len(segmentations) == 0: empty_ann_path = self.get_img_info(idx)["ann_path"] print("EMPTY ENTRY:", empty_ann_path) # self.img_paths.pop(idx) # self.ann_paths.pop(idx) img, target, _ = self[(idx + 1) % len(self)] # just override this image with the next return img, target, idx img = Image.open(img_path) # Compose all into a BoxList instance target = BoxList(boxes, img.size, mode="xyxy") target.add_field("labels", torch.tensor(labels)) masks = SegmentationMask(segmentations, img.size, mode=self.mode) target.add_field("masks", masks) if self.transforms is not None: img, target = self.transforms(img, target) return img, target, idx def _filterGT(self, boxes, segmentations, labels): filtered_boxes = [] filtered_segmentations = [] filtered_labels = [] assert len(segmentations) == len(labels) == len(boxes) for box, segmentation, label in zip(boxes, segmentations, labels): xmin, ymin, xmax, ymax = box area = (xmax - xmin) * (ymax - ymin) if area < self.min_area: continue filtered_boxes.append(box) filtered_segmentations.append(segmentation) filtered_labels.append(label) if len(filtered_boxes) < 1: filtered_boxes = torch.empty(0, 4) return filtered_boxes, filtered_segmentations, filtered_labels def _processPolygons(self, ann): # For a single object polygon annotations are stored in CityScapes like # [[x1, y1], [x2, y2]...] and we need them in the following format: # [x1, y1, x2, y2, x3, y3 ...] polys = [] labels = [] boxes = [] def poly_to_tight_box(poly): xmin = int(min(poly[::2])) ymin = int(min(poly[1::2])) xmax = int(max(poly[::2])) ymax = int(max(poly[1::2])) bbox = xmin, ymin, xmax, ymax return bbox for inst in ann["objects"]: label = inst["label"] if label not in self.CLASSES: continue label = self.name_to_id[label] cityscapes_poly = inst["polygon"] poly = [] for xy in cityscapes_poly: # Equivalent with `poly += xy` but this is more verbose x = xy[0] y = xy[1] poly.append(x) poly.append(y) # In CityScapes instances are described with single polygons only box = poly_to_tight_box(poly) boxes.append(box) polys.append([poly]) labels.append(label) if len(boxes) < 1: boxes = torch.empty(0, 4) return boxes, polys, labels def _processBinayMasks(self, ann): boxes = [] masks = [] labels = [] def mask_to_tight_box(mask): a = mask.nonzero() bbox = [ torch.min(a[:, 1]), torch.min(a[:, 0]), torch.max(a[:, 1]), torch.max(a[:, 0]), ] bbox = list(map(int, bbox)) return bbox # xmin, ymin, xmax, ymax # Sort for consistent order between instances as the polygon annotation instIds = torch.sort(torch.unique(ann))[0] for instId in instIds: if instId < 1000: # group labels continue mask = ann == instId label = int(instId / 1000) label = self.cityscapesID_to_ind[label] box = mask_to_tight_box(mask) boxes.append(box) masks.append(mask) labels.append(label) return boxes, masks, labels def __len__(self): return len(self.img_paths) def get_img_info(self, index): # Reverse engineered from voc.py # All the images have the same size return { "height": 1024, "width": 2048, "idx": index, "img_path": self.img_paths[index], "ann_path": self.ann_paths[index], } ================================================ FILE: maskrcnn_benchmark/data/datasets/coco.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch import torchvision from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask from maskrcnn_benchmark.structures.keypoint import PersonKeypoints min_keypoints_per_image = 10 def _count_visible_keypoints(anno): return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) def _has_only_empty_bbox(anno): return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) def has_valid_annotation(anno): # if it's empty, there is no annotation if len(anno) == 0: return False # if all boxes have close to zero area, there is no annotation if _has_only_empty_bbox(anno): return False # keypoints task have a slight different critera for considering # if an annotation is valid if "keypoints" not in anno[0]: return True # for keypoint detection tasks, only consider valid images those # containing at least min_keypoints_per_image if _count_visible_keypoints(anno) >= min_keypoints_per_image: return True return False class COCODataset(torchvision.datasets.coco.CocoDetection): def __init__( self, ann_file, root, remove_images_without_annotations, transforms=None ): super(COCODataset, self).__init__(root, ann_file) # sort indices for reproducible results self.ids = sorted(self.ids) # filter images without detection annotations if remove_images_without_annotations: ids = [] for img_id in self.ids: ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None) anno = self.coco.loadAnns(ann_ids) if has_valid_annotation(anno): ids.append(img_id) self.ids = ids self.categories = {cat['id']: cat['name'] for cat in self.coco.cats.values()} self.json_category_id_to_contiguous_id = { v: i + 1 for i, v in enumerate(self.coco.getCatIds()) } self.contiguous_category_id_to_json_id = { v: k for k, v in self.json_category_id_to_contiguous_id.items() } self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} self._transforms = transforms def __getitem__(self, idx): img, anno = super(COCODataset, self).__getitem__(idx) # filter crowd annotations # TODO might be better to add an extra field anno = [obj for obj in anno if obj["iscrowd"] == 0] boxes = [obj["bbox"] for obj in anno] boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes target = BoxList(boxes, img.size, mode="xywh").convert("xyxy") classes = [obj["category_id"] for obj in anno] classes = [self.json_category_id_to_contiguous_id[c] for c in classes] classes = torch.tensor(classes) target.add_field("labels", classes) if anno and "segmentation" in anno[0]: masks = [obj["segmentation"] for obj in anno] masks = SegmentationMask(masks, img.size, mode='poly') target.add_field("masks", masks) if anno and "keypoints" in anno[0]: keypoints = [obj["keypoints"] for obj in anno] keypoints = PersonKeypoints(keypoints, img.size) target.add_field("keypoints", keypoints) target = target.clip_to_image(remove_empty=True) if self._transforms is not None: img, target = self._transforms(img, target) return img, target, idx def get_img_info(self, index): img_id = self.id_to_img_map[index] img_data = self.coco.imgs[img_id] return img_data ================================================ FILE: maskrcnn_benchmark/data/datasets/concat_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import bisect from torch.utils.data.dataset import ConcatDataset as _ConcatDataset class ConcatDataset(_ConcatDataset): """ Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra method for querying the sizes of the image """ def get_idxs(self, idx): dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) if dataset_idx == 0: sample_idx = idx else: sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] return dataset_idx, sample_idx def get_img_info(self, idx): dataset_idx, sample_idx = self.get_idxs(idx) return self.datasets[dataset_idx].get_img_info(sample_idx) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/__init__.py ================================================ from maskrcnn_benchmark.data import datasets from .coco import coco_evaluation from .voc import voc_evaluation from .cityscapes import abs_cityscapes_evaluation def evaluate(dataset, predictions, output_folder, **kwargs): """evaluate dataset using different methods based on dataset type. Args: dataset: Dataset object predictions(list[BoxList]): each item in the list represents the prediction results for one image. output_folder: output folder, to save evaluation files or results. **kwargs: other args. Returns: evaluation result """ args = dict( dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs ) if isinstance(dataset, datasets.COCODataset): return coco_evaluation(**args) elif isinstance(dataset, datasets.PascalVOCDataset): return voc_evaluation(**args) elif isinstance(dataset, datasets.AbstractDataset): return abs_cityscapes_evaluation(**args) else: dataset_name = dataset.__class__.__name__ raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/cityscapes/__init__.py ================================================ from .cityscapes_eval import do_cityscapes_evaluation def abs_cityscapes_evaluation( dataset, predictions, box_only, output_folder, iou_types, expected_results, expected_results_sigma_tol, ): return do_cityscapes_evaluation( dataset=dataset, predictions=predictions, box_only=box_only, output_folder=output_folder, iou_types=iou_types, expected_results=expected_results, expected_results_sigma_tol=expected_results_sigma_tol, ) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/cityscapes/cityscapes_eval.py ================================================ import logging import tempfile import os import torch from collections import OrderedDict from tqdm import tqdm from copy import deepcopy import torch import numpy as np from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou from maskrcnn_benchmark.data.datasets.evaluation.cityscapes import eval_instances from cityscapesscripts.helpers.csHelpers import writeDict2JSON, ensurePath def do_cityscapes_evaluation( dataset, predictions, box_only, output_folder, iou_types, expected_results, expected_results_sigma_tol, ): logger = logging.getLogger("maskrcnn_benchmark.inference") logger.info(f"CityScapes evaluation on [{dataset}]:") # Set default args for evaluation args = deepcopy(eval_instances.defaultArgs) # Set output folder output_folder = os.path.join(output_folder, "evaluationResults") ensurePath(output_folder) # Set custom fields args.exportMatchFile = os.path.join(output_folder, "matches.json") args.exportBoxFile = os.path.join(output_folder, "boxResult.json") args.exportMaskFile = os.path.join(output_folder, "maskResult.json") args.instLabels = list(dataset.CLASSES) logger.info("Evaluation arguments") logger.info("%s" % args) logger.info("Matching GT instances with Predictions") if "bbox" in iou_types or "segm" in iou_types: # Match and compute IoU of mask and box in one iteration: matches = eval_instances.matchGtsWithPreds(dataset, predictions) writeDict2JSON(matches, args.exportMatchFile) else: NotImplementedError(f"IoU type not implemented {iou_types}") # printing strResults = "" if "bbox" in iou_types: # evaluate matches logger.info("Evaluating BBox matches") boxApScores = eval_instances.evaluateBoxMatches(matches, args) # averages logger.info("Average Box scores") boxAvgDict = eval_instances.computeAverages(boxApScores, args) # logging boxResDict = eval_instances.prepareJSONDataForResults( boxAvgDict, boxApScores, args ) if args.JSONOutput: # create output folder if necessary path = os.path.dirname(args.exportBoxFile) ensurePath(path) # Write APs to JSON eval_instances.writeDict2JSON(boxResDict, args.exportBoxFile) strBoxResults = eval_instances.printResults(boxAvgDict, args) strResults += "\nBBox\n" + strBoxResults if "segm" in iou_types: # evaluate matches logger.info("Evaluating Mask matches") maskApScores = eval_instances.evaluateMaskMatches(matches, args) # averages logger.info("Average Mask scores") maskAvgDict = eval_instances.computeAverages(maskApScores, args) # logging maskResDict = eval_instances.prepareJSONDataForResults( maskAvgDict, maskApScores, args ) if args.JSONOutput: # create output folder if necessary path = os.path.dirname(args.exportMaskFile) ensurePath(path) # Write APs to JSON eval_instances.writeDict2JSON(maskResDict, args.exportMaskFile) strMaskResults = eval_instances.printResults(maskAvgDict, args) strResults += "\nMask\n" + strMaskResults logger.info(strResults) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/cityscapes/eval_instances.py ================================================ #!/usr/bin/python # # The evaluation script for instance-level semantic labeling. # We use this script to evaluate your approach on the test set. # You can use the script to evaluate on the validation set. # # Please check the description of the "getPrediction" method below # and set the required environment variables as needed, such that # this script can locate your results. # If the default implementation of the method works, then it's most likely # that our evaluation server will be able to process your results as well. # # To run this script, make sure that your results contain text files # (one for each test set image) with the content: # relPathPrediction1 labelIDPrediction1 confidencePrediction1 # relPathPrediction2 labelIDPrediction2 confidencePrediction2 # relPathPrediction3 labelIDPrediction3 confidencePrediction3 # ... # # - The given paths "relPathPrediction" point to images that contain # binary masks for the described predictions, where any non-zero is # part of the predicted instance. The paths must not contain spaces, # must be relative to the root directory and must point to locations # within the root directory. # - The label IDs "labelIDPrediction" specify the class of that mask, # encoded as defined in labels.py. Note that the regular ID is used, # not the train ID. # - The field "confidencePrediction" is a float value that assigns a # confidence score to the mask. # # Note that this tool creates a file named "gtInstances.json" during its # first run. This file helps to speed up computation and should be deleted # whenever anything changes in the ground truth annotations or anything # goes wrong. # python imports from __future__ import print_function, absolute_import, division import os, sys import fnmatch from copy import deepcopy import io from contextlib import redirect_stdout from tqdm import tqdm import torch import logging import numpy as np from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker from maskrcnn_benchmark.layers.misc import interpolate # Cityscapes imports from cityscapesscripts.helpers.csHelpers import writeDict2JSON from cityscapesscripts.helpers.csHelpers import ensurePath from cityscapesscripts.helpers.csHelpers import colors, getColorEntry ###################### # Parameters ###################### # A dummy class to collect all bunch of data class CArgs(object): def __repr__(self): """ A weird looking pretty print for Evaluation Arguments """ longest_key = max([len(str(k)) for k in self.__dict__.keys()]) longest_val = max([len(str(v)) for v in self.__dict__.values()]) s = "\n" + "#" * max(79, (longest_key + longest_val + 3)) + "\n" for k, v in self.__dict__.items(): s += "%{}s : %s\n".format(longest_key) % (k, v) s += "#" * max(79, (longest_key + longest_val + 3)) + "\n" return s # And a global object of that class defaultArgs = CArgs() # Parameters that should be modified by user defaultArgs.exportBoxFile = os.path.join("evaluationResults", "boxResult.json") defaultArgs.exportMaskFile = os.path.join("evaluationResults", "maskResult.json") # overlaps for evaluation defaultArgs.overlaps = np.arange(0.5, 1.0, 0.05) # minimum region size for evaluation [pixels] defaultArgs.minRegionSizes = np.array([100]) # defaultArgs.minRegionSizes = np.array( [ 400 ] ) defaultArgs.JSONOutput = True defaultArgs.quiet = False defaultArgs.csv = False defaultArgs.colorized = True defaultArgs.instLabels = [] def matchGtsWithPreds(dataset, predictions): """ Go through the `dataset` and `predictions` one-by-one, and list all instances with any non-zero intersection. This function handles matching when only BBoxes are used, and when instnace segmentation is available it computes the pixel-wise overlap as well The implementation is heavily based on the original CityScapes eval script: https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py Original match structure looks like: {"filename1": "groundTruth":gtInstances "prediction":predInstances } # Filenames are not necessary, replace them with idx Instances= { "category_name1":[Instance1, Instance2, ...] "category_name2":[Instance3, Instance4, ...] ... } gtInstance= { "labelID":int(labelID) "instID":int(instID) "boxArea":np.count_nonzero(npArray binary mask) "intersection": pixel count (ONLY IF the dict is in the inner list of a predInstance["matchedGt"]) "voidIntersection":REMOVE THIS!!! "matchedPred":list(predInstance) which has nonzero intersection } predInstance= { "imgName":"path/to/input/img" "predID": "labelID":int(labelID) "boxArea":pixel count (ONLY IF the dict is in the inner list of a predInstance["matchedGt"]) "confidence":float(confidence) "intersection":np.count_nonzero( np.logical_and( gtNp == gtInstance["instID"] , boolPredInst) ) "voidIntersection":REMOVE THIS!!! "matchedGt":list(gtInstance) which has nonzero intersection } """ assert len(dataset) == len(predictions), f"{len(dataset)} != {len(predictions)}" matches = [] for idx in tqdm(range(len(predictions)), desc="Matching Preds with GT"): matches.append(matchGtWithPred(dataset, predictions, idx)) return matches def isOverlapping(box1, box2): x1min, y1min, x1max, y1max = box1 x2min, y2min, x2max, y2max = box2 ret = x1min < x2max and x2min < x1max and y1min < y2max and y2min < y1max return ret def getUnionBox(box1, box2): x1min, y1min, x1max, y1max = map(int, box1) x2min, y2min, x2max, y2max = map(int, box2) xmin = min(x1min, x2min) ymin = min(y1min, y2min) xmax = max(x1max, x2max) ymax = max(y1max, y2max) unionBox = xmin, ymin, xmax, ymax return unionBox def getIntersectionBox(box1, box2): x1min, y1min, x1max, y1max = map(int, box1) x2min, y2min, x2max, y2max = map(int, box2) xmin = max(x1min, x2min) ymin = max(y1min, y2min) xmax = min(x1max, x2max) ymax = min(y1max, y2max) intersectionBox = xmin, ymin, xmax, ymax return intersectionBox def computeBoxIntersection(gt, pred): """ Compute intersection between GT instance and prediction. """ xmin, ymin, xmax, ymax = getIntersectionBox(gt["box"], pred["box"]) intersection = (xmax - xmin) * (ymax - ymin) return intersection def computeMaskIntersection(gt, gtMask, pred, predMask): """ Compute intersection between GT instance and prediction. Increase efficiency by computing elementwise product between masks only inside the tight bounding box of the union of the prediction and target masks. """ if gtMask is None or predMask is None: return 0 assert gtMask.shape == predMask.shape assert len(gtMask.shape) == len(predMask.shape) == 2 xmin, ymin, xmax, ymax = getUnionBox(gt["box"], pred["box"]) gtMask_crop = gtMask[ymin:ymax, xmin:xmax] predMask_crop = predMask[ymin:ymax, xmin:xmax] # elementwise AND intersection = torch.sum(torch.mul(gtMask_crop, predMask_crop)).item() return intersection def matchGtWithPred(dataset, predictions, idx): # Collect instances from gt and pred separately per image # TODO: not parallel! parallelize this process safely perImgGtInstances, gtMasks = prepareGtImage(dataset, idx) perImgPredInstances, predMasks = preparePredImage(dataset, predictions, idx) # If no masks are provided, the segmentation score will be 0 for gt, gtMask in zip(perImgGtInstances, gtMasks): for pred, predMask in zip(perImgPredInstances, predMasks): if not isOverlapping(gt["box"], pred["box"]): continue boxIntersection = computeBoxIntersection(gt, pred) maskIntersection = computeMaskIntersection(gt, gtMask, pred, predMask) if boxIntersection > 0: # Copy metadata only, and register the matched pairs # this step is redundant but informative # intersection score would be enough gtCopy = gt.copy() predCopy = pred.copy() # remove linking field (an empty list) to avoid confusion gtCopy.pop("matchedPred") predCopy.pop("matchedGt") gtCopy["boxIntersection"] = boxIntersection gtCopy["maskIntersection"] = maskIntersection predCopy["boxIntersection"] = boxIntersection predCopy["maskIntersection"] = maskIntersection gt["matchedPred"].append(predCopy) pred["matchedGt"].append(gtCopy) # Group by classes groupedGtInstances = {labelName: [] for labelName in dataset.CLASSES} groupedPredInstances = {labelName: [] for labelName in dataset.CLASSES} for gt in perImgGtInstances: gtLabelName = dataset.id_to_name[gt["labelID"]] groupedGtInstances[gtLabelName].append(gt) for pred in perImgPredInstances: predLabelName = dataset.id_to_name[pred["labelID"]] groupedPredInstances[predLabelName].append(pred) match = {"groundTruth": groupedGtInstances, "prediction": groupedPredInstances} return match def prepareGtImage(dataset, idx): _, perImageGts, _ = dataset[idx] perImageInstances = [] maskTensor = [None] * len(perImageGts) if len(perImageGts) == 0: return perImageInstances, maskTensor # Resize to original image size imgInfo = dataset.get_img_info(idx) origSize = imgInfo["width"], imgInfo["height"] if perImageGts.size != origSize: perImageGts = perImageGts.resize(size=origSize) # Compute box areas perImageGts = perImageGts.convert("xyxy") bbs = perImageGts.bbox.long() xmins, ymins, xmaxs, ymaxs = bbs[:, 0], bbs[:, 1], bbs[:, 2], bbs[:, 3] boxAreas = ((xmaxs - xmins) * (ymaxs - ymins)).tolist() bbs = bbs.tolist() # object label for each prediction labels = perImageGts.get_field("labels").tolist() if "masks" in perImageGts.fields(): # Get the binary mask for each instance in a contiguous array maskTensor = perImageGts.get_field("masks").get_mask_tensor() # In case of single mask then add a new axis if len(maskTensor.shape) == 2: maskTensor = maskTensor[None] # unique_values = set(torch.unique(maskTensor).tolist()) # assert len(unique_values) == 2, "Not binary mask: %s" % unique_values # pixelCounts = maskTensor.clamp_(0, 1).sum(dim=[1, 2]) pixelCounts = [] for (xmin, ymin, xmax, ymax), instanceMask in zip(bbs, maskTensor): pixelCounts.append(instanceMask[ymin:ymax, xmin:xmax].sum().item()) for instID in range(len(perImageGts)): xmin, ymin, xmax, ymax = bbs[instID] pixelCount = pixelCounts[instID] if maskTensor[0] is not None else 0 gtInstance = { "labelID": labels[instID], "instID": instID, "boxArea": boxAreas[instID], "pixelCount": pixelCount, "box": (xmin, ymin, xmax, ymax), "matchedPred": [], } perImageInstances.append(gtInstance) return perImageInstances, maskTensor def preparePredImage(dataset, predictions, idx): perImagePredictions = predictions[idx] # A list will hold statistics and meta-data about the image perImageInstances = [] # maskTensor represents binary masks of all predicted instance segmentations # if present maskTensor = [None] * len(perImagePredictions) # No predictions for this image if len(perImagePredictions) == 0: return perImageInstances, maskTensor # Resize to original image size imgInfo = dataset.get_img_info(idx) origSize = imgInfo["width"], imgInfo["height"] if perImagePredictions.size != origSize: perImagePredictions = perImagePredictions.resize(size=origSize) # Bounding boxes and areas perImagePredictions = perImagePredictions.convert("xyxy") bbs = perImagePredictions.bbox.long() xmins, ymins, xmaxs, ymaxs = bbs[:, 0], bbs[:, 1], bbs[:, 2], bbs[:, 3] boxAreas = ((xmaxs - xmins) * (ymaxs - ymins)).tolist() bbs = bbs.tolist() # object label and "Objectness" score for each prediction labels = perImagePredictions.get_field("labels").tolist() scores = perImagePredictions.get_field("scores").tolist() # Get the mask for each instance in a contiguous array if "mask" in perImagePredictions.fields(): maskTensor = perImagePredictions.get_field("mask") # sanity checks assert len(perImagePredictions) == len(maskTensor), ( "number of masks (%d) do not match the number of boxes (%d)" % (len(perImagePredictions), len(maskTensor)) ) maskTensor = maskTensor.float() # We assume that the maskTensors are coming right out of the maskRCNN # having values between [0, 1] inclusive # # assert maskTensor.min() >= 0.0 and maskTensor.max() <= 1.0, [ # maskTensor.max(), # maskTensor.min(), # ] # Project masks to the boxes # TODO: Issue #527 - bad Masker interface # # The predicted masks are in the shape of i.e. [N, 1, 28, 28] where N is # the number of instances predicted, and they represent the interior # of the bounding boxes. # # Masker projects these predictions on an empty canvas with the full # size of the input image using the predicted bounding boxes maskTensor = Masker(threshold=0.5).forward_single_image( maskTensor, perImagePredictions )[:, 0, :, :] pixelCounts = [] for (xmin, ymin, xmax, ymax), instanceMask in zip(bbs, maskTensor): pixelCounts.append(instanceMask[ymin:ymax, xmin:xmax].sum().item()) for predID in range(len(perImagePredictions)): xmin, ymin, xmax, ymax = bbs[predID] # if we have instance segmentation prediction then we update pixelCount pixelCount = 0 if maskTensor[0] is not None: pixelCount = pixelCounts[predID] if pixelCount == 0: continue predInstance = { "imgName": idx, "predID": predID, "labelID": labels[predID], "boxArea": boxAreas[predID], "pixelCount": pixelCount, "confidence": scores[predID], "box": (xmin, ymin, xmax, ymax), "matchedGt": [], } perImageInstances.append(predInstance) return perImageInstances, maskTensor def evaluateBoxMatches(matches, args): # In the end, we need two vectors for each class and for each overlap # The first vector (y_true) is binary and is 1, where the ground truth says true, # and is 0 otherwise. # The second vector (y_score) is float [0...1] and represents the confidence of # the prediction. # # We represent the following cases as: # | y_true | y_score # gt instance with matched prediction | 1 | confidence # gt instance w/o matched prediction | 1 | 0.0 # false positive prediction | 0 | confidence # # The current implementation makes only sense for an overlap threshold >= 0.5, # since only then, a single prediction can either be ignored or matched, but # never both. Further, it can never match to two gt instances. # For matching, we vary the overlap and do the following steps: # 1.) remove all predictions that satisfy the overlap criterion with an ignore region (either void or *group) # 2.) remove matches that do not satisfy the overlap # 3.) mark non-matched predictions as false positive # AP overlaps = args.overlaps # region size minRegionSizes = args.minRegionSizes # only keep the first, if distances are not available # if not args.distanceAvailable: # minRegionSizes = [ minRegionSizes[0] ] # distThs = [ distThs [0] ] # distConfs = [ distConfs [0] ] # Here we hold the results # First dimension is class, second overlap ap = np.zeros((len(minRegionSizes), len(args.instLabels), len(overlaps)), np.float) for dI, minRegionSize in enumerate(minRegionSizes): for (oI, overlapTh) in enumerate(overlaps): for (lI, labelName) in enumerate(args.instLabels): y_true = np.empty(0) y_score = np.empty(0) # count hard false negatives hardFns = 0 # found at least one gt and predicted instance? haveGt = False havePred = False for img in matches: predInstances = img["prediction"][labelName] gtInstances = img["groundTruth"][labelName] # filter groups in ground truth gtInstances = [ gt for gt in gtInstances if gt["boxArea"] >= minRegionSize ] if gtInstances: haveGt = True if predInstances: havePred = True curTrue = np.ones(len(gtInstances)) curScore = np.ones(len(gtInstances)) * (-float("inf")) curMatch = np.zeros(len(gtInstances), dtype=np.bool) # collect matches for (gtI, gt) in enumerate(gtInstances): foundMatch = False for pred in gt["matchedPred"]: overlap = float(pred["boxIntersection"]) / ( gt["boxArea"] + pred["boxArea"] - pred["boxIntersection"] ) if overlap > overlapTh: # the score confidence = pred["confidence"] # if we already hat a prediction for this groundtruth # the prediction with the lower score is automatically a false positive if curMatch[gtI]: maxScore = max(curScore[gtI], confidence) minScore = min(curScore[gtI], confidence) curScore[gtI] = maxScore # append false positive curTrue = np.append(curTrue, 0) curScore = np.append(curScore, minScore) curMatch = np.append(curMatch, True) # otherwise set score else: foundMatch = True curMatch[gtI] = True curScore[gtI] = confidence if not foundMatch: hardFns += 1 # remove non-matched ground truth instances curTrue = curTrue[curMatch == True] curScore = curScore[curMatch == True] # collect non-matched predictions as false positive for pred in predInstances: foundGt = False for gt in pred["matchedGt"]: overlap = float(gt["boxIntersection"]) / ( gt["boxArea"] + pred["boxArea"] - gt["boxIntersection"] ) if overlap > overlapTh: foundGt = True break if not foundGt: # collect number of void and *group pixels nbIgnorePixels = 0 for gt in pred["matchedGt"]: # small ground truth instances if gt["boxArea"] < minRegionSize: nbIgnorePixels += gt["boxIntersection"] if pred["boxArea"] > 0: proportionIgnore = ( float(nbIgnorePixels) / pred["boxArea"] ) else: proportionIgnore = 0 # if not ignored # append false positive if proportionIgnore <= overlapTh: curTrue = np.append(curTrue, 0) confidence = pred["confidence"] curScore = np.append(curScore, confidence) # append to overall results y_true = np.append(y_true, curTrue) y_score = np.append(y_score, curScore) # compute the average precision if haveGt and havePred: # compute precision recall curve first # sorting and cumsum scoreArgSort = np.argsort(y_score) yScoreSorted = y_score[scoreArgSort] yTrueSorted = y_true[scoreArgSort] yTrueSortedCumsum = np.cumsum(yTrueSorted) # unique thresholds (thresholds, uniqueIndices) = np.unique( yScoreSorted, return_index=True ) # since we need to add an artificial point to the precision-recall curve # increase its length by 1 nbPrecRecall = len(uniqueIndices) + 1 # prepare precision recall nbExamples = len(yScoreSorted) nbTrueExamples = yTrueSortedCumsum[-1] precision = np.zeros(nbPrecRecall) recall = np.zeros(nbPrecRecall) # deal with the first point # only thing we need to do, is to append a zero to the cumsum at the end. # an index of -1 uses that zero then yTrueSortedCumsum = np.append(yTrueSortedCumsum, 0) # deal with remaining for idxRes, idxScores in enumerate(uniqueIndices): cumSum = yTrueSortedCumsum[idxScores - 1] tp = nbTrueExamples - cumSum fp = nbExamples - idxScores - tp fn = cumSum + hardFns p = float(tp) / (tp + fp) r = float(tp) / (tp + fn) precision[idxRes] = p recall[idxRes] = r # first point in curve is artificial precision[-1] = 1.0 recall[-1] = 0.0 # compute average of precision-recall curve # integration is performed via zero order, or equivalently step-wise integration # first compute the widths of each step: # use a convolution with appropriate kernel, manually deal with the boundaries first recallForConv = np.copy(recall) recallForConv = np.append(recallForConv[0], recallForConv) recallForConv = np.append(recallForConv, 0.0) stepWidths = np.convolve(recallForConv, [-0.5, 0, 0.5], "valid") # integrate is now simply a dot product apCurrent = np.dot(precision, stepWidths) elif haveGt: apCurrent = 0.0 else: apCurrent = float("nan") ap[dI, lI, oI] = apCurrent return ap def evaluateMaskMatches(matches, args): # In the end, we need two vectors for each class and for each overlap # The first vector (y_true) is binary and is 1, where the ground truth says true, # and is 0 otherwise. # The second vector (y_score) is float [0...1] and represents the confidence of # the prediction. # # We represent the following cases as: # | y_true | y_score # gt instance with matched prediction | 1 | confidence # gt instance w/o matched prediction | 1 | 0.0 # false positive prediction | 0 | confidence # # The current implementation makes only sense for an overlap threshold >= 0.5, # since only then, a single prediction can either be ignored or matched, but # never both. Further, it can never match to two gt instances. # For matching, we vary the overlap and do the following steps: # 1.) remove all predictions that satisfy the overlap criterion with an ignore region (either void or *group) # 2.) remove matches that do not satisfy the overlap # 3.) mark non-matched predictions as false positive # AP overlaps = args.overlaps # region size minRegionSizes = args.minRegionSizes # only keep the first, if distances are not available # if not args.distanceAvailable: # minRegionSizes = [ minRegionSizes[0] ] # distThs = [ distThs [0] ] # distConfs = [ distConfs [0] ] # Here we hold the results # First dimension is class, second overlap ap = np.zeros((len(minRegionSizes), len(args.instLabels), len(overlaps)), np.float) for dI, minRegionSize in enumerate(minRegionSizes): for (oI, overlapTh) in enumerate(overlaps): for (lI, labelName) in enumerate(args.instLabels): y_true = np.empty(0) y_score = np.empty(0) # count hard false negatives hardFns = 0 # found at least one gt and predicted instance? haveGt = False havePred = False for img in matches: predInstances = img["prediction"][labelName] gtInstances = img["groundTruth"][labelName] # filter groups in ground truth gtInstances = [ gt for gt in gtInstances if gt["pixelCount"] >= minRegionSize ] if gtInstances: haveGt = True if predInstances: havePred = True curTrue = np.ones(len(gtInstances)) curScore = np.ones(len(gtInstances)) * (-float("inf")) curMatch = np.zeros(len(gtInstances), dtype=np.bool) # collect matches for (gtI, gt) in enumerate(gtInstances): foundMatch = False for pred in gt["matchedPred"]: overlap = float(pred["maskIntersection"]) / ( gt["pixelCount"] + pred["pixelCount"] - pred["maskIntersection"] ) if overlap > overlapTh: # the score confidence = pred["confidence"] # if we already hat a prediction for this groundtruth # the prediction with the lower score is automatically a false positive if curMatch[gtI]: maxScore = max(curScore[gtI], confidence) minScore = min(curScore[gtI], confidence) curScore[gtI] = maxScore # append false positive curTrue = np.append(curTrue, 0) curScore = np.append(curScore, minScore) curMatch = np.append(curMatch, True) # otherwise set score else: foundMatch = True curMatch[gtI] = True curScore[gtI] = confidence if not foundMatch: hardFns += 1 # remove non-matched ground truth instances curTrue = curTrue[curMatch == True] curScore = curScore[curMatch == True] # collect non-matched predictions as false positive for pred in predInstances: foundGt = False for gt in pred["matchedGt"]: overlap = float(gt["maskIntersection"]) / ( gt["pixelCount"] + pred["pixelCount"] - gt["maskIntersection"] ) if overlap > overlapTh: foundGt = True break if not foundGt: # collect number of void and *group pixels nbIgnorePixels = 0 for gt in pred["matchedGt"]: # small ground truth instances if gt["pixelCount"] < minRegionSize: nbIgnorePixels += gt["maskIntersection"] if pred["pixelCount"] <= 0: proportionIgnore = 0 else: proportionIgnore = ( float(nbIgnorePixels) / pred["pixelCount"] ) # if not ignored # append false positive if proportionIgnore <= overlapTh: curTrue = np.append(curTrue, 0) confidence = pred["confidence"] curScore = np.append(curScore, confidence) # append to overall results y_true = np.append(y_true, curTrue) y_score = np.append(y_score, curScore) # compute the average precision if haveGt and havePred: # compute precision recall curve first # sorting and cumsum scoreArgSort = np.argsort(y_score) yScoreSorted = y_score[scoreArgSort] yTrueSorted = y_true[scoreArgSort] yTrueSortedCumsum = np.cumsum(yTrueSorted) # unique thresholds (thresholds, uniqueIndices) = np.unique( yScoreSorted, return_index=True ) # since we need to add an artificial point to the precision-recall curve # increase its length by 1 nbPrecRecall = len(uniqueIndices) + 1 # prepare precision recall nbExamples = len(yScoreSorted) nbTrueExamples = yTrueSortedCumsum[-1] precision = np.zeros(nbPrecRecall) recall = np.zeros(nbPrecRecall) # deal with the first point # only thing we need to do, is to append a zero to the cumsum at the end. # an index of -1 uses that zero then yTrueSortedCumsum = np.append(yTrueSortedCumsum, 0) # deal with remaining for idxRes, idxScores in enumerate(uniqueIndices): cumSum = yTrueSortedCumsum[idxScores - 1] tp = nbTrueExamples - cumSum fp = nbExamples - idxScores - tp fn = cumSum + hardFns p = float(tp) / (tp + fp) r = float(tp) / (tp + fn) precision[idxRes] = p recall[idxRes] = r # first point in curve is artificial precision[-1] = 1.0 recall[-1] = 0.0 # compute average of precision-recall curve # integration is performed via zero order, or equivalently step-wise integration # first compute the widths of each step: # use a convolution with appropriate kernel, manually deal with the boundaries first recallForConv = np.copy(recall) recallForConv = np.append(recallForConv[0], recallForConv) recallForConv = np.append(recallForConv, 0.0) stepWidths = np.convolve(recallForConv, [-0.5, 0, 0.5], "valid") # integrate is now simply a dot product apCurrent = np.dot(precision, stepWidths) elif haveGt: apCurrent = 0.0 else: apCurrent = float("nan") ap[dI, lI, oI] = apCurrent return ap def computeAverages(aps, args): # max distance index # dInf = np.argmax( args.distanceThs ) # d50m = np.where( np.isclose( args.distanceThs , 50. ) ) # d100m = np.where( np.isclose( args.distanceThs , 100. ) ) dInf = np.argmin(args.minRegionSizes) o50 = np.where(np.isclose(args.overlaps, 0.5)) o75 = np.where(np.isclose(args.overlaps, 0.75)) avgDict = {} avgDict["allAp"] = np.nanmean(aps[dInf, :, :]) avgDict["allAp50%"] = np.nanmean(aps[dInf, :, o50]) avgDict["allAp75%"] = np.nanmean(aps[dInf, :, o75]) avgDict["classes"] = {} for (lI, labelName) in enumerate(args.instLabels): avgDict["classes"][labelName] = {} avgDict["classes"][labelName]["ap"] = np.average(aps[dInf, lI, :]) avgDict["classes"][labelName]["ap50%"] = np.average(aps[dInf, lI, o50]) avgDict["classes"][labelName]["ap75%"] = np.average(aps[dInf, lI, o75]) return avgDict def printResults(avgDict, args): strbuffer = io.StringIO() # redirect all the print functions to a string buffer with redirect_stdout(strbuffer): sep = "," if args.csv else "" col1 = ":" if not args.csv else "" noCol = colors.ENDC if args.colorized else "" bold = colors.BOLD if args.colorized else "" lineLen = 65 print("") if not args.csv: print("#" * lineLen) line = bold line += "{:<15}".format("what") + sep + col1 line += "{:>15}".format("AP") + sep line += "{:>15}".format("AP_50%") + sep line += "{:>15}".format("AP_75%") + sep line += noCol print(line) if not args.csv: print("#" * lineLen) for (lI, labelName) in enumerate(args.instLabels): apAvg = avgDict["classes"][labelName]["ap"] ap50o = avgDict["classes"][labelName]["ap50%"] ap75o = avgDict["classes"][labelName]["ap75%"] line = "{:<15}".format(labelName) + sep + col1 line += getColorEntry(apAvg, args) + sep + "{:>15.3f}".format(apAvg) + sep line += getColorEntry(ap50o, args) + sep + "{:>15.3f}".format(ap50o) + sep line += getColorEntry(ap75o, args) + sep + "{:>15.3f}".format(ap75o) + sep line += noCol print(line) allApAvg = avgDict["allAp"] allAp50o = avgDict["allAp50%"] allAp75o = avgDict["allAp75%"] if not args.csv: print("-" * lineLen) line = "{:<15}".format("average") + sep + col1 line += getColorEntry(allApAvg, args) + sep + "{:>15.3f}".format(allApAvg) + sep line += getColorEntry(allAp50o, args) + sep + "{:>15.3f}".format(allAp50o) + sep line += getColorEntry(allAp75o, args) + sep + "{:>15.3f}".format(allAp75o) + sep line += noCol print(line) print("") return strbuffer.getvalue() def prepareJSONDataForResults(avgDict, aps, args): JSONData = {} JSONData["averages"] = avgDict JSONData["overlaps"] = args.overlaps.tolist() JSONData["minRegionSizes"] = args.minRegionSizes.tolist() JSONData["instLabels"] = args.instLabels JSONData["resultApMatrix"] = aps.tolist() return JSONData ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/coco/__init__.py ================================================ from .coco_eval import do_coco_evaluation as do_orig_coco_evaluation from .coco_eval_wrapper import do_coco_evaluation as do_wrapped_coco_evaluation from maskrcnn_benchmark.data.datasets import AbstractDataset, COCODataset def coco_evaluation( dataset, predictions, output_folder, box_only, iou_types, expected_results, expected_results_sigma_tol, ): if isinstance(dataset, COCODataset): return do_orig_coco_evaluation( dataset=dataset, predictions=predictions, box_only=box_only, output_folder=output_folder, iou_types=iou_types, expected_results=expected_results, expected_results_sigma_tol=expected_results_sigma_tol, ) elif isinstance(dataset, AbstractDataset): return do_wrapped_coco_evaluation( dataset=dataset, predictions=predictions, box_only=box_only, output_folder=output_folder, iou_types=iou_types, expected_results=expected_results, expected_results_sigma_tol=expected_results_sigma_tol, ) else: raise NotImplementedError( ( "Ground truth dataset is not a COCODataset, " "nor it is derived from AbstractDataset: type(dataset)=" "%s" % type(dataset) ) ) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/coco/abs_to_coco.py ================================================ # COCO style evaluation for custom datasets derived from AbstractDataset # Warning! area is computed using binary maps, therefore results may differ # because of the precomputed COCO areas # by botcs@github import numpy as np import torch import pycocotools.mask as mask_util from maskrcnn_benchmark.data.datasets.abstract import AbstractDataset from maskrcnn_benchmark.structures.bounding_box import BoxList import logging from datetime import datetime from tqdm import tqdm def convert_abstract_to_coco(dataset, num_workers=None, chunksize=100): """ Convert any dataset derived from AbstractDataset to COCO style for evaluating with the pycocotools lib Conversion imitates required fields of COCO instance segmentation ground truth files like: ".../annotations/instances_train2014.json" After th conversion is done a dict is returned that follows the same format as COCO json files. By default .coco_eval_wrapper.py saves it to the hard-drive in json format and loads it with the maskrcnn_benchmark's default COCODataset Args: dataset: any dataset derived from AbstractDataset num_workers (optional): number of worker threads to parallelize the conversion (default is to use all cores for conversion) chunk_size (optional): how many entries one thread processes before requesting new task. The larger the less overhead there is. """ logger = logging.getLogger("maskrcnn_benchmark.inference") assert isinstance(dataset, AbstractDataset) # Official COCO annotations have these fields # 'info', 'licenses', 'images', 'type', 'annotations', 'categories' coco_dict = {} coco_dict["info"] = { "description": ( "This is an automatically generated COCO annotation" " file using maskrcnn_benchmark" ), "date_created": "%s" % datetime.now(), } coco_dict["type"] = "instances" images = [] annotations = [] if num_workers is None: num_workers = torch.multiprocessing.cpu_count() else: num_workers = min(num_workers, torch.multiprocessing.cpu_count()) dataset_name = dataset.__class__.__name__ num_images = len(dataset) logger.info( ( "Parsing each entry in " "%s, total=%d. " "Using N=%d workers and chunksize=%d" ) % (dataset_name, num_images, num_workers, chunksize) ) with torch.multiprocessing.Pool(num_workers) as pool: with tqdm(total=num_images) as progress_bar: args = [(dataset, idx) for idx in range(num_images)] iterator = pool.imap(process_single_image, args, chunksize=100) for img_annots_pair in iterator: image, per_img_annotations = img_annots_pair images.append(image) annotations.extend(per_img_annotations) progress_bar.update() for ann_id, ann in enumerate(annotations, 1): ann["id"] = ann_id logger.info("Parsing categories:") # CATEGORY DATA categories = [ {"id": category_id, "name": name} for category_id, name in dataset.id_to_name.items() if name != "__background__" ] # Logging categories for cat in categories: logger.info(str(cat)) coco_dict["images"] = images coco_dict["annotations"] = annotations coco_dict["categories"] = categories return coco_dict def process_single_image(args): dataset, idx = args # IMAGE DATA img_id = idx + 1 image = {} # Official COCO "images" entries have these fields # 'license', 'url', 'file_name', 'height', 'width', 'date_captured', 'id' img, target, ret_idx = dataset[idx] img_info = dataset.get_img_info(idx) assert isinstance(img_info, dict) image.update(img_info) image["width"], image["height"] = target.size if "id" not in image.keys(): # Start indexing from 1 if "id" field is not present image["id"] = img_id else: img_id = image["id"] # ANNOTATION DATA per_img_annotations = [] # Official COCO "annotations" entries have these fields # 'segmentation', 'area', 'iscrowd', 'image_id', 'bbox', 'category_id', 'id' # assert ret_idx == idx, (ret_idx, idx) assert isinstance(target, BoxList) bboxes = target.convert("xywh").bbox.tolist() segm_available = "masks" in target.fields() if segm_available: masks = target.get_field("masks").get_mask_tensor() # [N, H, W] if masks.dim() == 2: masks = masks.unsqueeze(0) rles = masks_to_rles(masks) """ !!!WARNING!!! At this point the area value differs from the precomputed original COCO area values, because we compute the area by counting the nonzero entries of the binary mask while COCO areas are computed directly from the polygons Example: Reference image data {'license': 2, 'url': 'http://farm9.staticflickr.com/8035/8024364858_9c41dc1666_z.jpg', 'file_name': 'COCO_val2014_000000000139.jpg', 'height': 426, 'width': 640, 'date_captured': '2013-11-21 01:34:01', 'id': 139} Original COCO area values [ 531.8071, 13244.6572, 5833.1182, 2245.3435, 1833.7841, 1289.3734, 210.1482, 2913.1104, 435.1450, 217.7192, 2089.9749, 338.6089, 322.5936, 225.6642, 2171.6189, 178.1851, 90.9873, 189.5601, 120.2320, 2362.4897] Area values using the binary masks [ 531, 13247, 5846, 2251, 1850, 1292, 212, 2922, 439, 224, 2060, 342, 324, 226, 2171, 178, 90, 187, 120, 2372] """ areas = (masks != 0).sum([1, 2]).tolist() else: areas = target.area().tolist() cat_ids = target.get_field("labels").long().tolist() assert len(bboxes) == len(areas) == len(cat_ids) num_instances = len(target) for ann_idx in range(num_instances): annotation = {} if segm_available: annotation["segmentation"] = rles[ann_idx] annotation["area"] = areas[ann_idx] annotation["iscrowd"] = 0 annotation["image_id"] = img_id annotation["bbox"] = bboxes[ann_idx] annotation["category_id"] = cat_ids[ann_idx] per_img_annotations.append(annotation) return image, per_img_annotations def masks_to_rles(masks_tensor): # TODO: parallelize rles = [] for instance_mask in masks_tensor: np_mask = np.array(instance_mask[:, :, None], order="F") rle = mask_util.encode(np_mask)[0] rle["counts"] = rle["counts"].decode("utf-8") rles.append(rle) return rles ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/coco/coco_eval.py ================================================ import logging import tempfile import os import torch from collections import OrderedDict from tqdm import tqdm from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou def do_coco_evaluation( dataset, predictions, box_only, output_folder, iou_types, expected_results, expected_results_sigma_tol, ): logger = logging.getLogger("maskrcnn_benchmark.inference") if box_only: logger.info("Evaluating bbox proposals") areas = {"all": "", "small": "s", "medium": "m", "large": "l"} res = COCOResults("box_proposal") for limit in [100, 1000]: for area, suffix in areas.items(): stats = evaluate_box_proposals( predictions, dataset, area=area, limit=limit ) key = "AR{}@{:d}".format(suffix, limit) res.results["box_proposal"][key] = stats["ar"].item() logger.info(res) check_expected_results(res, expected_results, expected_results_sigma_tol) if output_folder: torch.save(res, os.path.join(output_folder, "box_proposals.pth")) return logger.info("Preparing results for COCO format") coco_results = {} if "bbox" in iou_types: logger.info("Preparing bbox results") coco_results["bbox"] = prepare_for_coco_detection(predictions, dataset) if "segm" in iou_types: logger.info("Preparing segm results") coco_results["segm"] = prepare_for_coco_segmentation(predictions, dataset) if 'keypoints' in iou_types: logger.info('Preparing keypoints results') coco_results['keypoints'] = prepare_for_coco_keypoint(predictions, dataset) results = COCOResults(*iou_types) logger.info("Evaluating predictions") for iou_type in iou_types: with tempfile.NamedTemporaryFile() as f: file_path = f.name if output_folder: file_path = os.path.join(output_folder, iou_type + ".json") res = evaluate_predictions_on_coco( dataset.coco, coco_results[iou_type], file_path, iou_type ) results.update(res) logger.info(results) check_expected_results(results, expected_results, expected_results_sigma_tol) if output_folder: torch.save(results, os.path.join(output_folder, "coco_results.pth")) return results, coco_results def prepare_for_coco_detection(predictions, dataset): # assert isinstance(dataset, COCODataset) coco_results = [] for image_id, prediction in enumerate(predictions): original_id = dataset.id_to_img_map[image_id] if len(prediction) == 0: continue img_info = dataset.get_img_info(image_id) image_width = img_info["width"] image_height = img_info["height"] prediction = prediction.resize((image_width, image_height)) prediction = prediction.convert("xywh") boxes = prediction.bbox.tolist() scores = prediction.get_field("scores").tolist() labels = prediction.get_field("labels").tolist() mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] coco_results.extend( [ { "image_id": original_id, "category_id": mapped_labels[k], "bbox": box, "score": scores[k], } for k, box in enumerate(boxes) ] ) return coco_results def prepare_for_coco_segmentation(predictions, dataset): import pycocotools.mask as mask_util import numpy as np masker = Masker(threshold=0.5, padding=1) # assert isinstance(dataset, COCODataset) coco_results = [] for image_id, prediction in tqdm(enumerate(predictions)): original_id = dataset.id_to_img_map[image_id] if len(prediction) == 0: continue img_info = dataset.get_img_info(image_id) image_width = img_info["width"] image_height = img_info["height"] prediction = prediction.resize((image_width, image_height)) masks = prediction.get_field("mask") # t = time.time() # Masker is necessary only if masks haven't been already resized. if list(masks.shape[-2:]) != [image_height, image_width]: masks = masker(masks.expand(1, -1, -1, -1, -1), prediction) masks = masks[0] # logger.info('Time mask: {}'.format(time.time() - t)) # prediction = prediction.convert('xywh') # boxes = prediction.bbox.tolist() scores = prediction.get_field("scores").tolist() labels = prediction.get_field("labels").tolist() # rles = prediction.get_field('mask') rles = [ mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0] for mask in masks ] for rle in rles: rle["counts"] = rle["counts"].decode("utf-8") mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] coco_results.extend( [ { "image_id": original_id, "category_id": mapped_labels[k], "segmentation": rle, "score": scores[k], } for k, rle in enumerate(rles) ] ) return coco_results def prepare_for_coco_keypoint(predictions, dataset): # assert isinstance(dataset, COCODataset) coco_results = [] for image_id, prediction in enumerate(predictions): original_id = dataset.id_to_img_map[image_id] if len(prediction.bbox) == 0: continue # TODO replace with get_img_info? image_width = dataset.coco.imgs[original_id]['width'] image_height = dataset.coco.imgs[original_id]['height'] prediction = prediction.resize((image_width, image_height)) prediction = prediction.convert('xywh') boxes = prediction.bbox.tolist() scores = prediction.get_field('scores').tolist() labels = prediction.get_field('labels').tolist() keypoints = prediction.get_field('keypoints') keypoints = keypoints.resize((image_width, image_height)) keypoints = keypoints.keypoints.view(keypoints.keypoints.shape[0], -1).tolist() mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] coco_results.extend([{ 'image_id': original_id, 'category_id': mapped_labels[k], 'keypoints': keypoint, 'score': scores[k]} for k, keypoint in enumerate(keypoints)]) return coco_results # inspired from Detectron def evaluate_box_proposals( predictions, dataset, thresholds=None, area="all", limit=None ): """Evaluate detection proposal recall metrics. This function is a much faster alternative to the official COCO API recall evaluation code. However, it produces slightly different results. """ # Record max overlap value for each gt box # Return vector of overlap values areas = { "all": 0, "small": 1, "medium": 2, "large": 3, "96-128": 4, "128-256": 5, "256-512": 6, "512-inf": 7, } area_ranges = [ [0 ** 2, 1e5 ** 2], # all [0 ** 2, 32 ** 2], # small [32 ** 2, 96 ** 2], # medium [96 ** 2, 1e5 ** 2], # large [96 ** 2, 128 ** 2], # 96-128 [128 ** 2, 256 ** 2], # 128-256 [256 ** 2, 512 ** 2], # 256-512 [512 ** 2, 1e5 ** 2], ] # 512-inf assert area in areas, "Unknown area range: {}".format(area) area_range = area_ranges[areas[area]] gt_overlaps = [] num_pos = 0 for image_id, prediction in enumerate(predictions): original_id = dataset.id_to_img_map[image_id] img_info = dataset.get_img_info(image_id) image_width = img_info["width"] image_height = img_info["height"] prediction = prediction.resize((image_width, image_height)) # sort predictions in descending order # TODO maybe remove this and make it explicit in the documentation inds = prediction.get_field("objectness").sort(descending=True)[1] prediction = prediction[inds] ann_ids = dataset.coco.getAnnIds(imgIds=original_id) anno = dataset.coco.loadAnns(ann_ids) gt_boxes = [obj["bbox"] for obj in anno if obj["iscrowd"] == 0] gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes gt_boxes = BoxList(gt_boxes, (image_width, image_height), mode="xywh").convert( "xyxy" ) gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0]) if len(gt_boxes) == 0: continue valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) gt_boxes = gt_boxes[valid_gt_inds] num_pos += len(gt_boxes) if len(gt_boxes) == 0: continue if len(prediction) == 0: continue if limit is not None and len(prediction) > limit: prediction = prediction[:limit] overlaps = boxlist_iou(prediction, gt_boxes) _gt_overlaps = torch.zeros(len(gt_boxes)) for j in range(min(len(prediction), len(gt_boxes))): # find which proposal box maximally covers each gt box # and get the iou amount of coverage for each gt box max_overlaps, argmax_overlaps = overlaps.max(dim=0) # find which gt box is 'best' covered (i.e. 'best' = most iou) gt_ovr, gt_ind = max_overlaps.max(dim=0) assert gt_ovr >= 0 # find the proposal box that covers the best covered gt box box_ind = argmax_overlaps[gt_ind] # record the iou coverage of this gt box _gt_overlaps[j] = overlaps[box_ind, gt_ind] assert _gt_overlaps[j] == gt_ovr # mark the proposal box and the gt box as used overlaps[box_ind, :] = -1 overlaps[:, gt_ind] = -1 # append recorded iou coverage level gt_overlaps.append(_gt_overlaps) gt_overlaps = torch.cat(gt_overlaps, dim=0) gt_overlaps, _ = torch.sort(gt_overlaps) if thresholds is None: step = 0.05 thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) recalls = torch.zeros_like(thresholds) # compute recall for each iou threshold for i, t in enumerate(thresholds): recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) # ar = 2 * np.trapz(recalls, thresholds) ar = recalls.mean() return { "ar": ar, "recalls": recalls, "thresholds": thresholds, "gt_overlaps": gt_overlaps, "num_pos": num_pos, } def evaluate_predictions_on_coco( coco_gt, coco_results, json_result_file, iou_type="bbox" ): import json with open(json_result_file, "w") as f: json.dump(coco_results, f) from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval coco_dt = coco_gt.loadRes(str(json_result_file)) if coco_results else COCO() # coco_dt = coco_gt.loadRes(coco_results) coco_eval = COCOeval(coco_gt, coco_dt, iou_type) coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() return coco_eval class COCOResults(object): METRICS = { "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"], "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"], "box_proposal": [ "AR@100", "ARs@100", "ARm@100", "ARl@100", "AR@1000", "ARs@1000", "ARm@1000", "ARl@1000", ], "keypoints": ["AP", "AP50", "AP75", "APm", "APl"], } def __init__(self, *iou_types): allowed_types = ("box_proposal", "bbox", "segm", "keypoints") assert all(iou_type in allowed_types for iou_type in iou_types) results = OrderedDict() for iou_type in iou_types: results[iou_type] = OrderedDict( [(metric, -1) for metric in COCOResults.METRICS[iou_type]] ) self.results = results def update(self, coco_eval): if coco_eval is None: return from pycocotools.cocoeval import COCOeval assert isinstance(coco_eval, COCOeval) s = coco_eval.stats iou_type = coco_eval.params.iouType res = self.results[iou_type] metrics = COCOResults.METRICS[iou_type] for idx, metric in enumerate(metrics): res[metric] = s[idx] def __repr__(self): results = '\n' for task, metrics in self.results.items(): results += 'Task: {}\n'.format(task) metric_names = metrics.keys() metric_vals = ['{:.4f}'.format(v) for v in metrics.values()] results += (', '.join(metric_names) + '\n') results += (', '.join(metric_vals) + '\n') return results def check_expected_results(results, expected_results, sigma_tol): if not expected_results: return logger = logging.getLogger("maskrcnn_benchmark.inference") for task, metric, (mean, std) in expected_results: actual_val = results.results[task][metric] lo = mean - sigma_tol * std hi = mean + sigma_tol * std ok = (lo < actual_val) and (actual_val < hi) msg = ( "{} > {} sanity check (actual vs. expected): " "{:.3f} vs. mean={:.4f}, std={:.4}, range=({:.4f}, {:.4f})" ).format(task, metric, actual_val, mean, std, lo, hi) if not ok: msg = "FAIL: " + msg logger.error(msg) else: msg = "PASS: " + msg logger.info(msg) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/coco/coco_eval_wrapper.py ================================================ # COCO style evaluation for custom datasets derived from AbstractDataset # by botcs@github import logging import os import json from maskrcnn_benchmark.data.datasets.coco import COCODataset from .coco_eval import do_coco_evaluation as orig_evaluation from .abs_to_coco import convert_abstract_to_coco def do_coco_evaluation( dataset, predictions, box_only, output_folder, iou_types, expected_results, expected_results_sigma_tol, ): logger = logging.getLogger("maskrcnn_benchmark.inference") logger.info("Converting annotations to COCO format...") coco_annotation_dict = convert_abstract_to_coco(dataset) dataset_name = dataset.__class__.__name__ coco_annotation_path = os.path.join(output_folder, dataset_name + ".json") logger.info("Saving annotations to %s" % coco_annotation_path) with open(coco_annotation_path, "w") as f: json.dump(coco_annotation_dict, f, indent=2) logger.info("Loading annotations as COCODataset") coco_dataset = COCODataset( ann_file=coco_annotation_path, root="", remove_images_without_annotations=False, transforms=None, # transformations should be already saved to the json ) return orig_evaluation( dataset=coco_dataset, predictions=predictions, box_only=box_only, output_folder=output_folder, iou_types=iou_types, expected_results=expected_results, expected_results_sigma_tol=expected_results, ) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/voc/__init__.py ================================================ import logging from .voc_eval import do_voc_evaluation def voc_evaluation(dataset, predictions, output_folder, box_only, **_): logger = logging.getLogger("maskrcnn_benchmark.inference") if box_only: logger.warning("voc evaluation doesn't support box_only, ignored.") logger.info("performing voc evaluation, ignored iou_types.") return do_voc_evaluation( dataset=dataset, predictions=predictions, output_folder=output_folder, logger=logger, ) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/voc/voc_eval.py ================================================ # A modification version from chainercv repository. # (See https://github.com/chainer/chainercv/blob/master/chainercv/evaluations/eval_detection_voc.py) from __future__ import division import os from collections import defaultdict import numpy as np from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou def do_voc_evaluation(dataset, predictions, output_folder, logger): # TODO need to make the use_07_metric format available # for the user to choose pred_boxlists = [] gt_boxlists = [] for image_id, prediction in enumerate(predictions): img_info = dataset.get_img_info(image_id) image_width = img_info["width"] image_height = img_info["height"] prediction = prediction.resize((image_width, image_height)) pred_boxlists.append(prediction) gt_boxlist = dataset.get_groundtruth(image_id) gt_boxlists.append(gt_boxlist) result = eval_detection_voc( pred_boxlists=pred_boxlists, gt_boxlists=gt_boxlists, iou_thresh=0.5, use_07_metric=True, ) result_str = "mAP: {:.4f}\n".format(result["map"]) for i, ap in enumerate(result["ap"]): if i == 0: # skip background continue result_str += "{:<16}: {:.4f}\n".format( dataset.map_class_id_to_class_name(i), ap ) logger.info(result_str) if output_folder: with open(os.path.join(output_folder, "result.txt"), "w") as fid: fid.write(result_str) return result def eval_detection_voc(pred_boxlists, gt_boxlists, iou_thresh=0.5, use_07_metric=False): """Evaluate on voc dataset. Args: pred_boxlists(list[BoxList]): pred boxlist, has labels and scores fields. gt_boxlists(list[BoxList]): ground truth boxlist, has labels field. iou_thresh: iou thresh use_07_metric: boolean Returns: dict represents the results """ assert len(gt_boxlists) == len( pred_boxlists ), "Length of gt and pred lists need to be same." prec, rec = calc_detection_voc_prec_rec( pred_boxlists=pred_boxlists, gt_boxlists=gt_boxlists, iou_thresh=iou_thresh ) ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric) return {"ap": ap, "map": np.nanmean(ap)} def calc_detection_voc_prec_rec(gt_boxlists, pred_boxlists, iou_thresh=0.5): """Calculate precision and recall based on evaluation code of PASCAL VOC. This function calculates precision and recall of predicted bounding boxes obtained from a dataset which has :math:`N` images. The code is based on the evaluation code used in PASCAL VOC Challenge. """ n_pos = defaultdict(int) score = defaultdict(list) match = defaultdict(list) for gt_boxlist, pred_boxlist in zip(gt_boxlists, pred_boxlists): pred_bbox = pred_boxlist.bbox.numpy() pred_label = pred_boxlist.get_field("labels").numpy() pred_score = pred_boxlist.get_field("scores").numpy() gt_bbox = gt_boxlist.bbox.numpy() gt_label = gt_boxlist.get_field("labels").numpy() gt_difficult = gt_boxlist.get_field("difficult").numpy() for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)): pred_mask_l = pred_label == l pred_bbox_l = pred_bbox[pred_mask_l] pred_score_l = pred_score[pred_mask_l] # sort by score order = pred_score_l.argsort()[::-1] pred_bbox_l = pred_bbox_l[order] pred_score_l = pred_score_l[order] gt_mask_l = gt_label == l gt_bbox_l = gt_bbox[gt_mask_l] gt_difficult_l = gt_difficult[gt_mask_l] n_pos[l] += np.logical_not(gt_difficult_l).sum() score[l].extend(pred_score_l) if len(pred_bbox_l) == 0: continue if len(gt_bbox_l) == 0: match[l].extend((0,) * pred_bbox_l.shape[0]) continue # VOC evaluation follows integer typed bounding boxes. pred_bbox_l = pred_bbox_l.copy() pred_bbox_l[:, 2:] += 1 gt_bbox_l = gt_bbox_l.copy() gt_bbox_l[:, 2:] += 1 iou = boxlist_iou( BoxList(pred_bbox_l, gt_boxlist.size), BoxList(gt_bbox_l, gt_boxlist.size), ).numpy() gt_index = iou.argmax(axis=1) # set -1 if there is no matching ground truth gt_index[iou.max(axis=1) < iou_thresh] = -1 del iou selec = np.zeros(gt_bbox_l.shape[0], dtype=bool) for gt_idx in gt_index: if gt_idx >= 0: if gt_difficult_l[gt_idx]: match[l].append(-1) else: if not selec[gt_idx]: match[l].append(1) else: match[l].append(0) selec[gt_idx] = True else: match[l].append(0) n_fg_class = max(n_pos.keys()) + 1 prec = [None] * n_fg_class rec = [None] * n_fg_class for l in n_pos.keys(): score_l = np.array(score[l]) match_l = np.array(match[l], dtype=np.int8) order = score_l.argsort()[::-1] match_l = match_l[order] tp = np.cumsum(match_l == 1) fp = np.cumsum(match_l == 0) # If an element of fp + tp is 0, # the corresponding element of prec[l] is nan. prec[l] = tp / (fp + tp) # If n_pos[l] is 0, rec[l] is None. if n_pos[l] > 0: rec[l] = tp / n_pos[l] return prec, rec def calc_detection_voc_ap(prec, rec, use_07_metric=False): """Calculate average precisions based on evaluation code of PASCAL VOC. This function calculates average precisions from given precisions and recalls. The code is based on the evaluation code used in PASCAL VOC Challenge. Args: prec (list of numpy.array): A list of arrays. :obj:`prec[l]` indicates precision for class :math:`l`. If :obj:`prec[l]` is :obj:`None`, this function returns :obj:`numpy.nan` for class :math:`l`. rec (list of numpy.array): A list of arrays. :obj:`rec[l]` indicates recall for class :math:`l`. If :obj:`rec[l]` is :obj:`None`, this function returns :obj:`numpy.nan` for class :math:`l`. use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric for calculating average precision. The default value is :obj:`False`. Returns: ~numpy.ndarray: This function returns an array of average precisions. The :math:`l`-th value corresponds to the average precision for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is :obj:`None`, the corresponding value is set to :obj:`numpy.nan`. """ n_fg_class = len(prec) ap = np.empty(n_fg_class) for l in range(n_fg_class): if prec[l] is None or rec[l] is None: ap[l] = np.nan continue if use_07_metric: # 11 point metric ap[l] = 0 for t in np.arange(0.0, 1.1, 0.1): if np.sum(rec[l] >= t) == 0: p = 0 else: p = np.max(np.nan_to_num(prec[l])[rec[l] >= t]) ap[l] += p / 11 else: # correct AP calculation # first append sentinel values at the end mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0])) mrec = np.concatenate(([0], rec[l], [1])) mpre = np.maximum.accumulate(mpre[::-1])[::-1] # to calculate area under PR curve, look for points # where X axis (recall) changes value i = np.where(mrec[1:] != mrec[:-1])[0] # and sum (\Delta recall) * prec ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) return ap ================================================ FILE: maskrcnn_benchmark/data/datasets/list_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ Simple dataset class that wraps a list of path names """ from PIL import Image from maskrcnn_benchmark.structures.bounding_box import BoxList class ListDataset(object): def __init__(self, image_lists, transforms=None): self.image_lists = image_lists self.transforms = transforms def __getitem__(self, item): img = Image.open(self.image_lists[item]).convert("RGB") # dummy target w, h = img.size target = BoxList([[0, 0, w, h]], img.size, mode="xyxy") if self.transforms is not None: img, target = self.transforms(img, target) return img, target def __len__(self): return len(self.image_lists) def get_img_info(self, item): """ Return the image dimensions for the image, without loading and pre-processing it """ pass ================================================ FILE: maskrcnn_benchmark/data/datasets/voc.py ================================================ import os import torch import torch.utils.data from PIL import Image import sys if sys.version_info[0] == 2: import xml.etree.cElementTree as ET else: import xml.etree.ElementTree as ET from maskrcnn_benchmark.structures.bounding_box import BoxList class PascalVOCDataset(torch.utils.data.Dataset): CLASSES = ( "__background__ ", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor", ) def __init__(self, data_dir, split, use_difficult=False, transforms=None): self.root = data_dir self.image_set = split self.keep_difficult = use_difficult self.transforms = transforms self._annopath = os.path.join(self.root, "Annotations", "%s.xml") self._imgpath = os.path.join(self.root, "JPEGImages", "%s.jpg") self._imgsetpath = os.path.join(self.root, "ImageSets", "Main", "%s.txt") with open(self._imgsetpath % self.image_set) as f: self.ids = f.readlines() self.ids = [x.strip("\n") for x in self.ids] self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} cls = PascalVOCDataset.CLASSES self.class_to_ind = dict(zip(cls, range(len(cls)))) self.categories = dict(zip(range(len(cls)), cls)) def __getitem__(self, index): img_id = self.ids[index] img = Image.open(self._imgpath % img_id).convert("RGB") target = self.get_groundtruth(index) target = target.clip_to_image(remove_empty=True) if self.transforms is not None: img, target = self.transforms(img, target) return img, target, index def __len__(self): return len(self.ids) def get_groundtruth(self, index): img_id = self.ids[index] anno = ET.parse(self._annopath % img_id).getroot() anno = self._preprocess_annotation(anno) height, width = anno["im_info"] target = BoxList(anno["boxes"], (width, height), mode="xyxy") target.add_field("labels", anno["labels"]) target.add_field("difficult", anno["difficult"]) return target def _preprocess_annotation(self, target): boxes = [] gt_classes = [] difficult_boxes = [] TO_REMOVE = 1 for obj in target.iter("object"): difficult = int(obj.find("difficult").text) == 1 if not self.keep_difficult and difficult: continue name = obj.find("name").text.lower().strip() bb = obj.find("bndbox") # Make pixel indexes 0-based # Refer to "https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/pascal_voc.py#L208-L211" box = [ bb.find("xmin").text, bb.find("ymin").text, bb.find("xmax").text, bb.find("ymax").text, ] bndbox = tuple( map(lambda x: x - TO_REMOVE, list(map(int, box))) ) boxes.append(bndbox) gt_classes.append(self.class_to_ind[name]) difficult_boxes.append(difficult) size = target.find("size") im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) res = { "boxes": torch.tensor(boxes, dtype=torch.float32), "labels": torch.tensor(gt_classes), "difficult": torch.tensor(difficult_boxes), "im_info": im_info, } return res def get_img_info(self, index): img_id = self.ids[index] anno = ET.parse(self._annopath % img_id).getroot() size = anno.find("size") im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) return {"height": im_info[0], "width": im_info[1]} def map_class_id_to_class_name(self, class_id): return PascalVOCDataset.CLASSES[class_id] ================================================ FILE: maskrcnn_benchmark/data/samplers/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .distributed import DistributedSampler from .grouped_batch_sampler import GroupedBatchSampler from .iteration_based_batch_sampler import IterationBasedBatchSampler __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] ================================================ FILE: maskrcnn_benchmark/data/samplers/distributed.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # Code is copy-pasted exactly as in torch.utils.data.distributed. # FIXME remove this once c10d fixes the bug it has import math import torch import torch.distributed as dist from torch.utils.data.sampler import Sampler class DistributedSampler(Sampler): """Sampler that restricts data loading to a subset of the dataset. It is especially useful in conjunction with :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each process can pass a DistributedSampler instance as a DataLoader sampler, and load a subset of the original dataset that is exclusive to it. .. note:: Dataset is assumed to be of constant size. Arguments: dataset: Dataset used for sampling. num_replicas (optional): Number of processes participating in distributed training. rank (optional): Rank of the current process within num_replicas. """ def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): if num_replicas is None: if not dist.is_available(): raise RuntimeError("Requires distributed package to be available") num_replicas = dist.get_world_size() if rank is None: if not dist.is_available(): raise RuntimeError("Requires distributed package to be available") rank = dist.get_rank() self.dataset = dataset self.num_replicas = num_replicas self.rank = rank self.epoch = 0 self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) self.total_size = self.num_samples * self.num_replicas self.shuffle = shuffle def __iter__(self): if self.shuffle: # deterministically shuffle based on epoch g = torch.Generator() g.manual_seed(self.epoch) indices = torch.randperm(len(self.dataset), generator=g).tolist() else: indices = torch.arange(len(self.dataset)).tolist() # add extra samples to make it evenly divisible indices += indices[: (self.total_size - len(indices))] assert len(indices) == self.total_size # subsample offset = self.num_samples * self.rank indices = indices[offset : offset + self.num_samples] assert len(indices) == self.num_samples return iter(indices) def __len__(self): return self.num_samples def set_epoch(self, epoch): self.epoch = epoch ================================================ FILE: maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import itertools import torch from torch.utils.data.sampler import BatchSampler from torch.utils.data.sampler import Sampler class GroupedBatchSampler(BatchSampler): """ Wraps another sampler to yield a mini-batch of indices. It enforces that elements from the same group should appear in groups of batch_size. It also tries to provide mini-batches which follows an ordering which is as close as possible to the ordering from the original sampler. Arguments: sampler (Sampler): Base sampler. batch_size (int): Size of mini-batch. drop_uneven (bool): If ``True``, the sampler will drop the batches whose size is less than ``batch_size`` """ def __init__(self, sampler, group_ids, batch_size, drop_uneven=False): if not isinstance(sampler, Sampler): raise ValueError( "sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler) ) self.sampler = sampler self.group_ids = torch.as_tensor(group_ids) assert self.group_ids.dim() == 1 self.batch_size = batch_size self.drop_uneven = drop_uneven self.groups = torch.unique(self.group_ids).sort(0)[0] self._can_reuse_batches = False def _prepare_batches(self): dataset_size = len(self.group_ids) # get the sampled indices from the sampler sampled_ids = torch.as_tensor(list(self.sampler)) # potentially not all elements of the dataset were sampled # by the sampler (e.g., DistributedSampler). # construct a tensor which contains -1 if the element was # not sampled, and a non-negative number indicating the # order where the element was sampled. # for example. if sampled_ids = [3, 1] and dataset_size = 5, # the order is [-1, 1, -1, 0, -1] order = torch.full((dataset_size,), -1, dtype=torch.int64) order[sampled_ids] = torch.arange(len(sampled_ids)) # get a mask with the elements that were sampled mask = order >= 0 # find the elements that belong to each individual cluster clusters = [(self.group_ids == i) & mask for i in self.groups] # get relative order of the elements inside each cluster # that follows the order from the sampler relative_order = [order[cluster] for cluster in clusters] # with the relative order, find the absolute order in the # sampled space permutation_ids = [s[s.sort()[1]] for s in relative_order] # permute each cluster so that they follow the order from # the sampler permuted_clusters = [sampled_ids[idx] for idx in permutation_ids] # splits each cluster in batch_size, and merge as a list of tensors splits = [c.split(self.batch_size) for c in permuted_clusters] merged = tuple(itertools.chain.from_iterable(splits)) # now each batch internally has the right order, but # they are grouped by clusters. Find the permutation between # different batches that brings them as close as possible to # the order that we have in the sampler. For that, we will consider the # ordering as coming from the first element of each batch, and sort # correspondingly first_element_of_batch = [t[0].item() for t in merged] # get and inverse mapping from sampled indices and the position where # they occur (as returned by the sampler) inv_sampled_ids_map = {v: k for k, v in enumerate(sampled_ids.tolist())} # from the first element in each batch, get a relative ordering first_index_of_batch = torch.as_tensor( [inv_sampled_ids_map[s] for s in first_element_of_batch] ) # permute the batches so that they approximately follow the order # from the sampler permutation_order = first_index_of_batch.sort(0)[1].tolist() # finally, permute the batches batches = [merged[i].tolist() for i in permutation_order] if self.drop_uneven: kept = [] for batch in batches: if len(batch) == self.batch_size: kept.append(batch) batches = kept return batches def __iter__(self): if self._can_reuse_batches: batches = self._batches self._can_reuse_batches = False else: batches = self._prepare_batches() self._batches = batches return iter(batches) def __len__(self): if not hasattr(self, "_batches"): self._batches = self._prepare_batches() self._can_reuse_batches = True return len(self._batches) ================================================ FILE: maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from torch.utils.data.sampler import BatchSampler class IterationBasedBatchSampler(BatchSampler): """ Wraps a BatchSampler, resampling from it until a specified number of iterations have been sampled """ def __init__(self, batch_sampler, num_iterations, start_iter=0): self.batch_sampler = batch_sampler self.num_iterations = num_iterations self.start_iter = start_iter def __iter__(self): iteration = self.start_iter while iteration <= self.num_iterations: # if the underlying sampler has a set_epoch method, like # DistributedSampler, used for making each process see # a different split of the dataset, then set it if hasattr(self.batch_sampler.sampler, "set_epoch"): self.batch_sampler.sampler.set_epoch(iteration) for batch in self.batch_sampler: iteration += 1 if iteration > self.num_iterations: break yield batch def __len__(self): return self.num_iterations ================================================ FILE: maskrcnn_benchmark/data/transforms/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .transforms import Compose from .transforms import Resize from .transforms import RandomHorizontalFlip from .transforms import ToTensor from .transforms import Normalize from .build import build_transforms ================================================ FILE: maskrcnn_benchmark/data/transforms/build.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from . import transforms as T def build_transforms(cfg, is_train=True): if is_train: min_size = cfg.INPUT.MIN_SIZE_TRAIN max_size = cfg.INPUT.MAX_SIZE_TRAIN flip_horizontal_prob = cfg.INPUT.HORIZONTAL_FLIP_PROB_TRAIN flip_vertical_prob = cfg.INPUT.VERTICAL_FLIP_PROB_TRAIN brightness = cfg.INPUT.BRIGHTNESS contrast = cfg.INPUT.CONTRAST saturation = cfg.INPUT.SATURATION hue = cfg.INPUT.HUE else: min_size = cfg.INPUT.MIN_SIZE_TEST max_size = cfg.INPUT.MAX_SIZE_TEST flip_horizontal_prob = 0.0 flip_vertical_prob = 0.0 brightness = 0.0 contrast = 0.0 saturation = 0.0 hue = 0.0 to_bgr255 = cfg.INPUT.TO_BGR255 normalize_transform = T.Normalize( mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=to_bgr255 ) color_jitter = T.ColorJitter( brightness=brightness, contrast=contrast, saturation=saturation, hue=hue, ) transform = T.Compose( [ color_jitter, T.Resize(min_size, max_size), T.RandomHorizontalFlip(flip_horizontal_prob), T.RandomVerticalFlip(flip_vertical_prob), T.ToTensor(), normalize_transform, ] ) return transform ================================================ FILE: maskrcnn_benchmark/data/transforms/transforms.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import random import torch import torchvision from torchvision.transforms import functional as F class Compose(object): def __init__(self, transforms): self.transforms = transforms def __call__(self, image, target): for t in self.transforms: image, target = t(image, target) return image, target def __repr__(self): format_string = self.__class__.__name__ + "(" for t in self.transforms: format_string += "\n" format_string += " {0}".format(t) format_string += "\n)" return format_string class Resize(object): def __init__(self, min_size, max_size): if not isinstance(min_size, (list, tuple)): min_size = (min_size,) self.min_size = min_size self.max_size = max_size # modified from torchvision to add support for max size def get_size(self, image_size): w, h = image_size size = random.choice(self.min_size) max_size = self.max_size if max_size is not None: min_original_size = float(min((w, h))) max_original_size = float(max((w, h))) if max_original_size / min_original_size * size > max_size: size = int(round(max_size * min_original_size / max_original_size)) if (w <= h and w == size) or (h <= w and h == size): return (h, w) if w < h: ow = size oh = int(size * h / w) else: oh = size ow = int(size * w / h) return (oh, ow) def __call__(self, image, target=None): size = self.get_size(image.size) image = F.resize(image, size) if target is None: return image target = target.resize(image.size) return image, target class RandomHorizontalFlip(object): def __init__(self, prob=0.5): self.prob = prob def __call__(self, image, target): if random.random() < self.prob: image = F.hflip(image) target = target.transpose(0) return image, target class RandomVerticalFlip(object): def __init__(self, prob=0.5): self.prob = prob def __call__(self, image, target): if random.random() < self.prob: image = F.vflip(image) target = target.transpose(1) return image, target class ColorJitter(object): def __init__(self, brightness=None, contrast=None, saturation=None, hue=None, ): self.color_jitter = torchvision.transforms.ColorJitter( brightness=brightness, contrast=contrast, saturation=saturation, hue=hue,) def __call__(self, image, target): image = self.color_jitter(image) return image, target class ToTensor(object): def __call__(self, image, target): return F.to_tensor(image), target class Normalize(object): def __init__(self, mean, std, to_bgr255=True): self.mean = mean self.std = std self.to_bgr255 = to_bgr255 def __call__(self, image, target=None): if self.to_bgr255: image = image[[2, 1, 0]] * 255 image = F.normalize(image, mean=self.mean, std=self.std) if target is None: return image return image, target ================================================ FILE: maskrcnn_benchmark/engine/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. ================================================ FILE: maskrcnn_benchmark/engine/bbox_aug.py ================================================ import torch import torchvision.transforms as TT from maskrcnn_benchmark.config import cfg from maskrcnn_benchmark.data import transforms as T from maskrcnn_benchmark.structures.image_list import to_image_list from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.modeling.roi_heads.box_head.inference import make_roi_box_post_processor def im_detect_bbox_aug(model, images, device): # Collect detections computed under different transformations boxlists_ts = [] for _ in range(len(images)): boxlists_ts.append([]) def add_preds_t(boxlists_t): for i, boxlist_t in enumerate(boxlists_t): if len(boxlists_ts[i]) == 0: # The first one is identity transform, no need to resize the boxlist boxlists_ts[i].append(boxlist_t) else: # Resize the boxlist as the first one boxlists_ts[i].append(boxlist_t.resize(boxlists_ts[i][0].size)) # Compute detections for the original image (identity transform) boxlists_i = im_detect_bbox( model, images, cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST, device ) add_preds_t(boxlists_i) # Perform detection on the horizontally flipped image if cfg.TEST.BBOX_AUG.H_FLIP: boxlists_hf = im_detect_bbox_hflip( model, images, cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST, device ) add_preds_t(boxlists_hf) # Compute detections at different scales for scale in cfg.TEST.BBOX_AUG.SCALES: max_size = cfg.TEST.BBOX_AUG.MAX_SIZE boxlists_scl = im_detect_bbox_scale( model, images, scale, max_size, device ) add_preds_t(boxlists_scl) if cfg.TEST.BBOX_AUG.SCALE_H_FLIP: boxlists_scl_hf = im_detect_bbox_scale( model, images, scale, max_size, device, hflip=True ) add_preds_t(boxlists_scl_hf) # Merge boxlists detected by different bbox aug params boxlists = [] for i, boxlist_ts in enumerate(boxlists_ts): bbox = torch.cat([boxlist_t.bbox for boxlist_t in boxlist_ts]) scores = torch.cat([boxlist_t.get_field('scores') for boxlist_t in boxlist_ts]) boxlist = BoxList(bbox, boxlist_ts[0].size, boxlist_ts[0].mode) boxlist.add_field('scores', scores) boxlists.append(boxlist) # Apply NMS and limit the final detections results = [] post_processor = make_roi_box_post_processor(cfg) for boxlist in boxlists: results.append(post_processor.filter_results(boxlist, cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES)) return results def im_detect_bbox(model, images, target_scale, target_max_size, device): """ Performs bbox detection on the original image. """ transform = TT.Compose([ T.Resize(target_scale, target_max_size), TT.ToTensor(), T.Normalize( mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=cfg.INPUT.TO_BGR255 ) ]) images = [transform(image) for image in images] images = to_image_list(images, cfg.DATALOADER.SIZE_DIVISIBILITY) return model(images.to(device)) def im_detect_bbox_hflip(model, images, target_scale, target_max_size, device): """ Performs bbox detection on the horizontally flipped image. Function signature is the same as for im_detect_bbox. """ transform = TT.Compose([ T.Resize(target_scale, target_max_size), TT.RandomHorizontalFlip(1.0), TT.ToTensor(), T.Normalize( mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=cfg.INPUT.TO_BGR255 ) ]) images = [transform(image) for image in images] images = to_image_list(images, cfg.DATALOADER.SIZE_DIVISIBILITY) boxlists = model(images.to(device)) # Invert the detections computed on the flipped image boxlists_inv = [boxlist.transpose(0) for boxlist in boxlists] return boxlists_inv def im_detect_bbox_scale(model, images, target_scale, target_max_size, device, hflip=False): """ Computes bbox detections at the given scale. Returns predictions in the scaled image space. """ if hflip: boxlists_scl = im_detect_bbox_hflip(model, images, target_scale, target_max_size, device) else: boxlists_scl = im_detect_bbox(model, images, target_scale, target_max_size, device) return boxlists_scl ================================================ FILE: maskrcnn_benchmark/engine/inference.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import logging import time import os import torch from tqdm import tqdm from maskrcnn_benchmark.data.datasets.evaluation import evaluate from ..utils.comm import is_main_process, get_world_size from ..utils.comm import all_gather from ..utils.comm import synchronize from ..utils.timer import Timer, get_time_str from .bbox_aug import im_detect_bbox_aug def compute_on_dataset(model, data_loader, device, bbox_aug, timer=None): model.eval() results_dict = {} cpu_device = torch.device("cpu") for _, batch in enumerate(tqdm(data_loader)): images, targets, image_ids = batch with torch.no_grad(): if timer: timer.tic() if bbox_aug: output = im_detect_bbox_aug(model, images, device) else: output = model(images.to(device)) if timer: if not device.type == 'cpu': torch.cuda.synchronize() timer.toc() output = [o.to(cpu_device) for o in output] results_dict.update( {img_id: result for img_id, result in zip(image_ids, output)} ) return results_dict def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): all_predictions = all_gather(predictions_per_gpu) if not is_main_process(): return # merge the list of dicts predictions = {} for p in all_predictions: predictions.update(p) # convert a dict where the key is the index in a list image_ids = list(sorted(predictions.keys())) if len(image_ids) != image_ids[-1] + 1: logger = logging.getLogger("maskrcnn_benchmark.inference") logger.warning( "Number of images that were gathered from multiple processes is not " "a contiguous set. Some images might be missing from the evaluation" ) # convert to a list predictions = [predictions[i] for i in image_ids] return predictions def inference( model, data_loader, dataset_name, iou_types=("bbox",), box_only=False, bbox_aug=False, device="cuda", expected_results=(), expected_results_sigma_tol=4, output_folder=None, ): # convert to a torch.device for efficiency device = torch.device(device) num_devices = get_world_size() logger = logging.getLogger("maskrcnn_benchmark.inference") dataset = data_loader.dataset logger.info("Start evaluation on {} dataset({} images).".format(dataset_name, len(dataset))) total_timer = Timer() inference_timer = Timer() total_timer.tic() predictions = compute_on_dataset(model, data_loader, device, bbox_aug, inference_timer) # wait for all processes to complete before measuring the time synchronize() total_time = total_timer.toc() total_time_str = get_time_str(total_time) logger.info( "Total run time: {} ({} s / img per device, on {} devices)".format( total_time_str, total_time * num_devices / len(dataset), num_devices ) ) total_infer_time = get_time_str(inference_timer.total_time) logger.info( "Model inference time: {} ({} s / img per device, on {} devices)".format( total_infer_time, inference_timer.total_time * num_devices / len(dataset), num_devices, ) ) predictions = _accumulate_predictions_from_multiple_gpus(predictions) if not is_main_process(): return if output_folder: torch.save(predictions, os.path.join(output_folder, "predictions.pth")) extra_args = dict( box_only=box_only, iou_types=iou_types, expected_results=expected_results, expected_results_sigma_tol=expected_results_sigma_tol, ) return evaluate(dataset=dataset, predictions=predictions, output_folder=output_folder, **extra_args) ================================================ FILE: maskrcnn_benchmark/engine/trainer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import datetime import logging import os import time import torch import torch.distributed as dist from tqdm import tqdm from maskrcnn_benchmark.data import make_data_loader from maskrcnn_benchmark.utils.comm import get_world_size, synchronize from maskrcnn_benchmark.utils.metric_logger import MetricLogger from maskrcnn_benchmark.engine.inference import inference from apex import amp def reduce_loss_dict(loss_dict): """ Reduce the loss dictionary from all processes so that process with rank 0 has the averaged results. Returns a dict with the same fields as loss_dict, after reduction. """ world_size = get_world_size() if world_size < 2: return loss_dict with torch.no_grad(): loss_names = [] all_losses = [] for k in sorted(loss_dict.keys()): loss_names.append(k) all_losses.append(loss_dict[k]) all_losses = torch.stack(all_losses, dim=0) dist.reduce(all_losses, dst=0) if dist.get_rank() == 0: # only main process gets accumulated, so only divide by # world_size in this case all_losses /= world_size reduced_losses = {k: v for k, v in zip(loss_names, all_losses)} return reduced_losses def do_train( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints",) dataset_names = cfg.DATASETS.TEST for iteration, (images, targets, _) in enumerate(data_loader, start_iter): if any(len(target) < 1 for target in targets): logger.error(f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}" ) continue data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration images = images.to(device) targets = [target.to(device) for target in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() optimizer.step() scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join( [ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ] ).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, ) ) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if data_loader_val is not None and test_period > 0 and iteration % test_period == 0: meters_val = MetricLogger(delimiter=" ") synchronize() _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, # so every time a new data loader is created: make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True), dataset_name="[Validation]", iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=None, ) synchronize() model.train() with torch.no_grad(): # Should be one image for each GPU: for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)): images_val = images_val.to(device) targets_val = [target.to(device) for target in targets_val] loss_dict = model(images_val, targets_val) losses = sum(loss for loss in loss_dict.values()) loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters_val.update(loss=losses_reduced, **loss_dict_reduced) synchronize() logger.info( meters_val.delimiter.join( [ "[Validation]: ", "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ] ).format( eta=eta_string, iter=iteration, meters=str(meters_val), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, ) ) if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info( "Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter) ) ) ================================================ FILE: maskrcnn_benchmark/layers/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from .batch_norm import FrozenBatchNorm2d from .misc import Conv2d from .misc import DFConv2d from .misc import ConvTranspose2d from .misc import BatchNorm2d from .misc import interpolate from .nms import nms from .roi_align import ROIAlign from .roi_align import roi_align from .roi_pool import ROIPool from .roi_pool import roi_pool from .smooth_l1_loss import smooth_l1_loss from .sigmoid_focal_loss import SigmoidFocalLoss from .dcn.deform_conv_func import deform_conv, modulated_deform_conv from .dcn.deform_conv_module import DeformConv, ModulatedDeformConv, ModulatedDeformConvPack from .dcn.deform_pool_func import deform_roi_pooling from .dcn.deform_pool_module import DeformRoIPooling, DeformRoIPoolingPack, ModulatedDeformRoIPoolingPack __all__ = [ "nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", "smooth_l1_loss", "Conv2d", "DFConv2d", "ConvTranspose2d", "interpolate", "BatchNorm2d", "FrozenBatchNorm2d", "SigmoidFocalLoss", 'deform_conv', 'modulated_deform_conv', 'DeformConv', 'ModulatedDeformConv', 'ModulatedDeformConvPack', 'deform_roi_pooling', 'DeformRoIPooling', 'DeformRoIPoolingPack', 'ModulatedDeformRoIPoolingPack', ] ================================================ FILE: maskrcnn_benchmark/layers/_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import glob import os.path import torch try: from torch.utils.cpp_extension import load as load_ext from torch.utils.cpp_extension import CUDA_HOME except ImportError: raise ImportError("The cpp layer extensions requires PyTorch 0.4 or higher") def _load_C_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) this_dir = os.path.dirname(this_dir) this_dir = os.path.join(this_dir, "csrc") main_file = glob.glob(os.path.join(this_dir, "*.cpp")) source_cpu = glob.glob(os.path.join(this_dir, "cpu", "*.cpp")) source_cuda = glob.glob(os.path.join(this_dir, "cuda", "*.cu")) source = main_file + source_cpu extra_cflags = [] if torch.cuda.is_available() and CUDA_HOME is not None: source.extend(source_cuda) extra_cflags = ["-DWITH_CUDA"] source = [os.path.join(this_dir, s) for s in source] extra_include_paths = [this_dir] return load_ext( "torchvision", source, extra_cflags=extra_cflags, extra_include_paths=extra_include_paths, ) _C = _load_C_extensions() ================================================ FILE: maskrcnn_benchmark/layers/batch_norm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from torch import nn class FrozenBatchNorm2d(nn.Module): """ BatchNorm2d where the batch statistics and the affine parameters are fixed """ def __init__(self, n): super(FrozenBatchNorm2d, self).__init__() self.register_buffer("weight", torch.ones(n)) self.register_buffer("bias", torch.zeros(n)) self.register_buffer("running_mean", torch.zeros(n)) self.register_buffer("running_var", torch.ones(n)) def forward(self, x): # Cast all fixed parameters to half() if necessary if x.dtype == torch.float16: self.weight = self.weight.half() self.bias = self.bias.half() self.running_mean = self.running_mean.half() self.running_var = self.running_var.half() scale = self.weight * self.running_var.rsqrt() bias = self.bias - self.running_mean * scale scale = scale.reshape(1, -1, 1, 1) bias = bias.reshape(1, -1, 1, 1) return x * scale + bias ================================================ FILE: maskrcnn_benchmark/layers/dcn/__init__.py ================================================ # # Copied From [mmdetection](https://github.com/open-mmlab/mmdetection/tree/master/mmdet/ops/dcn) # ================================================ FILE: maskrcnn_benchmark/layers/dcn/deform_conv_func.py ================================================ import torch from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from maskrcnn_benchmark import _C class DeformConvFunction(Function): @staticmethod def forward( ctx, input, offset, weight, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, im2col_step=64 ): if input is not None and input.dim() != 4: raise ValueError( "Expected 4D tensor as input, got {}D tensor instead.".format( input.dim())) ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) ctx.groups = groups ctx.deformable_groups = deformable_groups ctx.im2col_step = im2col_step ctx.save_for_backward(input, offset, weight) output = input.new_empty( DeformConvFunction._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride)) ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones if not input.is_cuda: raise NotImplementedError else: cur_im2col_step = min(ctx.im2col_step, input.shape[0]) assert (input.shape[0] % cur_im2col_step) == 0, 'im2col step must divide batchsize' _C.deform_conv_forward( input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], ctx.padding[1], ctx.padding[0], ctx.dilation[1], ctx.dilation[0], ctx.groups, ctx.deformable_groups, cur_im2col_step ) return output @staticmethod @once_differentiable def backward(ctx, grad_output): input, offset, weight = ctx.saved_tensors grad_input = grad_offset = grad_weight = None if not grad_output.is_cuda: raise NotImplementedError else: cur_im2col_step = min(ctx.im2col_step, input.shape[0]) assert (input.shape[0] % cur_im2col_step) == 0, 'im2col step must divide batchsize' if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: grad_input = torch.zeros_like(input) grad_offset = torch.zeros_like(offset) _C.deform_conv_backward_input( input, offset, grad_output, grad_input, grad_offset, weight, ctx.bufs_[0], weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], ctx.padding[1], ctx.padding[0], ctx.dilation[1], ctx.dilation[0], ctx.groups, ctx.deformable_groups, cur_im2col_step ) if ctx.needs_input_grad[2]: grad_weight = torch.zeros_like(weight) _C.deform_conv_backward_parameters( input, offset, grad_output, grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], ctx.padding[1], ctx.padding[0], ctx.dilation[1], ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1, cur_im2col_step ) return (grad_input, grad_offset, grad_weight, None, None, None, None, None) @staticmethod def _output_size(input, weight, padding, dilation, stride): channels = weight.size(0) output_size = (input.size(0), channels) for d in range(input.dim() - 2): in_size = input.size(d + 2) pad = padding[d] kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 stride_ = stride[d] output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) if not all(map(lambda s: s > 0, output_size)): raise ValueError( "convolution input is too small (output would be {})".format( 'x'.join(map(str, output_size)))) return output_size class ModulatedDeformConvFunction(Function): @staticmethod def forward( ctx, input, offset, mask, weight, bias=None, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1 ): ctx.stride = stride ctx.padding = padding ctx.dilation = dilation ctx.groups = groups ctx.deformable_groups = deformable_groups ctx.with_bias = bias is not None if not ctx.with_bias: bias = input.new_empty(1) # fake tensor if not input.is_cuda: raise NotImplementedError if weight.requires_grad or mask.requires_grad or offset.requires_grad \ or input.requires_grad: ctx.save_for_backward(input, offset, mask, weight, bias) output = input.new_empty( ModulatedDeformConvFunction._infer_shape(ctx, input, weight)) ctx._bufs = [input.new_empty(0), input.new_empty(0)] _C.modulated_deform_conv_forward( input, weight, bias, ctx._bufs[0], offset, mask, output, ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride, ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, ctx.groups, ctx.deformable_groups, ctx.with_bias ) return output @staticmethod @once_differentiable def backward(ctx, grad_output): if not grad_output.is_cuda: raise NotImplementedError input, offset, mask, weight, bias = ctx.saved_tensors grad_input = torch.zeros_like(input) grad_offset = torch.zeros_like(offset) grad_mask = torch.zeros_like(mask) grad_weight = torch.zeros_like(weight) grad_bias = torch.zeros_like(bias) _C.modulated_deform_conv_backward( input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], grad_input, grad_weight, grad_bias, grad_offset, grad_mask, grad_output, weight.shape[2], weight.shape[3], ctx.stride, ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, ctx.groups, ctx.deformable_groups, ctx.with_bias ) if not ctx.with_bias: grad_bias = None return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, None, None, None, None, None) @staticmethod def _infer_shape(ctx, input, weight): n = input.size(0) channels_out = weight.size(0) height, width = input.shape[2:4] kernel_h, kernel_w = weight.shape[2:4] height_out = (height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1 width_out = (width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1 return n, channels_out, height_out, width_out deform_conv = DeformConvFunction.apply modulated_deform_conv = ModulatedDeformConvFunction.apply ================================================ FILE: maskrcnn_benchmark/layers/dcn/deform_conv_module.py ================================================ import math import torch import torch.nn as nn from torch.nn.modules.utils import _pair from .deform_conv_func import deform_conv, modulated_deform_conv class DeformConv(nn.Module): def __init__( self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=False ): assert not bias super(DeformConv, self).__init__() self.with_bias = bias assert in_channels % groups == 0, \ 'in_channels {} cannot be divisible by groups {}'.format( in_channels, groups) assert out_channels % groups == 0, \ 'out_channels {} cannot be divisible by groups {}'.format( out_channels, groups) self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = _pair(kernel_size) self.stride = _pair(stride) self.padding = _pair(padding) self.dilation = _pair(dilation) self.groups = groups self.deformable_groups = deformable_groups self.weight = nn.Parameter( torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size)) self.reset_parameters() def reset_parameters(self): n = self.in_channels for k in self.kernel_size: n *= k stdv = 1. / math.sqrt(n) self.weight.data.uniform_(-stdv, stdv) def forward(self, input, offset): return deform_conv(input, offset, self.weight, self.stride, self.padding, self.dilation, self.groups, self.deformable_groups) def __repr__(self): return "".join([ "{}(".format(self.__class__.__name__), "in_channels={}, ".format(self.in_channels), "out_channels={}, ".format(self.out_channels), "kernel_size={}, ".format(self.kernel_size), "stride={}, ".format(self.stride), "dilation={}, ".format(self.dilation), "padding={}, ".format(self.padding), "groups={}, ".format(self.groups), "deformable_groups={}, ".format(self.deformable_groups), "bias={})".format(self.with_bias), ]) class ModulatedDeformConv(nn.Module): def __init__( self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True ): super(ModulatedDeformConv, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = _pair(kernel_size) self.stride = stride self.padding = padding self.dilation = dilation self.groups = groups self.deformable_groups = deformable_groups self.with_bias = bias self.weight = nn.Parameter(torch.Tensor( out_channels, in_channels // groups, *self.kernel_size )) if bias: self.bias = nn.Parameter(torch.Tensor(out_channels)) else: self.register_parameter('bias', None) self.reset_parameters() def reset_parameters(self): n = self.in_channels for k in self.kernel_size: n *= k stdv = 1. / math.sqrt(n) self.weight.data.uniform_(-stdv, stdv) if self.bias is not None: self.bias.data.zero_() def forward(self, input, offset, mask): return modulated_deform_conv( input, offset, mask, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.deformable_groups) def __repr__(self): return "".join([ "{}(".format(self.__class__.__name__), "in_channels={}, ".format(self.in_channels), "out_channels={}, ".format(self.out_channels), "kernel_size={}, ".format(self.kernel_size), "stride={}, ".format(self.stride), "dilation={}, ".format(self.dilation), "padding={}, ".format(self.padding), "groups={}, ".format(self.groups), "deformable_groups={}, ".format(self.deformable_groups), "bias={})".format(self.with_bias), ]) class ModulatedDeformConvPack(ModulatedDeformConv): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True): super(ModulatedDeformConvPack, self).__init__( in_channels, out_channels, kernel_size, stride, padding, dilation, groups, deformable_groups, bias) self.conv_offset_mask = nn.Conv2d( self.in_channels // self.groups, self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1], kernel_size=self.kernel_size, stride=_pair(self.stride), padding=_pair(self.padding), bias=True) self.init_offset() def init_offset(self): self.conv_offset_mask.weight.data.zero_() self.conv_offset_mask.bias.data.zero_() def forward(self, input): out = self.conv_offset_mask(input) o1, o2, mask = torch.chunk(out, 3, dim=1) offset = torch.cat((o1, o2), dim=1) mask = torch.sigmoid(mask) return modulated_deform_conv( input, offset, mask, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.deformable_groups) ================================================ FILE: maskrcnn_benchmark/layers/dcn/deform_pool_func.py ================================================ import torch from torch.autograd import Function from torch.autograd.function import once_differentiable from maskrcnn_benchmark import _C class DeformRoIPoolingFunction(Function): @staticmethod def forward( ctx, data, rois, offset, spatial_scale, out_size, out_channels, no_trans, group_size=1, part_size=None, sample_per_part=4, trans_std=.0 ): ctx.spatial_scale = spatial_scale ctx.out_size = out_size ctx.out_channels = out_channels ctx.no_trans = no_trans ctx.group_size = group_size ctx.part_size = out_size if part_size is None else part_size ctx.sample_per_part = sample_per_part ctx.trans_std = trans_std assert 0.0 <= ctx.trans_std <= 1.0 if not data.is_cuda: raise NotImplementedError n = rois.shape[0] output = data.new_empty(n, out_channels, out_size, out_size) output_count = data.new_empty(n, out_channels, out_size, out_size) _C.deform_psroi_pooling_forward( data, rois, offset, output, output_count, ctx.no_trans, ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part, ctx.trans_std ) if data.requires_grad or rois.requires_grad or offset.requires_grad: ctx.save_for_backward(data, rois, offset) ctx.output_count = output_count return output @staticmethod @once_differentiable def backward(ctx, grad_output): if not grad_output.is_cuda: raise NotImplementedError data, rois, offset = ctx.saved_tensors output_count = ctx.output_count grad_input = torch.zeros_like(data) grad_rois = None grad_offset = torch.zeros_like(offset) _C.deform_psroi_pooling_backward( grad_output, data, rois, offset, output_count, grad_input, grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part, ctx.trans_std ) return (grad_input, grad_rois, grad_offset, None, None, None, None, None, None, None, None) deform_roi_pooling = DeformRoIPoolingFunction.apply ================================================ FILE: maskrcnn_benchmark/layers/dcn/deform_pool_module.py ================================================ from torch import nn from .deform_pool_func import deform_roi_pooling class DeformRoIPooling(nn.Module): def __init__(self, spatial_scale, out_size, out_channels, no_trans, group_size=1, part_size=None, sample_per_part=4, trans_std=.0): super(DeformRoIPooling, self).__init__() self.spatial_scale = spatial_scale self.out_size = out_size self.out_channels = out_channels self.no_trans = no_trans self.group_size = group_size self.part_size = out_size if part_size is None else part_size self.sample_per_part = sample_per_part self.trans_std = trans_std def forward(self, data, rois, offset): if self.no_trans: offset = data.new_empty(0) return deform_roi_pooling( data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) class DeformRoIPoolingPack(DeformRoIPooling): def __init__(self, spatial_scale, out_size, out_channels, no_trans, group_size=1, part_size=None, sample_per_part=4, trans_std=.0, deform_fc_channels=1024): super(DeformRoIPoolingPack, self).__init__(spatial_scale, out_size, out_channels, no_trans, group_size, part_size, sample_per_part, trans_std) self.deform_fc_channels = deform_fc_channels if not no_trans: self.offset_fc = nn.Sequential( nn.Linear(self.out_size * self.out_size * self.out_channels, self.deform_fc_channels), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_channels, self.deform_fc_channels), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_channels, self.out_size * self.out_size * 2)) self.offset_fc[-1].weight.data.zero_() self.offset_fc[-1].bias.data.zero_() def forward(self, data, rois): assert data.size(1) == self.out_channels if self.no_trans: offset = data.new_empty(0) return deform_roi_pooling( data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) else: n = rois.shape[0] offset = data.new_empty(0) x = deform_roi_pooling(data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, True, self.group_size, self.part_size, self.sample_per_part, self.trans_std) offset = self.offset_fc(x.view(n, -1)) offset = offset.view(n, 2, self.out_size, self.out_size) return deform_roi_pooling( data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) class ModulatedDeformRoIPoolingPack(DeformRoIPooling): def __init__(self, spatial_scale, out_size, out_channels, no_trans, group_size=1, part_size=None, sample_per_part=4, trans_std=.0, deform_fc_channels=1024): super(ModulatedDeformRoIPoolingPack, self).__init__( spatial_scale, out_size, out_channels, no_trans, group_size, part_size, sample_per_part, trans_std) self.deform_fc_channels = deform_fc_channels if not no_trans: self.offset_fc = nn.Sequential( nn.Linear(self.out_size * self.out_size * self.out_channels, self.deform_fc_channels), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_channels, self.deform_fc_channels), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_channels, self.out_size * self.out_size * 2)) self.offset_fc[-1].weight.data.zero_() self.offset_fc[-1].bias.data.zero_() self.mask_fc = nn.Sequential( nn.Linear(self.out_size * self.out_size * self.out_channels, self.deform_fc_channels), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_channels, self.out_size * self.out_size * 1), nn.Sigmoid()) self.mask_fc[2].weight.data.zero_() self.mask_fc[2].bias.data.zero_() def forward(self, data, rois): assert data.size(1) == self.out_channels if self.no_trans: offset = data.new_empty(0) return deform_roi_pooling( data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) else: n = rois.shape[0] offset = data.new_empty(0) x = deform_roi_pooling(data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, True, self.group_size, self.part_size, self.sample_per_part, self.trans_std) offset = self.offset_fc(x.view(n, -1)) offset = offset.view(n, 2, self.out_size, self.out_size) mask = self.mask_fc(x.view(n, -1)) mask = mask.view(n, 1, self.out_size, self.out_size) return deform_roi_pooling( data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) * mask ================================================ FILE: maskrcnn_benchmark/layers/misc.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ helper class that supports empty tensors on some nn functions. Ideally, add support directly in PyTorch to empty tensors in those functions. This can be removed once https://github.com/pytorch/pytorch/issues/12013 is implemented """ import math import torch from torch import nn from torch.nn.modules.utils import _ntuple class _NewEmptyTensorOp(torch.autograd.Function): @staticmethod def forward(ctx, x, new_shape): ctx.shape = x.shape return x.new_empty(new_shape) @staticmethod def backward(ctx, grad): shape = ctx.shape return _NewEmptyTensorOp.apply(grad, shape), None class Conv2d(torch.nn.Conv2d): def forward(self, x): if x.numel() > 0: return super(Conv2d, self).forward(x) # get output shape output_shape = [ (i + 2 * p - (di * (k - 1) + 1)) // d + 1 for i, p, di, k, d in zip( x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride ) ] output_shape = [x.shape[0], self.weight.shape[0]] + output_shape return _NewEmptyTensorOp.apply(x, output_shape) class ConvTranspose2d(torch.nn.ConvTranspose2d): def forward(self, x): if x.numel() > 0: return super(ConvTranspose2d, self).forward(x) # get output shape output_shape = [ (i - 1) * d - 2 * p + (di * (k - 1) + 1) + op for i, p, di, k, d, op in zip( x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride, self.output_padding, ) ] output_shape = [x.shape[0], self.bias.shape[0]] + output_shape return _NewEmptyTensorOp.apply(x, output_shape) class BatchNorm2d(torch.nn.BatchNorm2d): def forward(self, x): if x.numel() > 0: return super(BatchNorm2d, self).forward(x) # get output shape output_shape = x.shape return _NewEmptyTensorOp.apply(x, output_shape) def interpolate( input, size=None, scale_factor=None, mode="nearest", align_corners=None ): if input.numel() > 0: return torch.nn.functional.interpolate( input, size, scale_factor, mode, align_corners ) def _check_size_scale_factor(dim): if size is None and scale_factor is None: raise ValueError("either size or scale_factor should be defined") if size is not None and scale_factor is not None: raise ValueError("only one of size or scale_factor should be defined") if ( scale_factor is not None and isinstance(scale_factor, tuple) and len(scale_factor) != dim ): raise ValueError( "scale_factor shape must match input shape. " "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor)) ) def _output_size(dim): _check_size_scale_factor(dim) if size is not None: return size scale_factors = _ntuple(dim)(scale_factor) # math.floor might return float in py2.7 return [ int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim) ] output_shape = tuple(_output_size(2)) output_shape = input.shape[:-2] + output_shape return _NewEmptyTensorOp.apply(input, output_shape) class DFConv2d(nn.Module): """Deformable convolutional layer""" def __init__( self, in_channels, out_channels, with_modulated_dcn=True, kernel_size=3, stride=1, groups=1, dilation=1, deformable_groups=1, bias=False ): super(DFConv2d, self).__init__() if isinstance(kernel_size, (list, tuple)): assert isinstance(stride, (list, tuple)) assert isinstance(dilation, (list, tuple)) assert len(kernel_size) == 2 assert len(stride) == 2 assert len(dilation) == 2 padding = ( dilation[0] * (kernel_size[0] - 1) // 2, dilation[1] * (kernel_size[1] - 1) // 2 ) offset_base_channels = kernel_size[0] * kernel_size[1] else: padding = dilation * (kernel_size - 1) // 2 offset_base_channels = kernel_size * kernel_size if with_modulated_dcn: from maskrcnn_benchmark.layers import ModulatedDeformConv offset_channels = offset_base_channels * 3 #default: 27 conv_block = ModulatedDeformConv else: from maskrcnn_benchmark.layers import DeformConv offset_channels = offset_base_channels * 2 #default: 18 conv_block = DeformConv self.offset = Conv2d( in_channels, deformable_groups * offset_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=1, dilation=dilation ) for l in [self.offset,]: nn.init.kaiming_uniform_(l.weight, a=1) torch.nn.init.constant_(l.bias, 0.) self.conv = conv_block( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, deformable_groups=deformable_groups, bias=bias ) self.with_modulated_dcn = with_modulated_dcn self.kernel_size = kernel_size self.stride = stride self.padding = padding self.dilation = dilation def forward(self, x): if x.numel() > 0: if not self.with_modulated_dcn: offset = self.offset(x) x = self.conv(x, offset) else: offset_mask = self.offset(x) offset = offset_mask[:, :18, :, :] mask = offset_mask[:, -9:, :, :].sigmoid() x = self.conv(x, offset, mask) return x # get output shape output_shape = [ (i + 2 * p - (di * (k - 1) + 1)) // d + 1 for i, p, di, k, d in zip( x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride ) ] output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape return _NewEmptyTensorOp.apply(x, output_shape) ================================================ FILE: maskrcnn_benchmark/layers/nms.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # from ._utils import _C from maskrcnn_benchmark import _C from apex import amp # Only valid with fp32 inputs - give AMP the hint nms = amp.float_function(_C.nms) # nms.__doc__ = """ # This function performs Non-maximum suppresion""" ================================================ FILE: maskrcnn_benchmark/layers/roi_align.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from torch import nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from maskrcnn_benchmark import _C from apex import amp class _ROIAlign(Function): @staticmethod def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): ctx.save_for_backward(roi) ctx.output_size = _pair(output_size) ctx.spatial_scale = spatial_scale ctx.sampling_ratio = sampling_ratio ctx.input_shape = input.size() output = _C.roi_align_forward( input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio ) return output @staticmethod @once_differentiable def backward(ctx, grad_output): rois, = ctx.saved_tensors output_size = ctx.output_size spatial_scale = ctx.spatial_scale sampling_ratio = ctx.sampling_ratio bs, ch, h, w = ctx.input_shape grad_input = _C.roi_align_backward( grad_output, rois, spatial_scale, output_size[0], output_size[1], bs, ch, h, w, sampling_ratio, ) return grad_input, None, None, None, None roi_align = _ROIAlign.apply class ROIAlign(nn.Module): def __init__(self, output_size, spatial_scale, sampling_ratio): super(ROIAlign, self).__init__() self.output_size = output_size self.spatial_scale = spatial_scale self.sampling_ratio = sampling_ratio @amp.float_function def forward(self, input, rois): return roi_align( input, rois, self.output_size, self.spatial_scale, self.sampling_ratio ) def __repr__(self): tmpstr = self.__class__.__name__ + "(" tmpstr += "output_size=" + str(self.output_size) tmpstr += ", spatial_scale=" + str(self.spatial_scale) tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) tmpstr += ")" return tmpstr ================================================ FILE: maskrcnn_benchmark/layers/roi_pool.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from torch import nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from maskrcnn_benchmark import _C from apex import amp class _ROIPool(Function): @staticmethod def forward(ctx, input, roi, output_size, spatial_scale): ctx.output_size = _pair(output_size) ctx.spatial_scale = spatial_scale ctx.input_shape = input.size() output, argmax = _C.roi_pool_forward( input, roi, spatial_scale, output_size[0], output_size[1] ) ctx.save_for_backward(input, roi, argmax) return output @staticmethod @once_differentiable def backward(ctx, grad_output): input, rois, argmax = ctx.saved_tensors output_size = ctx.output_size spatial_scale = ctx.spatial_scale bs, ch, h, w = ctx.input_shape grad_input = _C.roi_pool_backward( grad_output, input, rois, argmax, spatial_scale, output_size[0], output_size[1], bs, ch, h, w, ) return grad_input, None, None, None roi_pool = _ROIPool.apply class ROIPool(nn.Module): def __init__(self, output_size, spatial_scale): super(ROIPool, self).__init__() self.output_size = output_size self.spatial_scale = spatial_scale @amp.float_function def forward(self, input, rois): return roi_pool(input, rois, self.output_size, self.spatial_scale) def __repr__(self): tmpstr = self.__class__.__name__ + "(" tmpstr += "output_size=" + str(self.output_size) tmpstr += ", spatial_scale=" + str(self.spatial_scale) tmpstr += ")" return tmpstr ================================================ FILE: maskrcnn_benchmark/layers/sigmoid_focal_loss.py ================================================ import torch from torch import nn from torch.autograd import Function from torch.autograd.function import once_differentiable from maskrcnn_benchmark import _C # TODO: Use JIT to replace CUDA implementation in the future. class _SigmoidFocalLoss(Function): @staticmethod def forward(ctx, logits, targets, gamma, alpha): ctx.save_for_backward(logits, targets) num_classes = logits.shape[1] ctx.num_classes = num_classes ctx.gamma = gamma ctx.alpha = alpha losses = _C.sigmoid_focalloss_forward( logits, targets, num_classes, gamma, alpha ) return losses @staticmethod @once_differentiable def backward(ctx, d_loss): logits, targets = ctx.saved_tensors num_classes = ctx.num_classes gamma = ctx.gamma alpha = ctx.alpha d_loss = d_loss.contiguous() d_logits = _C.sigmoid_focalloss_backward( logits, targets, d_loss, num_classes, gamma, alpha ) return d_logits, None, None, None, None sigmoid_focal_loss_cuda = _SigmoidFocalLoss.apply def sigmoid_focal_loss_cpu(logits, targets, gamma, alpha): num_classes = logits.shape[1] dtype = targets.dtype device = targets.device class_range = torch.arange(1, num_classes+1, dtype=dtype, device=device).unsqueeze(0) t = targets.unsqueeze(1) p = torch.sigmoid(logits) term1 = (1 - p) ** gamma * torch.log(p) term2 = p ** gamma * torch.log(1 - p) return -(t == class_range).float() * term1 * alpha - ((t != class_range) * (t >= 0)).float() * term2 * (1 - alpha) class SigmoidFocalLoss(nn.Module): def __init__(self, gamma, alpha): super(SigmoidFocalLoss, self).__init__() self.gamma = gamma self.alpha = alpha def forward(self, logits, targets): device = logits.device if logits.is_cuda: loss_func = sigmoid_focal_loss_cuda else: loss_func = sigmoid_focal_loss_cpu loss = loss_func(logits, targets, self.gamma, self.alpha) return loss.sum() def __repr__(self): tmpstr = self.__class__.__name__ + "(" tmpstr += "gamma=" + str(self.gamma) tmpstr += ", alpha=" + str(self.alpha) tmpstr += ")" return tmpstr ================================================ FILE: maskrcnn_benchmark/layers/smooth_l1_loss.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch # TODO maybe push this to nn? def smooth_l1_loss(input, target, beta=1. / 9, size_average=True): """ very similar to the smooth_l1_loss from pytorch, but with the extra beta parameter """ n = torch.abs(input - target) cond = n < beta loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) if size_average: return loss.mean() return loss.sum() ================================================ FILE: maskrcnn_benchmark/modeling/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/backbone/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .backbone import build_backbone from . import fbnet ================================================ FILE: maskrcnn_benchmark/modeling/backbone/backbone.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from collections import OrderedDict from torch import nn from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform from . import fpn as fpn_module from . import resnet @registry.BACKBONES.register("R-50-C4") @registry.BACKBONES.register("R-50-C5") @registry.BACKBONES.register("R-101-C4") @registry.BACKBONES.register("R-101-C5") def build_resnet_backbone(cfg): body = resnet.ResNet(cfg) model = nn.Sequential(OrderedDict([("body", body)])) model.out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS return model @registry.BACKBONES.register("R-50-FPN") @registry.BACKBONES.register("R-101-FPN") @registry.BACKBONES.register("R-152-FPN") def build_resnet_fpn_backbone(cfg): body = resnet.ResNet(cfg) in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS fpn = fpn_module.FPN( in_channels_list=[ in_channels_stage2, in_channels_stage2 * 2, in_channels_stage2 * 4, in_channels_stage2 * 8, ], out_channels=out_channels, conv_block=conv_with_kaiming_uniform( cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU ), top_blocks=fpn_module.LastLevelMaxPool(), ) model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) model.out_channels = out_channels return model @registry.BACKBONES.register("R-50-FPN-RETINANET") @registry.BACKBONES.register("R-101-FPN-RETINANET") def build_resnet_fpn_p3p7_backbone(cfg): body = resnet.ResNet(cfg) in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS in_channels_p6p7 = in_channels_stage2 * 8 if cfg.MODEL.RETINANET.USE_C5 \ else out_channels fpn = fpn_module.FPN( in_channels_list=[ 0, in_channels_stage2 * 2, in_channels_stage2 * 4, in_channels_stage2 * 8, ], out_channels=out_channels, conv_block=conv_with_kaiming_uniform( cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU ), top_blocks=fpn_module.LastLevelP6P7(in_channels_p6p7, out_channels), ) model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) model.out_channels = out_channels return model def build_backbone(cfg): assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \ "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format( cfg.MODEL.BACKBONE.CONV_BODY ) return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg) ================================================ FILE: maskrcnn_benchmark/modeling/backbone/fbnet.py ================================================ from __future__ import absolute_import, division, print_function, unicode_literals import copy import json import logging from collections import OrderedDict from . import ( fbnet_builder as mbuilder, fbnet_modeldef as modeldef, ) import torch.nn as nn from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.modeling.rpn import rpn from maskrcnn_benchmark.modeling import poolers logger = logging.getLogger(__name__) def create_builder(cfg): bn_type = cfg.MODEL.FBNET.BN_TYPE if bn_type == "gn": bn_type = (bn_type, cfg.GROUP_NORM.NUM_GROUPS) factor = cfg.MODEL.FBNET.SCALE_FACTOR arch = cfg.MODEL.FBNET.ARCH arch_def = cfg.MODEL.FBNET.ARCH_DEF if len(arch_def) > 0: arch_def = json.loads(arch_def) if arch in modeldef.MODEL_ARCH: if len(arch_def) > 0: assert ( arch_def == modeldef.MODEL_ARCH[arch] ), "Two architectures with the same name {},\n{},\n{}".format( arch, arch_def, modeldef.MODEL_ARCH[arch] ) arch_def = modeldef.MODEL_ARCH[arch] else: assert arch_def is not None and len(arch_def) > 0 arch_def = mbuilder.unify_arch_def(arch_def) rpn_stride = arch_def.get("rpn_stride", None) if rpn_stride is not None: assert ( cfg.MODEL.RPN.ANCHOR_STRIDE[0] == rpn_stride ), "Needs to set cfg.MODEL.RPN.ANCHOR_STRIDE to {}, got {}".format( rpn_stride, cfg.MODEL.RPN.ANCHOR_STRIDE ) width_divisor = cfg.MODEL.FBNET.WIDTH_DIVISOR dw_skip_bn = cfg.MODEL.FBNET.DW_CONV_SKIP_BN dw_skip_relu = cfg.MODEL.FBNET.DW_CONV_SKIP_RELU logger.info( "Building fbnet model with arch {} (without scaling):\n{}".format( arch, arch_def ) ) builder = mbuilder.FBNetBuilder( width_ratio=factor, bn_type=bn_type, width_divisor=width_divisor, dw_skip_bn=dw_skip_bn, dw_skip_relu=dw_skip_relu, ) return builder, arch_def def _get_trunk_cfg(arch_def): """ Get all stages except the last one """ num_stages = mbuilder.get_num_stages(arch_def) trunk_stages = arch_def.get("backbone", range(num_stages - 1)) ret = mbuilder.get_blocks(arch_def, stage_indices=trunk_stages) return ret class FBNetTrunk(nn.Module): def __init__( self, builder, arch_def, dim_in, ): super(FBNetTrunk, self).__init__() self.first = builder.add_first(arch_def["first"], dim_in=dim_in) trunk_cfg = _get_trunk_cfg(arch_def) self.stages = builder.add_blocks(trunk_cfg["stages"]) # return features for each stage def forward(self, x): y = self.first(x) y = self.stages(y) ret = [y] return ret @registry.BACKBONES.register("FBNet") def add_conv_body(cfg, dim_in=3): builder, arch_def = create_builder(cfg) body = FBNetTrunk(builder, arch_def, dim_in) model = nn.Sequential(OrderedDict([("body", body)])) model.out_channels = builder.last_depth return model def _get_rpn_stage(arch_def, num_blocks): rpn_stage = arch_def.get("rpn") ret = mbuilder.get_blocks(arch_def, stage_indices=rpn_stage) if num_blocks > 0: logger.warn('Use last {} blocks in {} as rpn'.format(num_blocks, ret)) block_count = len(ret["stages"]) assert num_blocks <= block_count, "use block {}, block count {}".format( num_blocks, block_count ) blocks = range(block_count - num_blocks, block_count) ret = mbuilder.get_blocks(ret, block_indices=blocks) return ret["stages"] class FBNetRPNHead(nn.Module): def __init__( self, cfg, in_channels, builder, arch_def, ): super(FBNetRPNHead, self).__init__() assert in_channels == builder.last_depth rpn_bn_type = cfg.MODEL.FBNET.RPN_BN_TYPE if len(rpn_bn_type) > 0: builder.bn_type = rpn_bn_type use_blocks = cfg.MODEL.FBNET.RPN_HEAD_BLOCKS stages = _get_rpn_stage(arch_def, use_blocks) self.head = builder.add_blocks(stages) self.out_channels = builder.last_depth def forward(self, x): x = [self.head(y) for y in x] return x @registry.RPN_HEADS.register("FBNet.rpn_head") def add_rpn_head(cfg, in_channels, num_anchors): builder, model_arch = create_builder(cfg) builder.last_depth = in_channels assert in_channels == builder.last_depth # builder.name_prefix = "[rpn]" rpn_feature = FBNetRPNHead(cfg, in_channels, builder, model_arch) rpn_regressor = rpn.RPNHeadConvRegressor( cfg, rpn_feature.out_channels, num_anchors) return nn.Sequential(rpn_feature, rpn_regressor) def _get_head_stage(arch, head_name, blocks): # use default name 'head' if the specific name 'head_name' does not existed if head_name not in arch: head_name = "head" head_stage = arch.get(head_name) ret = mbuilder.get_blocks(arch, stage_indices=head_stage, block_indices=blocks) return ret["stages"] # name mapping for head names in arch def and cfg ARCH_CFG_NAME_MAPPING = { "bbox": "ROI_BOX_HEAD", "kpts": "ROI_KEYPOINT_HEAD", "mask": "ROI_MASK_HEAD", } class FBNetROIHead(nn.Module): def __init__( self, cfg, in_channels, builder, arch_def, head_name, use_blocks, stride_init, last_layer_scale, ): super(FBNetROIHead, self).__init__() assert in_channels == builder.last_depth assert isinstance(use_blocks, list) head_cfg_name = ARCH_CFG_NAME_MAPPING[head_name] self.pooler = poolers.make_pooler(cfg, head_cfg_name) stage = _get_head_stage(arch_def, head_name, use_blocks) assert stride_init in [0, 1, 2] if stride_init != 0: stage[0]["block"][3] = stride_init blocks = builder.add_blocks(stage) last_info = copy.deepcopy(arch_def["last"]) last_info[1] = last_layer_scale last = builder.add_last(last_info) self.head = nn.Sequential(OrderedDict([ ("blocks", blocks), ("last", last) ])) self.out_channels = builder.last_depth def forward(self, x, proposals): x = self.pooler(x, proposals) x = self.head(x) return x @registry.ROI_BOX_FEATURE_EXTRACTORS.register("FBNet.roi_head") def add_roi_head(cfg, in_channels): builder, model_arch = create_builder(cfg) builder.last_depth = in_channels # builder.name_prefix = "_[bbox]_" return FBNetROIHead( cfg, in_channels, builder, model_arch, head_name="bbox", use_blocks=cfg.MODEL.FBNET.DET_HEAD_BLOCKS, stride_init=cfg.MODEL.FBNET.DET_HEAD_STRIDE, last_layer_scale=cfg.MODEL.FBNET.DET_HEAD_LAST_SCALE, ) @registry.ROI_KEYPOINT_FEATURE_EXTRACTORS.register("FBNet.roi_head_keypoints") def add_roi_head_keypoints(cfg, in_channels): builder, model_arch = create_builder(cfg) builder.last_depth = in_channels # builder.name_prefix = "_[kpts]_" return FBNetROIHead( cfg, in_channels, builder, model_arch, head_name="kpts", use_blocks=cfg.MODEL.FBNET.KPTS_HEAD_BLOCKS, stride_init=cfg.MODEL.FBNET.KPTS_HEAD_STRIDE, last_layer_scale=cfg.MODEL.FBNET.KPTS_HEAD_LAST_SCALE, ) @registry.ROI_MASK_FEATURE_EXTRACTORS.register("FBNet.roi_head_mask") def add_roi_head_mask(cfg, in_channels): builder, model_arch = create_builder(cfg) builder.last_depth = in_channels # builder.name_prefix = "_[mask]_" return FBNetROIHead( cfg, in_channels, builder, model_arch, head_name="mask", use_blocks=cfg.MODEL.FBNET.MASK_HEAD_BLOCKS, stride_init=cfg.MODEL.FBNET.MASK_HEAD_STRIDE, last_layer_scale=cfg.MODEL.FBNET.MASK_HEAD_LAST_SCALE, ) ================================================ FILE: maskrcnn_benchmark/modeling/backbone/fbnet_builder.py ================================================ """ FBNet model builder """ from __future__ import absolute_import, division, print_function, unicode_literals import copy import logging import math from collections import OrderedDict import torch import torch.nn as nn from maskrcnn_benchmark.layers import ( BatchNorm2d, Conv2d, FrozenBatchNorm2d, interpolate, ) from maskrcnn_benchmark.layers.misc import _NewEmptyTensorOp logger = logging.getLogger(__name__) def _py2_round(x): return math.floor(x + 0.5) if x >= 0.0 else math.ceil(x - 0.5) def _get_divisible_by(num, divisible_by, min_val): ret = int(num) if divisible_by > 0 and num % divisible_by != 0: ret = int((_py2_round(num / divisible_by) or min_val) * divisible_by) return ret PRIMITIVES = { "skip": lambda C_in, C_out, expansion, stride, **kwargs: Identity( C_in, C_out, stride ), "ir_k3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, **kwargs ), "ir_k5": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, kernel=5, **kwargs ), "ir_k7": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, kernel=7, **kwargs ), "ir_k1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, kernel=1, **kwargs ), "shuffle": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, shuffle_type="mid", pw_group=4, **kwargs ), "basic_block": lambda C_in, C_out, expansion, stride, **kwargs: CascadeConv3x3( C_in, C_out, stride ), "shift_5x5": lambda C_in, C_out, expansion, stride, **kwargs: ShiftBlock5x5( C_in, C_out, expansion, stride ), # layer search 2 "ir_k3_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=3, **kwargs ), "ir_k3_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=3, **kwargs ), "ir_k3_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=3, **kwargs ), "ir_k3_s4": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 4, stride, kernel=3, shuffle_type="mid", pw_group=4, **kwargs ), "ir_k5_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=5, **kwargs ), "ir_k5_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=5, **kwargs ), "ir_k5_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=5, **kwargs ), "ir_k5_s4": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 4, stride, kernel=5, shuffle_type="mid", pw_group=4, **kwargs ), # layer search se "ir_k3_e1_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=3, se=True, **kwargs ), "ir_k3_e3_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=3, se=True, **kwargs ), "ir_k3_e6_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=3, se=True, **kwargs ), "ir_k3_s4_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 4, stride, kernel=3, shuffle_type="mid", pw_group=4, se=True, **kwargs ), "ir_k5_e1_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=5, se=True, **kwargs ), "ir_k5_e3_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=5, se=True, **kwargs ), "ir_k5_e6_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=5, se=True, **kwargs ), "ir_k5_s4_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 4, stride, kernel=5, shuffle_type="mid", pw_group=4, se=True, **kwargs ), # layer search 3 (in addition to layer search 2) "ir_k3_s2": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=3, shuffle_type="mid", pw_group=2, **kwargs ), "ir_k5_s2": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=5, shuffle_type="mid", pw_group=2, **kwargs ), "ir_k3_s2_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=3, shuffle_type="mid", pw_group=2, se=True, **kwargs ), "ir_k5_s2_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=5, shuffle_type="mid", pw_group=2, se=True, **kwargs ), # layer search 4 (in addition to layer search 3) "ir_k3_sep": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, kernel=3, cdw=True, **kwargs ), "ir_k33_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=3, cdw=True, **kwargs ), "ir_k33_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=3, cdw=True, **kwargs ), "ir_k33_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=3, cdw=True, **kwargs ), # layer search 5 (in addition to layer search 4) "ir_k7_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=7, **kwargs ), "ir_k7_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=7, **kwargs ), "ir_k7_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=7, **kwargs ), "ir_k7_sep": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, kernel=7, cdw=True, **kwargs ), "ir_k7_sep_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=7, cdw=True, **kwargs ), "ir_k7_sep_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=7, cdw=True, **kwargs ), "ir_k7_sep_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=7, cdw=True, **kwargs ), } class Identity(nn.Module): def __init__(self, C_in, C_out, stride): super(Identity, self).__init__() self.conv = ( ConvBNRelu( C_in, C_out, kernel=1, stride=stride, pad=0, no_bias=1, use_relu="relu", bn_type="bn", ) if C_in != C_out or stride != 1 else None ) def forward(self, x): if self.conv: out = self.conv(x) else: out = x return out class CascadeConv3x3(nn.Sequential): def __init__(self, C_in, C_out, stride): assert stride in [1, 2] ops = [ Conv2d(C_in, C_in, 3, stride, 1, bias=False), BatchNorm2d(C_in), nn.ReLU(inplace=True), Conv2d(C_in, C_out, 3, 1, 1, bias=False), BatchNorm2d(C_out), ] super(CascadeConv3x3, self).__init__(*ops) self.res_connect = (stride == 1) and (C_in == C_out) def forward(self, x): y = super(CascadeConv3x3, self).forward(x) if self.res_connect: y += x return y class Shift(nn.Module): def __init__(self, C, kernel_size, stride, padding): super(Shift, self).__init__() self.C = C kernel = torch.zeros((C, 1, kernel_size, kernel_size), dtype=torch.float32) ch_idx = 0 assert stride in [1, 2] self.stride = stride self.padding = padding self.kernel_size = kernel_size self.dilation = 1 hks = kernel_size // 2 ksq = kernel_size ** 2 for i in range(kernel_size): for j in range(kernel_size): if i == hks and j == hks: num_ch = C // ksq + C % ksq else: num_ch = C // ksq kernel[ch_idx : ch_idx + num_ch, 0, i, j] = 1 ch_idx += num_ch self.register_parameter("bias", None) self.kernel = nn.Parameter(kernel, requires_grad=False) def forward(self, x): if x.numel() > 0: return nn.functional.conv2d( x, self.kernel, self.bias, (self.stride, self.stride), (self.padding, self.padding), self.dilation, self.C, # groups ) output_shape = [ (i + 2 * p - (di * (k - 1) + 1)) // d + 1 for i, p, di, k, d in zip( x.shape[-2:], (self.padding, self.dilation), (self.dilation, self.dilation), (self.kernel_size, self.kernel_size), (self.stride, self.stride), ) ] output_shape = [x.shape[0], self.C] + output_shape return _NewEmptyTensorOp.apply(x, output_shape) class ShiftBlock5x5(nn.Sequential): def __init__(self, C_in, C_out, expansion, stride): assert stride in [1, 2] self.res_connect = (stride == 1) and (C_in == C_out) C_mid = _get_divisible_by(C_in * expansion, 8, 8) ops = [ # pw Conv2d(C_in, C_mid, 1, 1, 0, bias=False), BatchNorm2d(C_mid), nn.ReLU(inplace=True), # shift Shift(C_mid, 5, stride, 2), # pw-linear Conv2d(C_mid, C_out, 1, 1, 0, bias=False), BatchNorm2d(C_out), ] super(ShiftBlock5x5, self).__init__(*ops) def forward(self, x): y = super(ShiftBlock5x5, self).forward(x) if self.res_connect: y += x return y class ChannelShuffle(nn.Module): def __init__(self, groups): super(ChannelShuffle, self).__init__() self.groups = groups def forward(self, x): """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]""" N, C, H, W = x.size() g = self.groups assert C % g == 0, "Incompatible group size {} for input channel {}".format( g, C ) return ( x.view(N, g, int(C / g), H, W) .permute(0, 2, 1, 3, 4) .contiguous() .view(N, C, H, W) ) class ConvBNRelu(nn.Sequential): def __init__( self, input_depth, output_depth, kernel, stride, pad, no_bias, use_relu, bn_type, group=1, *args, **kwargs ): super(ConvBNRelu, self).__init__() assert use_relu in ["relu", None] if isinstance(bn_type, (list, tuple)): assert len(bn_type) == 2 assert bn_type[0] == "gn" gn_group = bn_type[1] bn_type = bn_type[0] assert bn_type in ["bn", "af", "gn", None] assert stride in [1, 2, 4] op = Conv2d( input_depth, output_depth, kernel_size=kernel, stride=stride, padding=pad, bias=not no_bias, groups=group, *args, **kwargs ) nn.init.kaiming_normal_(op.weight, mode="fan_out", nonlinearity="relu") if op.bias is not None: nn.init.constant_(op.bias, 0.0) self.add_module("conv", op) if bn_type == "bn": bn_op = BatchNorm2d(output_depth) elif bn_type == "gn": bn_op = nn.GroupNorm(num_groups=gn_group, num_channels=output_depth) elif bn_type == "af": bn_op = FrozenBatchNorm2d(output_depth) if bn_type is not None: self.add_module("bn", bn_op) if use_relu == "relu": self.add_module("relu", nn.ReLU(inplace=True)) class SEModule(nn.Module): reduction = 4 def __init__(self, C): super(SEModule, self).__init__() mid = max(C // self.reduction, 8) conv1 = Conv2d(C, mid, 1, 1, 0) conv2 = Conv2d(mid, C, 1, 1, 0) self.op = nn.Sequential( nn.AdaptiveAvgPool2d(1), conv1, nn.ReLU(inplace=True), conv2, nn.Sigmoid() ) def forward(self, x): return x * self.op(x) class Upsample(nn.Module): def __init__(self, scale_factor, mode, align_corners=None): super(Upsample, self).__init__() self.scale = scale_factor self.mode = mode self.align_corners = align_corners def forward(self, x): return interpolate( x, scale_factor=self.scale, mode=self.mode, align_corners=self.align_corners ) def _get_upsample_op(stride): assert ( stride in [1, 2, 4] or stride in [-1, -2, -4] or (isinstance(stride, tuple) and all(x in [-1, -2, -4] for x in stride)) ) scales = stride ret = None if isinstance(stride, tuple) or stride < 0: scales = [-x for x in stride] if isinstance(stride, tuple) else -stride stride = 1 ret = Upsample(scale_factor=scales, mode="nearest", align_corners=None) return ret, stride class IRFBlock(nn.Module): def __init__( self, input_depth, output_depth, expansion, stride, bn_type="bn", kernel=3, width_divisor=1, shuffle_type=None, pw_group=1, se=False, cdw=False, dw_skip_bn=False, dw_skip_relu=False, ): super(IRFBlock, self).__init__() assert kernel in [1, 3, 5, 7], kernel self.use_res_connect = stride == 1 and input_depth == output_depth self.output_depth = output_depth mid_depth = int(input_depth * expansion) mid_depth = _get_divisible_by(mid_depth, width_divisor, width_divisor) # pw self.pw = ConvBNRelu( input_depth, mid_depth, kernel=1, stride=1, pad=0, no_bias=1, use_relu="relu", bn_type=bn_type, group=pw_group, ) # negative stride to do upsampling self.upscale, stride = _get_upsample_op(stride) # dw if kernel == 1: self.dw = nn.Sequential() elif cdw: dw1 = ConvBNRelu( mid_depth, mid_depth, kernel=kernel, stride=stride, pad=(kernel // 2), group=mid_depth, no_bias=1, use_relu="relu", bn_type=bn_type, ) dw2 = ConvBNRelu( mid_depth, mid_depth, kernel=kernel, stride=1, pad=(kernel // 2), group=mid_depth, no_bias=1, use_relu="relu" if not dw_skip_relu else None, bn_type=bn_type if not dw_skip_bn else None, ) self.dw = nn.Sequential(OrderedDict([("dw1", dw1), ("dw2", dw2)])) else: self.dw = ConvBNRelu( mid_depth, mid_depth, kernel=kernel, stride=stride, pad=(kernel // 2), group=mid_depth, no_bias=1, use_relu="relu" if not dw_skip_relu else None, bn_type=bn_type if not dw_skip_bn else None, ) # pw-linear self.pwl = ConvBNRelu( mid_depth, output_depth, kernel=1, stride=1, pad=0, no_bias=1, use_relu=None, bn_type=bn_type, group=pw_group, ) self.shuffle_type = shuffle_type if shuffle_type is not None: self.shuffle = ChannelShuffle(pw_group) self.se4 = SEModule(output_depth) if se else nn.Sequential() self.output_depth = output_depth def forward(self, x): y = self.pw(x) if self.shuffle_type == "mid": y = self.shuffle(y) if self.upscale is not None: y = self.upscale(y) y = self.dw(y) y = self.pwl(y) if self.use_res_connect: y += x y = self.se4(y) return y def _expand_block_cfg(block_cfg): assert isinstance(block_cfg, list) ret = [] for idx in range(block_cfg[2]): cur = copy.deepcopy(block_cfg) cur[2] = 1 cur[3] = 1 if idx >= 1 else cur[3] ret.append(cur) return ret def expand_stage_cfg(stage_cfg): """ For a single stage """ assert isinstance(stage_cfg, list) ret = [] for x in stage_cfg: ret += _expand_block_cfg(x) return ret def expand_stages_cfg(stage_cfgs): """ For a list of stages """ assert isinstance(stage_cfgs, list) ret = [] for x in stage_cfgs: ret.append(expand_stage_cfg(x)) return ret def _block_cfgs_to_list(block_cfgs): assert isinstance(block_cfgs, list) ret = [] for stage_idx, stage in enumerate(block_cfgs): stage = expand_stage_cfg(stage) for block_idx, block in enumerate(stage): cur = {"stage_idx": stage_idx, "block_idx": block_idx, "block": block} ret.append(cur) return ret def _add_to_arch(arch, info, name): """ arch = [{block_0}, {block_1}, ...] info = [ # stage 0 [ block0_info, block1_info, ... ], ... ] convert to: arch = [ { block_0, name: block0_info, }, { block_1, name: block1_info, }, ... ] """ assert isinstance(arch, list) and all(isinstance(x, dict) for x in arch) assert isinstance(info, list) and all(isinstance(x, list) for x in info) idx = 0 for stage_idx, stage in enumerate(info): for block_idx, block in enumerate(stage): assert ( arch[idx]["stage_idx"] == stage_idx and arch[idx]["block_idx"] == block_idx ), "Index ({}, {}) does not match for block {}".format( stage_idx, block_idx, arch[idx] ) assert name not in arch[idx] arch[idx][name] = block idx += 1 def unify_arch_def(arch_def): """ unify the arch_def to: { ..., "arch": [ { "stage_idx": idx, "block_idx": idx, ... }, {}, ... ] } """ ret = copy.deepcopy(arch_def) assert "block_cfg" in arch_def and "stages" in arch_def["block_cfg"] assert "stages" not in ret # copy 'first', 'last' etc. inside arch_def['block_cfg'] to ret ret.update({x: arch_def["block_cfg"][x] for x in arch_def["block_cfg"]}) ret["stages"] = _block_cfgs_to_list(arch_def["block_cfg"]["stages"]) del ret["block_cfg"] assert "block_op_type" in arch_def _add_to_arch(ret["stages"], arch_def["block_op_type"], "block_op_type") del ret["block_op_type"] return ret def get_num_stages(arch_def): ret = 0 for x in arch_def["stages"]: ret = max(x["stage_idx"], ret) ret = ret + 1 return ret def get_blocks(arch_def, stage_indices=None, block_indices=None): ret = copy.deepcopy(arch_def) ret["stages"] = [] for block in arch_def["stages"]: keep = True if stage_indices not in (None, []) and block["stage_idx"] not in stage_indices: keep = False if block_indices not in (None, []) and block["block_idx"] not in block_indices: keep = False if keep: ret["stages"].append(block) return ret class FBNetBuilder(object): def __init__( self, width_ratio, bn_type="bn", width_divisor=1, dw_skip_bn=False, dw_skip_relu=False, ): self.width_ratio = width_ratio self.last_depth = -1 self.bn_type = bn_type self.width_divisor = width_divisor self.dw_skip_bn = dw_skip_bn self.dw_skip_relu = dw_skip_relu def add_first(self, stage_info, dim_in=3, pad=True): # stage_info: [c, s, kernel] assert len(stage_info) >= 2 channel = stage_info[0] stride = stage_info[1] out_depth = self._get_divisible_width(int(channel * self.width_ratio)) kernel = 3 if len(stage_info) > 2: kernel = stage_info[2] out = ConvBNRelu( dim_in, out_depth, kernel=kernel, stride=stride, pad=kernel // 2 if pad else 0, no_bias=1, use_relu="relu", bn_type=self.bn_type, ) self.last_depth = out_depth return out def add_blocks(self, blocks): """ blocks: [{}, {}, ...] """ assert isinstance(blocks, list) and all( isinstance(x, dict) for x in blocks ), blocks modules = OrderedDict() for block in blocks: stage_idx = block["stage_idx"] block_idx = block["block_idx"] block_op_type = block["block_op_type"] tcns = block["block"] n = tcns[2] assert n == 1 nnblock = self.add_ir_block(tcns, [block_op_type]) nn_name = "xif{}_{}".format(stage_idx, block_idx) assert nn_name not in modules modules[nn_name] = nnblock ret = nn.Sequential(modules) return ret def add_last(self, stage_info): """ skip last layer if channel_scale == 0 use the same output channel if channel_scale < 0 """ assert len(stage_info) == 2 channels = stage_info[0] channel_scale = stage_info[1] if channel_scale == 0.0: return nn.Sequential() if channel_scale > 0: last_channel = ( int(channels * self.width_ratio) if self.width_ratio > 1.0 else channels ) last_channel = int(last_channel * channel_scale) else: last_channel = int(self.last_depth * (-channel_scale)) last_channel = self._get_divisible_width(last_channel) if last_channel == 0: return nn.Sequential() dim_in = self.last_depth ret = ConvBNRelu( dim_in, last_channel, kernel=1, stride=1, pad=0, no_bias=1, use_relu="relu", bn_type=self.bn_type, ) self.last_depth = last_channel return ret # def add_final_pool(self, model, blob_in, kernel_size): # ret = model.AveragePool(blob_in, "final_avg", kernel=kernel_size, stride=1) # return ret def _add_ir_block( self, dim_in, dim_out, stride, expand_ratio, block_op_type, **kwargs ): ret = PRIMITIVES[block_op_type]( dim_in, dim_out, expansion=expand_ratio, stride=stride, bn_type=self.bn_type, width_divisor=self.width_divisor, dw_skip_bn=self.dw_skip_bn, dw_skip_relu=self.dw_skip_relu, **kwargs ) return ret, ret.output_depth def add_ir_block(self, tcns, block_op_types, **kwargs): t, c, n, s = tcns assert n == 1 out_depth = self._get_divisible_width(int(c * self.width_ratio)) dim_in = self.last_depth op, ret_depth = self._add_ir_block( dim_in, out_depth, stride=s, expand_ratio=t, block_op_type=block_op_types[0], **kwargs ) self.last_depth = ret_depth return op def _get_divisible_width(self, width): ret = _get_divisible_by(int(width), self.width_divisor, self.width_divisor) return ret ================================================ FILE: maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py ================================================ from __future__ import absolute_import, division, print_function, unicode_literals def add_archs(archs): global MODEL_ARCH for x in archs: assert x not in MODEL_ARCH, "Duplicated model name {} existed".format(x) MODEL_ARCH[x] = archs[x] MODEL_ARCH = { "default": { "block_op_type": [ # stage 0 ["ir_k3"], # stage 1 ["ir_k3"] * 2, # stage 2 ["ir_k3"] * 3, # stage 3 ["ir_k3"] * 7, # stage 4, bbox head ["ir_k3"] * 4, # stage 5, rpn ["ir_k3"] * 3, # stage 5, mask head ["ir_k3"] * 5, ], "block_cfg": { "first": [32, 2], "stages": [ # [t, c, n, s] # stage 0 [[1, 16, 1, 1]], # stage 1 [[6, 24, 2, 2]], # stage 2 [[6, 32, 3, 2]], # stage 3 [[6, 64, 4, 2], [6, 96, 3, 1]], # stage 4, bbox head [[4, 160, 1, 2], [6, 160, 2, 1], [6, 240, 1, 1]], # [[6, 160, 3, 2], [6, 320, 1, 1]], # stage 5, rpn head [[6, 96, 3, 1]], # stage 6, mask head [[4, 160, 1, 1], [6, 160, 3, 1], [3, 80, 1, -2]], ], # [c, channel_scale] "last": [0, 0.0], "backbone": [0, 1, 2, 3], "rpn": [5], "bbox": [4], "mask": [6], }, }, "xirb16d_dsmask": { "block_op_type": [ # stage 0 ["ir_k3"], # stage 1 ["ir_k3"] * 2, # stage 2 ["ir_k3"] * 3, # stage 3 ["ir_k3"] * 7, # stage 4, bbox head ["ir_k3"] * 4, # stage 5, mask head ["ir_k3"] * 5, # stage 6, rpn ["ir_k3"] * 3, ], "block_cfg": { "first": [16, 2], "stages": [ # [t, c, n, s] # stage 0 [[1, 16, 1, 1]], # stage 1 [[6, 32, 2, 2]], # stage 2 [[6, 48, 3, 2]], # stage 3 [[6, 96, 4, 2], [6, 128, 3, 1]], # stage 4, bbox head [[4, 128, 1, 2], [6, 128, 2, 1], [6, 160, 1, 1]], # stage 5, mask head [[4, 128, 1, 2], [6, 128, 2, 1], [6, 128, 1, -2], [3, 64, 1, -2]], # stage 6, rpn head [[6, 128, 3, 1]], ], # [c, channel_scale] "last": [0, 0.0], "backbone": [0, 1, 2, 3], "rpn": [6], "bbox": [4], "mask": [5], }, }, "mobilenet_v2": { "block_op_type": [ # stage 0 ["ir_k3"], # stage 1 ["ir_k3"] * 2, # stage 2 ["ir_k3"] * 3, # stage 3 ["ir_k3"] * 7, # stage 4 ["ir_k3"] * 4, ], "block_cfg": { "first": [32, 2], "stages": [ # [t, c, n, s] # stage 0 [[1, 16, 1, 1]], # stage 1 [[6, 24, 2, 2]], # stage 2 [[6, 32, 3, 2]], # stage 3 [[6, 64, 4, 2], [6, 96, 3, 1]], # stage 4 [[6, 160, 3, 1], [6, 320, 1, 1]], ], # [c, channel_scale] "last": [0, 0.0], "backbone": [0, 1, 2, 3], "bbox": [4], }, }, } MODEL_ARCH_CHAM = { "cham_v1a": { "block_op_type": [ # stage 0 ["ir_k3"], # stage 1 ["ir_k7"] * 2, # stage 2 ["ir_k3"] * 5, # stage 3 ["ir_k5"] * 7 + ["ir_k3"] * 5, # stage 4, bbox head ["ir_k3"] * 5, # stage 5, rpn ["ir_k3"] * 3, ], "block_cfg": { "first": [32, 2], "stages": [ # [t, c, n, s] # stage 0 [[1, 24, 1, 1]], # stage 1 [[4, 48, 2, 2]], # stage 2 [[7, 64, 5, 2]], # stage 3 [[12, 56, 7, 2], [8, 88, 5, 1]], # stage 4, bbox head [[7, 152, 4, 2], [10, 104, 1, 1]], # stage 5, rpn head [[8, 88, 3, 1]], ], # [c, channel_scale] "last": [0, 0.0], "backbone": [0, 1, 2, 3], "rpn": [5], "bbox": [4], }, }, "cham_v2": { "block_op_type": [ # stage 0 ["ir_k3"], # stage 1 ["ir_k5"] * 4, # stage 2 ["ir_k7"] * 6, # stage 3 ["ir_k5"] * 3 + ["ir_k3"] * 6, # stage 4, bbox head ["ir_k3"] * 7, # stage 5, rpn ["ir_k3"] * 1, ], "block_cfg": { "first": [32, 2], "stages": [ # [t, c, n, s] # stage 0 [[1, 24, 1, 1]], # stage 1 [[8, 32, 4, 2]], # stage 2 [[5, 48, 6, 2]], # stage 3 [[9, 56, 3, 2], [6, 56, 6, 1]], # stage 4, bbox head [[2, 160, 6, 2], [6, 112, 1, 1]], # stage 5, rpn head [[6, 56, 1, 1]], ], # [c, channel_scale] "last": [0, 0.0], "backbone": [0, 1, 2, 3], "rpn": [5], "bbox": [4], }, }, } add_archs(MODEL_ARCH_CHAM) ================================================ FILE: maskrcnn_benchmark/modeling/backbone/fpn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch import torch.nn.functional as F from torch import nn class FPN(nn.Module): """ Module that adds FPN on top of a list of feature maps. The feature maps are currently supposed to be in increasing depth order, and must be consecutive """ def __init__( self, in_channels_list, out_channels, conv_block, top_blocks=None ): """ Arguments: in_channels_list (list[int]): number of channels for each feature map that will be fed out_channels (int): number of channels of the FPN representation top_blocks (nn.Module or None): if provided, an extra operation will be performed on the output of the last (smallest resolution) FPN output, and the result will extend the result list """ super(FPN, self).__init__() self.inner_blocks = [] self.layer_blocks = [] for idx, in_channels in enumerate(in_channels_list, 1): inner_block = "fpn_inner{}".format(idx) layer_block = "fpn_layer{}".format(idx) if in_channels == 0: continue inner_block_module = conv_block(in_channels, out_channels, 1) layer_block_module = conv_block(out_channels, out_channels, 3, 1) self.add_module(inner_block, inner_block_module) self.add_module(layer_block, layer_block_module) self.inner_blocks.append(inner_block) self.layer_blocks.append(layer_block) self.top_blocks = top_blocks def forward(self, x): """ Arguments: x (list[Tensor]): feature maps for each feature level. Returns: results (tuple[Tensor]): feature maps after FPN layers. They are ordered from highest resolution first. """ last_inner = getattr(self, self.inner_blocks[-1])(x[-1]) results = [] results.append(getattr(self, self.layer_blocks[-1])(last_inner)) for feature, inner_block, layer_block in zip( x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1] ): if not inner_block: continue inner_top_down = F.interpolate(last_inner, scale_factor=2, mode="nearest") inner_lateral = getattr(self, inner_block)(feature) # TODO use size instead of scale to make it robust to different sizes # inner_top_down = F.upsample(last_inner, size=inner_lateral.shape[-2:], # mode='bilinear', align_corners=False) last_inner = inner_lateral + inner_top_down results.insert(0, getattr(self, layer_block)(last_inner)) if isinstance(self.top_blocks, LastLevelP6P7): last_results = self.top_blocks(x[-1], results[-1]) results.extend(last_results) elif isinstance(self.top_blocks, LastLevelMaxPool): last_results = self.top_blocks(results[-1]) results.extend(last_results) return tuple(results) class LastLevelMaxPool(nn.Module): def forward(self, x): return [F.max_pool2d(x, 1, 2, 0)] class LastLevelP6P7(nn.Module): """ This module is used in RetinaNet to generate extra layers, P6 and P7. """ def __init__(self, in_channels, out_channels): super(LastLevelP6P7, self).__init__() self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) for module in [self.p6, self.p7]: nn.init.kaiming_uniform_(module.weight, a=1) nn.init.constant_(module.bias, 0) self.use_P5 = in_channels == out_channels def forward(self, c5, p5): x = p5 if self.use_P5 else c5 p6 = self.p6(x) p7 = self.p7(F.relu(p6)) return [p6, p7] ================================================ FILE: maskrcnn_benchmark/modeling/backbone/resnet.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ Variant of the resnet module that takes cfg as an argument. Example usage. Strings may be specified in the config file. model = ResNet( "StemWithFixedBatchNorm", "BottleneckWithFixedBatchNorm", "ResNet50StagesTo4", ) OR: model = ResNet( "StemWithGN", "BottleneckWithGN", "ResNet50StagesTo4", ) Custom implementations may be written in user code and hooked in via the `register_*` functions. """ from collections import namedtuple import torch import torch.nn.functional as F from torch import nn from maskrcnn_benchmark.layers import FrozenBatchNorm2d from maskrcnn_benchmark.layers import Conv2d from maskrcnn_benchmark.layers import DFConv2d from maskrcnn_benchmark.modeling.make_layers import group_norm from maskrcnn_benchmark.utils.registry import Registry # ResNet stage specification StageSpec = namedtuple( "StageSpec", [ "index", # Index of the stage, eg 1, 2, ..,. 5 "block_count", # Number of residual blocks in the stage "return_features", # True => return the last feature map from this stage ], ) # ----------------------------------------------------------------------------- # Standard ResNet models # ----------------------------------------------------------------------------- # ResNet-50 (including all stages) ResNet50StagesTo5 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, False), (4, 3, True)) ) # ResNet-50 up to stage 4 (excludes stage 5) ResNet50StagesTo4 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, True)) ) # ResNet-101 (including all stages) ResNet101StagesTo5 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, False), (4, 3, True)) ) # ResNet-101 up to stage 4 (excludes stage 5) ResNet101StagesTo4 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, True)) ) # ResNet-50-FPN (including all stages) ResNet50FPNStagesTo5 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 6, True), (4, 3, True)) ) # ResNet-101-FPN (including all stages) ResNet101FPNStagesTo5 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 23, True), (4, 3, True)) ) # ResNet-152-FPN (including all stages) ResNet152FPNStagesTo5 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, True), (2, 8, True), (3, 36, True), (4, 3, True)) ) class ResNet(nn.Module): def __init__(self, cfg): super(ResNet, self).__init__() # If we want to use the cfg in forward(), then we should make a copy # of it and store it for later use: # self.cfg = cfg.clone() # Translate string names to implementations stem_module = _STEM_MODULES[cfg.MODEL.RESNETS.STEM_FUNC] stage_specs = _STAGE_SPECS[cfg.MODEL.BACKBONE.CONV_BODY] transformation_module = _TRANSFORMATION_MODULES[cfg.MODEL.RESNETS.TRANS_FUNC] # Construct the stem module self.stem = stem_module(cfg) # Constuct the specified ResNet stages num_groups = cfg.MODEL.RESNETS.NUM_GROUPS width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS stage2_bottleneck_channels = num_groups * width_per_group stage2_out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS self.stages = [] self.return_features = {} for stage_spec in stage_specs: name = "layer" + str(stage_spec.index) stage2_relative_factor = 2 ** (stage_spec.index - 1) bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor out_channels = stage2_out_channels * stage2_relative_factor stage_with_dcn = cfg.MODEL.RESNETS.STAGE_WITH_DCN[stage_spec.index -1] module = _make_stage( transformation_module, in_channels, bottleneck_channels, out_channels, stage_spec.block_count, num_groups, cfg.MODEL.RESNETS.STRIDE_IN_1X1, first_stride=int(stage_spec.index > 1) + 1, dcn_config={ "stage_with_dcn": stage_with_dcn, "with_modulated_dcn": cfg.MODEL.RESNETS.WITH_MODULATED_DCN, "deformable_groups": cfg.MODEL.RESNETS.DEFORMABLE_GROUPS, } ) in_channels = out_channels self.add_module(name, module) self.stages.append(name) self.return_features[name] = stage_spec.return_features # Optionally freeze (requires_grad=False) parts of the backbone self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT) def _freeze_backbone(self, freeze_at): if freeze_at < 0: return for stage_index in range(freeze_at): if stage_index == 0: m = self.stem # stage 0 is the stem else: m = getattr(self, "layer" + str(stage_index)) for p in m.parameters(): p.requires_grad = False def forward(self, x): outputs = [] x = self.stem(x) for stage_name in self.stages: x = getattr(self, stage_name)(x) if self.return_features[stage_name]: outputs.append(x) return outputs class ResNetHead(nn.Module): def __init__( self, block_module, stages, num_groups=1, width_per_group=64, stride_in_1x1=True, stride_init=None, res2_out_channels=256, dilation=1, dcn_config={} ): super(ResNetHead, self).__init__() stage2_relative_factor = 2 ** (stages[0].index - 1) stage2_bottleneck_channels = num_groups * width_per_group out_channels = res2_out_channels * stage2_relative_factor in_channels = out_channels // 2 bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor block_module = _TRANSFORMATION_MODULES[block_module] self.stages = [] stride = stride_init for stage in stages: name = "layer" + str(stage.index) if not stride: stride = int(stage.index > 1) + 1 module = _make_stage( block_module, in_channels, bottleneck_channels, out_channels, stage.block_count, num_groups, stride_in_1x1, first_stride=stride, dilation=dilation, dcn_config=dcn_config ) stride = None self.add_module(name, module) self.stages.append(name) self.out_channels = out_channels def forward(self, x): for stage in self.stages: x = getattr(self, stage)(x) return x def _make_stage( transformation_module, in_channels, bottleneck_channels, out_channels, block_count, num_groups, stride_in_1x1, first_stride, dilation=1, dcn_config={} ): blocks = [] stride = first_stride for _ in range(block_count): blocks.append( transformation_module( in_channels, bottleneck_channels, out_channels, num_groups, stride_in_1x1, stride, dilation=dilation, dcn_config=dcn_config ) ) stride = 1 in_channels = out_channels return nn.Sequential(*blocks) class Bottleneck(nn.Module): def __init__( self, in_channels, bottleneck_channels, out_channels, num_groups, stride_in_1x1, stride, dilation, norm_func, dcn_config ): super(Bottleneck, self).__init__() self.downsample = None if in_channels != out_channels: down_stride = stride if dilation == 1 else 1 self.downsample = nn.Sequential( Conv2d( in_channels, out_channels, kernel_size=1, stride=down_stride, bias=False ), norm_func(out_channels), ) for modules in [self.downsample,]: for l in modules.modules(): if isinstance(l, Conv2d): nn.init.kaiming_uniform_(l.weight, a=1) if dilation > 1: stride = 1 # reset to be 1 # The original MSRA ResNet models have stride in the first 1x1 conv # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have # stride in the 3x3 conv stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) self.conv1 = Conv2d( in_channels, bottleneck_channels, kernel_size=1, stride=stride_1x1, bias=False, ) self.bn1 = norm_func(bottleneck_channels) # TODO: specify init for the above with_dcn = dcn_config.get("stage_with_dcn", False) if with_dcn: deformable_groups = dcn_config.get("deformable_groups", 1) with_modulated_dcn = dcn_config.get("with_modulated_dcn", False) self.conv2 = DFConv2d( bottleneck_channels, bottleneck_channels, with_modulated_dcn=with_modulated_dcn, kernel_size=3, stride=stride_3x3, groups=num_groups, dilation=dilation, deformable_groups=deformable_groups, bias=False ) else: self.conv2 = Conv2d( bottleneck_channels, bottleneck_channels, kernel_size=3, stride=stride_3x3, padding=dilation, bias=False, groups=num_groups, dilation=dilation ) nn.init.kaiming_uniform_(self.conv2.weight, a=1) self.bn2 = norm_func(bottleneck_channels) self.conv3 = Conv2d( bottleneck_channels, out_channels, kernel_size=1, bias=False ) self.bn3 = norm_func(out_channels) for l in [self.conv1, self.conv3,]: nn.init.kaiming_uniform_(l.weight, a=1) def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = F.relu_(out) out = self.conv2(out) out = self.bn2(out) out = F.relu_(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = F.relu_(out) return out class BaseStem(nn.Module): def __init__(self, cfg, norm_func): super(BaseStem, self).__init__() out_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS self.conv1 = Conv2d( 3, out_channels, kernel_size=7, stride=2, padding=3, bias=False ) self.bn1 = norm_func(out_channels) for l in [self.conv1,]: nn.init.kaiming_uniform_(l.weight, a=1) def forward(self, x): x = self.conv1(x) x = self.bn1(x) x = F.relu_(x) x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) return x class BottleneckWithFixedBatchNorm(Bottleneck): def __init__( self, in_channels, bottleneck_channels, out_channels, num_groups=1, stride_in_1x1=True, stride=1, dilation=1, dcn_config={} ): super(BottleneckWithFixedBatchNorm, self).__init__( in_channels=in_channels, bottleneck_channels=bottleneck_channels, out_channels=out_channels, num_groups=num_groups, stride_in_1x1=stride_in_1x1, stride=stride, dilation=dilation, norm_func=FrozenBatchNorm2d, dcn_config=dcn_config ) class StemWithFixedBatchNorm(BaseStem): def __init__(self, cfg): super(StemWithFixedBatchNorm, self).__init__( cfg, norm_func=FrozenBatchNorm2d ) class BottleneckWithGN(Bottleneck): def __init__( self, in_channels, bottleneck_channels, out_channels, num_groups=1, stride_in_1x1=True, stride=1, dilation=1, dcn_config={} ): super(BottleneckWithGN, self).__init__( in_channels=in_channels, bottleneck_channels=bottleneck_channels, out_channels=out_channels, num_groups=num_groups, stride_in_1x1=stride_in_1x1, stride=stride, dilation=dilation, norm_func=group_norm, dcn_config=dcn_config ) class StemWithGN(BaseStem): def __init__(self, cfg): super(StemWithGN, self).__init__(cfg, norm_func=group_norm) _TRANSFORMATION_MODULES = Registry({ "BottleneckWithFixedBatchNorm": BottleneckWithFixedBatchNorm, "BottleneckWithGN": BottleneckWithGN, }) _STEM_MODULES = Registry({ "StemWithFixedBatchNorm": StemWithFixedBatchNorm, "StemWithGN": StemWithGN, }) _STAGE_SPECS = Registry({ "R-50-C4": ResNet50StagesTo4, "R-50-C5": ResNet50StagesTo5, "R-101-C4": ResNet101StagesTo4, "R-101-C5": ResNet101StagesTo5, "R-50-FPN": ResNet50FPNStagesTo5, "R-50-FPN-RETINANET": ResNet50FPNStagesTo5, "R-101-FPN": ResNet101FPNStagesTo5, "R-101-FPN-RETINANET": ResNet101FPNStagesTo5, "R-152-FPN": ResNet152FPNStagesTo5, }) ================================================ FILE: maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch class BalancedPositiveNegativeSampler(object): """ This class samples batches, ensuring that they contain a fixed proportion of positives """ def __init__(self, batch_size_per_image, positive_fraction): """ Arguments: batch_size_per_image (int): number of elements to be selected per image positive_fraction (float): percentage of positive elements per batch """ self.batch_size_per_image = batch_size_per_image self.positive_fraction = positive_fraction def __call__(self, matched_idxs): """ Arguments: matched idxs: list of tensors containing -1, 0 or positive values. Each tensor corresponds to a specific image. -1 values are ignored, 0 are considered as negatives and > 0 as positives. Returns: pos_idx (list[tensor]) neg_idx (list[tensor]) Returns two lists of binary masks for each image. The first list contains the positive elements that were selected, and the second list the negative example. """ pos_idx = [] neg_idx = [] for matched_idxs_per_image in matched_idxs: positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1) negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1) num_pos = int(self.batch_size_per_image * self.positive_fraction) # protect against not enough positive examples num_pos = min(positive.numel(), num_pos) num_neg = self.batch_size_per_image - num_pos # protect against not enough negative examples num_neg = min(negative.numel(), num_neg) # randomly select positive and negative examples perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] pos_idx_per_image = positive[perm1] neg_idx_per_image = negative[perm2] # create binary mask from indices pos_idx_per_image_mask = torch.zeros_like( matched_idxs_per_image, dtype=torch.bool ) neg_idx_per_image_mask = torch.zeros_like( matched_idxs_per_image, dtype=torch.bool ) pos_idx_per_image_mask[pos_idx_per_image] = 1 neg_idx_per_image_mask[neg_idx_per_image] = 1 pos_idx.append(pos_idx_per_image_mask) neg_idx.append(neg_idx_per_image_mask) return pos_idx, neg_idx ================================================ FILE: maskrcnn_benchmark/modeling/box_coder.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import math import torch class BoxCoder(object): """ This class encodes and decodes a set of bounding boxes into the representation used for training the regressors. """ def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)): """ Arguments: weights (4-element tuple) bbox_xform_clip (float) """ self.weights = weights self.bbox_xform_clip = bbox_xform_clip def encode(self, reference_boxes, proposals): """ Encode a set of proposals with respect to some reference boxes Arguments: reference_boxes (Tensor): reference boxes proposals (Tensor): boxes to be encoded """ TO_REMOVE = 1 # TODO remove ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights wx, wy, ww, wh = self.weights targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights targets_dw = ww * torch.log(gt_widths / ex_widths) targets_dh = wh * torch.log(gt_heights / ex_heights) targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) return targets def decode(self, rel_codes, boxes): """ From a set of original boxes and encoded relative box offsets, get the decoded boxes. Arguments: rel_codes (Tensor): encoded boxes boxes (Tensor): reference boxes. """ boxes = boxes.to(rel_codes.dtype) TO_REMOVE = 1 # TODO remove widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE ctr_x = boxes[:, 0] + 0.5 * widths ctr_y = boxes[:, 1] + 0.5 * heights wx, wy, ww, wh = self.weights dx = rel_codes[:, 0::4] / wx dy = rel_codes[:, 1::4] / wy dw = rel_codes[:, 2::4] / ww dh = rel_codes[:, 3::4] / wh # Prevent sending too large values into torch.exp() dw = torch.clamp(dw, max=self.bbox_xform_clip) dh = torch.clamp(dh, max=self.bbox_xform_clip) pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] pred_w = torch.exp(dw) * widths[:, None] pred_h = torch.exp(dh) * heights[:, None] pred_boxes = torch.zeros_like(rel_codes) # x1 pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # y1 pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 return pred_boxes ================================================ FILE: maskrcnn_benchmark/modeling/detector/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .detectors import build_detection_model ================================================ FILE: maskrcnn_benchmark/modeling/detector/detectors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .generalized_rcnn import GeneralizedRCNN _DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN} def build_detection_model(cfg): meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE] return meta_arch(cfg) ================================================ FILE: maskrcnn_benchmark/modeling/detector/generalized_rcnn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ Implements the Generalized R-CNN framework """ import torch from torch import nn from maskrcnn_benchmark.structures.image_list import to_image_list from ..backbone import build_backbone from ..rpn.rpn import build_rpn from ..roi_heads.roi_heads import build_roi_heads class GeneralizedRCNN(nn.Module): """ Main class for Generalized R-CNN. Currently supports boxes and masks. It consists of three main parts: - backbone - rpn - heads: takes the features + the proposals from the RPN and computes detections / masks from it. """ def __init__(self, cfg): super(GeneralizedRCNN, self).__init__() self.backbone = build_backbone(cfg) self.rpn = build_rpn(cfg, self.backbone.out_channels) self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels) def forward(self, images, targets=None): """ Arguments: images (list[Tensor] or ImageList): images to be processed targets (list[BoxList]): ground-truth boxes present in the image (optional) Returns: result (list[BoxList] or dict[Tensor]): the output from the model. During training, it returns a dict[Tensor] which contains the losses. During testing, it returns list[BoxList] contains additional fields like `scores`, `labels` and `mask` (for Mask R-CNN models). """ if self.training and targets is None: raise ValueError("In training mode, targets should be passed") images = to_image_list(images) features = self.backbone(images.tensors) proposals, proposal_losses = self.rpn(images, features, targets) if self.roi_heads: x, result, detector_losses = self.roi_heads(features, proposals, targets) else: # RPN-only models don't have roi_heads x = features result = proposals detector_losses = {} if self.training: losses = {} losses.update(detector_losses) losses.update(proposal_losses) return losses return result ================================================ FILE: maskrcnn_benchmark/modeling/make_layers.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ Miscellaneous utility functions """ import torch from torch import nn from torch.nn import functional as F from maskrcnn_benchmark.config import cfg from maskrcnn_benchmark.layers import Conv2d from maskrcnn_benchmark.modeling.poolers import Pooler def get_group_gn(dim, dim_per_gp, num_groups): """get number of groups used by GroupNorm, based on number of channels.""" assert dim_per_gp == -1 or num_groups == -1, \ "GroupNorm: can only specify G or C/G." if dim_per_gp > 0: assert dim % dim_per_gp == 0, \ "dim: {}, dim_per_gp: {}".format(dim, dim_per_gp) group_gn = dim // dim_per_gp else: assert dim % num_groups == 0, \ "dim: {}, num_groups: {}".format(dim, num_groups) group_gn = num_groups return group_gn def group_norm(out_channels, affine=True, divisor=1): out_channels = out_channels // divisor dim_per_gp = cfg.MODEL.GROUP_NORM.DIM_PER_GP // divisor num_groups = cfg.MODEL.GROUP_NORM.NUM_GROUPS // divisor eps = cfg.MODEL.GROUP_NORM.EPSILON # default: 1e-5 return torch.nn.GroupNorm( get_group_gn(out_channels, dim_per_gp, num_groups), out_channels, eps, affine ) def make_conv3x3( in_channels, out_channels, dilation=1, stride=1, use_gn=False, use_relu=False, kaiming_init=True ): conv = Conv2d( in_channels, out_channels, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, bias=False if use_gn else True ) if kaiming_init: nn.init.kaiming_normal_( conv.weight, mode="fan_out", nonlinearity="relu" ) else: torch.nn.init.normal_(conv.weight, std=0.01) if not use_gn: nn.init.constant_(conv.bias, 0) module = [conv,] if use_gn: module.append(group_norm(out_channels)) if use_relu: module.append(nn.ReLU(inplace=True)) if len(module) > 1: return nn.Sequential(*module) return conv def make_fc(dim_in, hidden_dim, use_gn=False): ''' Caffe2 implementation uses XavierFill, which in fact corresponds to kaiming_uniform_ in PyTorch ''' if use_gn: fc = nn.Linear(dim_in, hidden_dim, bias=False) nn.init.kaiming_uniform_(fc.weight, a=1) return nn.Sequential(fc, group_norm(hidden_dim)) fc = nn.Linear(dim_in, hidden_dim) nn.init.kaiming_uniform_(fc.weight, a=1) nn.init.constant_(fc.bias, 0) return fc def conv_with_kaiming_uniform(use_gn=False, use_relu=False): def make_conv( in_channels, out_channels, kernel_size, stride=1, dilation=1 ): conv = Conv2d( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=dilation * (kernel_size - 1) // 2, dilation=dilation, bias=False if use_gn else True ) # Caffe2 implementation uses XavierFill, which in fact # corresponds to kaiming_uniform_ in PyTorch nn.init.kaiming_uniform_(conv.weight, a=1) if not use_gn: nn.init.constant_(conv.bias, 0) module = [conv,] if use_gn: module.append(group_norm(out_channels)) if use_relu: module.append(nn.ReLU(inplace=True)) if len(module) > 1: return nn.Sequential(*module) return conv return make_conv ================================================ FILE: maskrcnn_benchmark/modeling/matcher.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch class Matcher(object): """ This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will have exactly zero or one matches; each ground-truth element may be assigned to zero or more predicted elements. Matching is based on the MxN match_quality_matrix, that characterizes how well each (ground-truth, predicted)-pair match. For example, if the elements are boxes, the matrix may contain box IoU overlap values. The matcher returns a tensor of size N containing the index of the ground-truth element m that matches to prediction n. If there is no match, a negative value is returned. """ BELOW_LOW_THRESHOLD = -1 BETWEEN_THRESHOLDS = -2 def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False): """ Args: high_threshold (float): quality values greater than or equal to this value are candidate matches. low_threshold (float): a lower quality threshold used to stratify matches into three levels: 1) matches >= high_threshold 2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold) 3) BELOW_LOW_THRESHOLD matches in [0, low_threshold) allow_low_quality_matches (bool): if True, produce additional matches for predictions that have only low-quality match candidates. See set_low_quality_matches_ for more details. """ assert low_threshold <= high_threshold self.high_threshold = high_threshold self.low_threshold = low_threshold self.allow_low_quality_matches = allow_low_quality_matches def __call__(self, match_quality_matrix): """ Args: match_quality_matrix (Tensor[float]): an MxN tensor, containing the pairwise quality between M ground-truth elements and N predicted elements. Returns: matches (Tensor[int64]): an N tensor where N[i] is a matched gt in [0, M - 1] or a negative value indicating that prediction i could not be matched. """ if match_quality_matrix.numel() == 0: # empty targets or proposals not supported during training if match_quality_matrix.shape[0] == 0: raise ValueError( "No ground-truth boxes available for one of the images " "during training") else: raise ValueError( "No proposal boxes available for one of the images " "during training") # match_quality_matrix is M (gt) x N (predicted) # Max over gt elements (dim 0) to find best gt candidate for each prediction matched_vals, matches = match_quality_matrix.max(dim=0) if self.allow_low_quality_matches: all_matches = matches.clone() # Assign candidate matches with low quality to negative (unassigned) values below_low_threshold = matched_vals < self.low_threshold between_thresholds = (matched_vals >= self.low_threshold) & ( matched_vals < self.high_threshold ) matches[below_low_threshold] = Matcher.BELOW_LOW_THRESHOLD matches[between_thresholds] = Matcher.BETWEEN_THRESHOLDS if self.allow_low_quality_matches: self.set_low_quality_matches_(matches, all_matches, match_quality_matrix) return matches def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix): """ Produce additional matches for predictions that have only low-quality matches. Specifically, for each ground-truth find the set of predictions that have maximum overlap with it (including ties); for each prediction in that set, if it is unmatched, then match it to the ground-truth with which it has the highest quality value. """ # For each gt, find the prediction with which it has highest quality highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) # Find highest quality match available, even if it is low, including ties gt_pred_pairs_of_highest_quality = torch.nonzero( match_quality_matrix == highest_quality_foreach_gt[:, None] ) # Example gt_pred_pairs_of_highest_quality: # tensor([[ 0, 39796], # [ 1, 32055], # [ 1, 32070], # [ 2, 39190], # [ 2, 40255], # [ 3, 40390], # [ 3, 41455], # [ 4, 45470], # [ 5, 45325], # [ 5, 46390]]) # Each row is a (gt index, prediction index) # Note how gt items 1, 2, 3, and 5 each have two ties pred_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1] matches[pred_inds_to_update] = all_matches[pred_inds_to_update] ================================================ FILE: maskrcnn_benchmark/modeling/poolers.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch import torch.nn.functional as F from torch import nn from maskrcnn_benchmark.layers import ROIAlign from .utils import cat class LevelMapper(object): """Determine which FPN level each RoI in a set of RoIs should map to based on the heuristic in the FPN paper. """ def __init__(self, k_min, k_max, canonical_scale=224, canonical_level=4, eps=1e-6): """ Arguments: k_min (int) k_max (int) canonical_scale (int) canonical_level (int) eps (float) """ self.k_min = k_min self.k_max = k_max self.s0 = canonical_scale self.lvl0 = canonical_level self.eps = eps def __call__(self, boxlists): """ Arguments: boxlists (list[BoxList]) """ # Compute level ids s = torch.sqrt(cat([boxlist.area() for boxlist in boxlists])) # Eqn.(1) in FPN paper target_lvls = torch.floor(self.lvl0 + torch.log2(s / self.s0 + self.eps)) target_lvls = torch.clamp(target_lvls, min=self.k_min, max=self.k_max) return target_lvls.to(torch.int64) - self.k_min class Pooler(nn.Module): """ Pooler for Detection with or without FPN. It currently hard-code ROIAlign in the implementation, but that can be made more generic later on. Also, the requirement of passing the scales is not strictly necessary, as they can be inferred from the size of the feature map / size of original image, which is available thanks to the BoxList. """ def __init__(self, output_size, scales, sampling_ratio): """ Arguments: output_size (list[tuple[int]] or list[int]): output size for the pooled region scales (list[float]): scales for each Pooler sampling_ratio (int): sampling ratio for ROIAlign """ super(Pooler, self).__init__() poolers = [] for scale in scales: poolers.append( ROIAlign( output_size, spatial_scale=scale, sampling_ratio=sampling_ratio ) ) self.poolers = nn.ModuleList(poolers) self.output_size = output_size # get the levels in the feature map by leveraging the fact that the network always # downsamples by a factor of 2 at each level. lvl_min = -torch.log2(torch.tensor(scales[0], dtype=torch.float32)).item() lvl_max = -torch.log2(torch.tensor(scales[-1], dtype=torch.float32)).item() self.map_levels = LevelMapper(lvl_min, lvl_max) def convert_to_roi_format(self, boxes): concat_boxes = cat([b.bbox for b in boxes], dim=0) device, dtype = concat_boxes.device, concat_boxes.dtype ids = cat( [ torch.full((len(b), 1), i, dtype=dtype, device=device) for i, b in enumerate(boxes) ], dim=0, ) rois = torch.cat([ids, concat_boxes], dim=1) return rois def forward(self, x, boxes): """ Arguments: x (list[Tensor]): feature maps for each level boxes (list[BoxList]): boxes to be used to perform the pooling operation. Returns: result (Tensor) """ num_levels = len(self.poolers) rois = self.convert_to_roi_format(boxes) if num_levels == 1: return self.poolers[0](x[0], rois) levels = self.map_levels(boxes) num_rois = len(rois) num_channels = x[0].shape[1] output_size = self.output_size[0] dtype, device = x[0].dtype, x[0].device result = torch.zeros( (num_rois, num_channels, output_size, output_size), dtype=dtype, device=device, ) for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)): idx_in_level = torch.nonzero(levels == level).squeeze(1) rois_per_level = rois[idx_in_level] result[idx_in_level] = pooler(per_level_feature, rois_per_level).to(dtype) return result def make_pooler(cfg, head_name): resolution = cfg.MODEL[head_name].POOLER_RESOLUTION scales = cfg.MODEL[head_name].POOLER_SCALES sampling_ratio = cfg.MODEL[head_name].POOLER_SAMPLING_RATIO pooler = Pooler( output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, ) return pooler ================================================ FILE: maskrcnn_benchmark/modeling/registry.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from maskrcnn_benchmark.utils.registry import Registry BACKBONES = Registry() RPN_HEADS = Registry() ROI_BOX_FEATURE_EXTRACTORS = Registry() ROI_BOX_PREDICTOR = Registry() ROI_KEYPOINT_FEATURE_EXTRACTORS = Registry() ROI_KEYPOINT_PREDICTOR = Registry() ROI_MASK_FEATURE_EXTRACTORS = Registry() ROI_MASK_PREDICTOR = Registry() ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from torch import nn from .roi_box_feature_extractors import make_roi_box_feature_extractor from .roi_box_predictors import make_roi_box_predictor from .inference import make_roi_box_post_processor from .loss import make_roi_box_loss_evaluator class ROIBoxHead(torch.nn.Module): """ Generic Box Head class. """ def __init__(self, cfg, in_channels): super(ROIBoxHead, self).__init__() self.feature_extractor = make_roi_box_feature_extractor(cfg, in_channels) self.predictor = make_roi_box_predictor( cfg, self.feature_extractor.out_channels) self.post_processor = make_roi_box_post_processor(cfg) self.loss_evaluator = make_roi_box_loss_evaluator(cfg) def forward(self, features, proposals, targets=None): """ Arguments: features (list[Tensor]): feature-maps from possibly several levels proposals (list[BoxList]): proposal boxes targets (list[BoxList], optional): the ground-truth targets. Returns: x (Tensor): the result of the feature extractor proposals (list[BoxList]): during training, the subsampled proposals are returned. During testing, the predicted boxlists are returned losses (dict[Tensor]): During training, returns the losses for the head. During testing, returns an empty dict. """ if self.training: # Faster R-CNN subsamples during training the proposals with a fixed # positive / negative ratio with torch.no_grad(): proposals = self.loss_evaluator.subsample(proposals, targets) # extract features that will be fed to the final classifier. The # feature_extractor generally corresponds to the pooler + heads x = self.feature_extractor(features, proposals) # final classifier that converts the features into predictions class_logits, box_regression = self.predictor(x) if not self.training: result = self.post_processor((class_logits, box_regression), proposals) return x, result, {} loss_classifier, loss_box_reg = self.loss_evaluator( [class_logits], [box_regression] ) return ( x, proposals, dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg), ) def build_roi_box_head(cfg, in_channels): """ Constructs a new box head. By default, uses ROIBoxHead, but if it turns out not to be enough, just register a new class and make it a parameter in the config """ return ROIBoxHead(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch import torch.nn.functional as F from torch import nn from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist from maskrcnn_benchmark.modeling.box_coder import BoxCoder class PostProcessor(nn.Module): """ From a set of classification scores, box regression and proposals, computes the post-processed boxes, and applies NMS to obtain the final results """ def __init__( self, score_thresh=0.05, nms=0.5, detections_per_img=100, box_coder=None, cls_agnostic_bbox_reg=False, bbox_aug_enabled=False ): """ Arguments: score_thresh (float) nms (float) detections_per_img (int) box_coder (BoxCoder) """ super(PostProcessor, self).__init__() self.score_thresh = score_thresh self.nms = nms self.detections_per_img = detections_per_img if box_coder is None: box_coder = BoxCoder(weights=(10., 10., 5., 5.)) self.box_coder = box_coder self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg self.bbox_aug_enabled = bbox_aug_enabled def forward(self, x, boxes): """ Arguments: x (tuple[tensor, tensor]): x contains the class logits and the box_regression from the model. boxes (list[BoxList]): bounding boxes that are used as reference, one for each image Returns: results (list[BoxList]): one BoxList for each image, containing the extra fields labels and scores """ class_logits, box_regression = x class_prob = F.softmax(class_logits, -1) # TODO think about a representation of batch of boxes image_shapes = [box.size for box in boxes] boxes_per_image = [len(box) for box in boxes] concat_boxes = torch.cat([a.bbox for a in boxes], dim=0) if self.cls_agnostic_bbox_reg: box_regression = box_regression[:, -4:] proposals = self.box_coder.decode( box_regression.view(sum(boxes_per_image), -1), concat_boxes ) if self.cls_agnostic_bbox_reg: proposals = proposals.repeat(1, class_prob.shape[1]) num_classes = class_prob.shape[1] proposals = proposals.split(boxes_per_image, dim=0) class_prob = class_prob.split(boxes_per_image, dim=0) results = [] for prob, boxes_per_img, image_shape in zip( class_prob, proposals, image_shapes ): boxlist = self.prepare_boxlist(boxes_per_img, prob, image_shape) boxlist = boxlist.clip_to_image(remove_empty=False) if not self.bbox_aug_enabled: # If bbox aug is enabled, we will do it later boxlist = self.filter_results(boxlist, num_classes) results.append(boxlist) return results def prepare_boxlist(self, boxes, scores, image_shape): """ Returns BoxList from `boxes` and adds probability scores information as an extra field `boxes` has shape (#detections, 4 * #classes), where each row represents a list of predicted bounding boxes for each of the object classes in the dataset (including the background class). The detections in each row originate from the same object proposal. `scores` has shape (#detection, #classes), where each row represents a list of object detection confidence scores for each of the object classes in the dataset (including the background class). `scores[i, j]`` corresponds to the box at `boxes[i, j * 4:(j + 1) * 4]`. """ boxes = boxes.reshape(-1, 4) scores = scores.reshape(-1) boxlist = BoxList(boxes, image_shape, mode="xyxy") boxlist.add_field("scores", scores) return boxlist def filter_results(self, boxlist, num_classes): """Returns bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). """ # unwrap the boxlist to avoid additional overhead. # if we had multi-class NMS, we could perform this directly on the boxlist boxes = boxlist.bbox.reshape(-1, num_classes * 4) scores = boxlist.get_field("scores").reshape(-1, num_classes) device = scores.device result = [] # Apply threshold on detection probabilities and apply NMS # Skip j = 0, because it's the background class inds_all = scores > self.score_thresh for j in range(1, num_classes): inds = inds_all[:, j].nonzero().squeeze(1) scores_j = scores[inds, j] boxes_j = boxes[inds, j * 4 : (j + 1) * 4] boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") boxlist_for_class.add_field("scores", scores_j) boxlist_for_class = boxlist_nms( boxlist_for_class, self.nms ) num_labels = len(boxlist_for_class) boxlist_for_class.add_field( "labels", torch.full((num_labels,), j, dtype=torch.int64, device=device) ) result.append(boxlist_for_class) result = cat_boxlist(result) number_of_detections = len(result) # Limit to max_per_image detections **over all classes** if number_of_detections > self.detections_per_img > 0: cls_scores = result.get_field("scores") image_thresh, _ = torch.kthvalue( cls_scores.cpu(), number_of_detections - self.detections_per_img + 1 ) keep = cls_scores >= image_thresh.item() keep = torch.nonzero(keep).squeeze(1) result = result[keep] return result def make_roi_box_post_processor(cfg): use_fpn = cfg.MODEL.ROI_HEADS.USE_FPN bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS box_coder = BoxCoder(weights=bbox_reg_weights) score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH nms_thresh = cfg.MODEL.ROI_HEADS.NMS detections_per_img = cfg.MODEL.ROI_HEADS.DETECTIONS_PER_IMG cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG bbox_aug_enabled = cfg.TEST.BBOX_AUG.ENABLED postprocessor = PostProcessor( score_thresh, nms_thresh, detections_per_img, box_coder, cls_agnostic_bbox_reg, bbox_aug_enabled ) return postprocessor ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from torch.nn import functional as F from maskrcnn_benchmark.layers import smooth_l1_loss from maskrcnn_benchmark.modeling.box_coder import BoxCoder from maskrcnn_benchmark.modeling.matcher import Matcher from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import ( BalancedPositiveNegativeSampler ) from maskrcnn_benchmark.modeling.utils import cat class FastRCNNLossComputation(object): """ Computes the loss for Faster R-CNN. Also supports FPN """ def __init__( self, proposal_matcher, fg_bg_sampler, box_coder, cls_agnostic_bbox_reg=False ): """ Arguments: proposal_matcher (Matcher) fg_bg_sampler (BalancedPositiveNegativeSampler) box_coder (BoxCoder) """ self.proposal_matcher = proposal_matcher self.fg_bg_sampler = fg_bg_sampler self.box_coder = box_coder self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg def match_targets_to_proposals(self, proposal, target): match_quality_matrix = boxlist_iou(target, proposal) matched_idxs = self.proposal_matcher(match_quality_matrix) # Fast RCNN only need "labels" field for selecting the targets target = target.copy_with_fields("labels") # get the targets corresponding GT for each proposal # NB: need to clamp the indices because we can have a single # GT in the image, and matched_idxs can be -2, which goes # out of bounds matched_targets = target[matched_idxs.clamp(min=0)] matched_targets.add_field("matched_idxs", matched_idxs) return matched_targets def prepare_targets(self, proposals, targets): labels = [] regression_targets = [] for proposals_per_image, targets_per_image in zip(proposals, targets): matched_targets = self.match_targets_to_proposals( proposals_per_image, targets_per_image ) matched_idxs = matched_targets.get_field("matched_idxs") labels_per_image = matched_targets.get_field("labels") labels_per_image = labels_per_image.to(dtype=torch.int64) # Label background (below the low threshold) bg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD labels_per_image[bg_inds] = 0 # Label ignore proposals (between low and high thresholds) ignore_inds = matched_idxs == Matcher.BETWEEN_THRESHOLDS labels_per_image[ignore_inds] = -1 # -1 is ignored by sampler # compute regression targets regression_targets_per_image = self.box_coder.encode( matched_targets.bbox, proposals_per_image.bbox ) labels.append(labels_per_image) regression_targets.append(regression_targets_per_image) return labels, regression_targets def subsample(self, proposals, targets): """ This method performs the positive/negative sampling, and return the sampled proposals. Note: this function keeps a state. Arguments: proposals (list[BoxList]) targets (list[BoxList]) """ labels, regression_targets = self.prepare_targets(proposals, targets) sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) proposals = list(proposals) # add corresponding label and regression_targets information to the bounding boxes for labels_per_image, regression_targets_per_image, proposals_per_image in zip( labels, regression_targets, proposals ): proposals_per_image.add_field("labels", labels_per_image) proposals_per_image.add_field( "regression_targets", regression_targets_per_image ) # distributed sampled proposals, that were obtained on all feature maps # concatenated via the fg_bg_sampler, into individual feature map levels for img_idx, (pos_inds_img, neg_inds_img) in enumerate( zip(sampled_pos_inds, sampled_neg_inds) ): img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1) proposals_per_image = proposals[img_idx][img_sampled_inds] proposals[img_idx] = proposals_per_image self._proposals = proposals return proposals def __call__(self, class_logits, box_regression): """ Computes the loss for Faster R-CNN. This requires that the subsample method has been called beforehand. Arguments: class_logits (list[Tensor]) box_regression (list[Tensor]) Returns: classification_loss (Tensor) box_loss (Tensor) """ class_logits = cat(class_logits, dim=0) box_regression = cat(box_regression, dim=0) device = class_logits.device if not hasattr(self, "_proposals"): raise RuntimeError("subsample needs to be called before") proposals = self._proposals labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) regression_targets = cat( [proposal.get_field("regression_targets") for proposal in proposals], dim=0 ) classification_loss = F.cross_entropy(class_logits, labels) # get indices that correspond to the regression targets for # the corresponding ground truth labels, to be used with # advanced indexing sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1) labels_pos = labels[sampled_pos_inds_subset] if self.cls_agnostic_bbox_reg: map_inds = torch.tensor([4, 5, 6, 7], device=device) else: map_inds = 4 * labels_pos[:, None] + torch.tensor( [0, 1, 2, 3], device=device) box_loss = smooth_l1_loss( box_regression[sampled_pos_inds_subset[:, None], map_inds], regression_targets[sampled_pos_inds_subset], size_average=False, beta=1, ) box_loss = box_loss / labels.numel() return classification_loss, box_loss def make_roi_box_loss_evaluator(cfg): matcher = Matcher( cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, allow_low_quality_matches=False, ) bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS box_coder = BoxCoder(weights=bbox_reg_weights) fg_bg_sampler = BalancedPositiveNegativeSampler( cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION ) cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG loss_evaluator = FastRCNNLossComputation( matcher, fg_bg_sampler, box_coder, cls_agnostic_bbox_reg ) return loss_evaluator ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from torch import nn from torch.nn import functional as F from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.modeling.backbone import resnet from maskrcnn_benchmark.modeling.poolers import Pooler from maskrcnn_benchmark.modeling.make_layers import group_norm from maskrcnn_benchmark.modeling.make_layers import make_fc @registry.ROI_BOX_FEATURE_EXTRACTORS.register("ResNet50Conv5ROIFeatureExtractor") class ResNet50Conv5ROIFeatureExtractor(nn.Module): def __init__(self, config, in_channels): super(ResNet50Conv5ROIFeatureExtractor, self).__init__() resolution = config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION scales = config.MODEL.ROI_BOX_HEAD.POOLER_SCALES sampling_ratio = config.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO pooler = Pooler( output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, ) stage = resnet.StageSpec(index=4, block_count=3, return_features=False) head = resnet.ResNetHead( block_module=config.MODEL.RESNETS.TRANS_FUNC, stages=(stage,), num_groups=config.MODEL.RESNETS.NUM_GROUPS, width_per_group=config.MODEL.RESNETS.WIDTH_PER_GROUP, stride_in_1x1=config.MODEL.RESNETS.STRIDE_IN_1X1, stride_init=None, res2_out_channels=config.MODEL.RESNETS.RES2_OUT_CHANNELS, dilation=config.MODEL.RESNETS.RES5_DILATION ) self.pooler = pooler self.head = head self.out_channels = head.out_channels def forward(self, x, proposals): x = self.pooler(x, proposals) x = self.head(x) return x @registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPN2MLPFeatureExtractor") class FPN2MLPFeatureExtractor(nn.Module): """ Heads for FPN for classification """ def __init__(self, cfg, in_channels): super(FPN2MLPFeatureExtractor, self).__init__() resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO pooler = Pooler( output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, ) input_size = in_channels * resolution ** 2 representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN self.pooler = pooler self.fc6 = make_fc(input_size, representation_size, use_gn) self.fc7 = make_fc(representation_size, representation_size, use_gn) self.out_channels = representation_size def forward(self, x, proposals): x = self.pooler(x, proposals) x = x.view(x.size(0), -1) x = F.relu(self.fc6(x)) x = F.relu(self.fc7(x)) return x @registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPNXconv1fcFeatureExtractor") class FPNXconv1fcFeatureExtractor(nn.Module): """ Heads for FPN for classification """ def __init__(self, cfg, in_channels): super(FPNXconv1fcFeatureExtractor, self).__init__() resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO pooler = Pooler( output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, ) self.pooler = pooler use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN conv_head_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM num_stacked_convs = cfg.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS dilation = cfg.MODEL.ROI_BOX_HEAD.DILATION xconvs = [] for ix in range(num_stacked_convs): xconvs.append( nn.Conv2d( in_channels, conv_head_dim, kernel_size=3, stride=1, padding=dilation, dilation=dilation, bias=False if use_gn else True ) ) in_channels = conv_head_dim if use_gn: xconvs.append(group_norm(in_channels)) xconvs.append(nn.ReLU(inplace=True)) self.add_module("xconvs", nn.Sequential(*xconvs)) for modules in [self.xconvs,]: for l in modules.modules(): if isinstance(l, nn.Conv2d): torch.nn.init.normal_(l.weight, std=0.01) if not use_gn: torch.nn.init.constant_(l.bias, 0) input_size = conv_head_dim * resolution ** 2 representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM self.fc6 = make_fc(input_size, representation_size, use_gn=False) self.out_channels = representation_size def forward(self, x, proposals): x = self.pooler(x, proposals) x = self.xconvs(x) x = x.view(x.size(0), -1) x = F.relu(self.fc6(x)) return x def make_roi_box_feature_extractor(cfg, in_channels): func = registry.ROI_BOX_FEATURE_EXTRACTORS[ cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR ] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from maskrcnn_benchmark.modeling import registry from torch import nn @registry.ROI_BOX_PREDICTOR.register("FastRCNNPredictor") class FastRCNNPredictor(nn.Module): def __init__(self, config, in_channels): super(FastRCNNPredictor, self).__init__() assert in_channels is not None num_inputs = in_channels num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES self.avgpool = nn.AdaptiveAvgPool2d(1) self.cls_score = nn.Linear(num_inputs, num_classes) num_bbox_reg_classes = 2 if config.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes self.bbox_pred = nn.Linear(num_inputs, num_bbox_reg_classes * 4) nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) nn.init.constant_(self.cls_score.bias, 0) nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001) nn.init.constant_(self.bbox_pred.bias, 0) def forward(self, x): x = self.avgpool(x) x = x.view(x.size(0), -1) cls_logit = self.cls_score(x) bbox_pred = self.bbox_pred(x) return cls_logit, bbox_pred @registry.ROI_BOX_PREDICTOR.register("FPNPredictor") class FPNPredictor(nn.Module): def __init__(self, cfg, in_channels): super(FPNPredictor, self).__init__() num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES representation_size = in_channels self.cls_score = nn.Linear(representation_size, num_classes) num_bbox_reg_classes = 2 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes self.bbox_pred = nn.Linear(representation_size, num_bbox_reg_classes * 4) nn.init.normal_(self.cls_score.weight, std=0.01) nn.init.normal_(self.bbox_pred.weight, std=0.001) for l in [self.cls_score, self.bbox_pred]: nn.init.constant_(l.bias, 0) def forward(self, x): if x.ndimension() == 4: assert list(x.shape[2:]) == [1, 1] x = x.view(x.size(0), -1) scores = self.cls_score(x) bbox_deltas = self.bbox_pred(x) return scores, bbox_deltas def make_roi_box_predictor(cfg, in_channels): func = registry.ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/inference.py ================================================ import torch from torch import nn class KeypointPostProcessor(nn.Module): def __init__(self, keypointer=None): super(KeypointPostProcessor, self).__init__() self.keypointer = keypointer def forward(self, x, boxes): mask_prob = x scores = None if self.keypointer: mask_prob, scores = self.keypointer(x, boxes) assert len(boxes) == 1, "Only non-batched inference supported for now" boxes_per_image = [box.bbox.size(0) for box in boxes] mask_prob = mask_prob.split(boxes_per_image, dim=0) scores = scores.split(boxes_per_image, dim=0) results = [] for prob, box, score in zip(mask_prob, boxes, scores): bbox = BoxList(box.bbox, box.size, mode="xyxy") for field in box.fields(): bbox.add_field(field, box.get_field(field)) prob = PersonKeypoints(prob, box.size) prob.add_field("logits", score) bbox.add_field("keypoints", prob) results.append(bbox) return results # TODO remove and use only the Keypointer import numpy as np import cv2 def heatmaps_to_keypoints(maps, rois): """Extract predicted keypoint locations from heatmaps. Output has shape (#rois, 4, #keypoints) with the 4 rows corresponding to (x, y, logit, prob) for each keypoint. """ # This function converts a discrete image coordinate in a HEATMAP_SIZE x # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain # consistency with keypoints_to_heatmap_labels by using the conversion from # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a # continuous coordinate. offset_x = rois[:, 0] offset_y = rois[:, 1] widths = rois[:, 2] - rois[:, 0] heights = rois[:, 3] - rois[:, 1] widths = np.maximum(widths, 1) heights = np.maximum(heights, 1) widths_ceil = np.ceil(widths) heights_ceil = np.ceil(heights) # NCHW to NHWC for use with OpenCV maps = np.transpose(maps, [0, 2, 3, 1]) min_size = 0 # cfg.KRCNN.INFERENCE_MIN_SIZE num_keypoints = maps.shape[3] xy_preds = np.zeros((len(rois), 3, num_keypoints), dtype=np.float32) end_scores = np.zeros((len(rois), num_keypoints), dtype=np.float32) for i in range(len(rois)): if min_size > 0: roi_map_width = int(np.maximum(widths_ceil[i], min_size)) roi_map_height = int(np.maximum(heights_ceil[i], min_size)) else: roi_map_width = widths_ceil[i] roi_map_height = heights_ceil[i] width_correction = widths[i] / roi_map_width height_correction = heights[i] / roi_map_height roi_map = cv2.resize( maps[i], (roi_map_width, roi_map_height), interpolation=cv2.INTER_CUBIC ) # Bring back to CHW roi_map = np.transpose(roi_map, [2, 0, 1]) # roi_map_probs = scores_to_probs(roi_map.copy()) w = roi_map.shape[2] pos = roi_map.reshape(num_keypoints, -1).argmax(axis=1) x_int = pos % w y_int = (pos - x_int) // w # assert (roi_map_probs[k, y_int, x_int] == # roi_map_probs[k, :, :].max()) x = (x_int + 0.5) * width_correction y = (y_int + 0.5) * height_correction xy_preds[i, 0, :] = x + offset_x[i] xy_preds[i, 1, :] = y + offset_y[i] xy_preds[i, 2, :] = 1 end_scores[i, :] = roi_map[np.arange(num_keypoints), y_int, x_int] return np.transpose(xy_preds, [0, 2, 1]), end_scores from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.keypoint import PersonKeypoints class Keypointer(object): """ Projects a set of masks in an image on the locations specified by the bounding boxes """ def __init__(self, padding=0): self.padding = padding def __call__(self, masks, boxes): # TODO do this properly if isinstance(boxes, BoxList): boxes = [boxes] assert len(boxes) == 1 result, scores = heatmaps_to_keypoints( masks.detach().cpu().numpy(), boxes[0].bbox.cpu().numpy() ) return torch.from_numpy(result).to(masks.device), torch.as_tensor(scores, device=masks.device) def make_roi_keypoint_post_processor(cfg): keypointer = Keypointer() keypoint_post_processor = KeypointPostProcessor(keypointer) return keypoint_post_processor ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/keypoint_head.py ================================================ import torch from .roi_keypoint_feature_extractors import make_roi_keypoint_feature_extractor from .roi_keypoint_predictors import make_roi_keypoint_predictor from .inference import make_roi_keypoint_post_processor from .loss import make_roi_keypoint_loss_evaluator class ROIKeypointHead(torch.nn.Module): def __init__(self, cfg, in_channels): super(ROIKeypointHead, self).__init__() self.cfg = cfg.clone() self.feature_extractor = make_roi_keypoint_feature_extractor(cfg, in_channels) self.predictor = make_roi_keypoint_predictor( cfg, self.feature_extractor.out_channels) self.post_processor = make_roi_keypoint_post_processor(cfg) self.loss_evaluator = make_roi_keypoint_loss_evaluator(cfg) def forward(self, features, proposals, targets=None): """ Arguments: features (list[Tensor]): feature-maps from possibly several levels proposals (list[BoxList]): proposal boxes targets (list[BoxList], optional): the ground-truth targets. Returns: x (Tensor): the result of the feature extractor proposals (list[BoxList]): during training, the original proposals are returned. During testing, the predicted boxlists are returned with the `mask` field set losses (dict[Tensor]): During training, returns the losses for the head. During testing, returns an empty dict. """ if self.training: with torch.no_grad(): proposals = self.loss_evaluator.subsample(proposals, targets) x = self.feature_extractor(features, proposals) kp_logits = self.predictor(x) if not self.training: result = self.post_processor(kp_logits, proposals) return x, result, {} loss_kp = self.loss_evaluator(proposals, kp_logits) return x, proposals, dict(loss_kp=loss_kp) def build_roi_keypoint_head(cfg, in_channels): return ROIKeypointHead(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/loss.py ================================================ import torch from torch.nn import functional as F from maskrcnn_benchmark.modeling.matcher import Matcher from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import ( BalancedPositiveNegativeSampler, ) from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou from maskrcnn_benchmark.modeling.utils import cat from maskrcnn_benchmark.layers import smooth_l1_loss from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist from maskrcnn_benchmark.structures.keypoint import keypoints_to_heat_map def project_keypoints_to_heatmap(keypoints, proposals, discretization_size): proposals = proposals.convert("xyxy") return keypoints_to_heat_map( keypoints.keypoints, proposals.bbox, discretization_size ) def cat_boxlist_with_keypoints(boxlists): assert all(boxlist.has_field("keypoints") for boxlist in boxlists) kp = [boxlist.get_field("keypoints").keypoints for boxlist in boxlists] kp = cat(kp, 0) fields = boxlists[0].get_fields() fields = [field for field in fields if field != "keypoints"] boxlists = [boxlist.copy_with_fields(fields) for boxlist in boxlists] boxlists = cat_boxlist(boxlists) boxlists.add_field("keypoints", kp) return boxlists def _within_box(points, boxes): """Validate which keypoints are contained inside a given box. points: NxKx2 boxes: Nx4 output: NxK """ x_within = (points[..., 0] >= boxes[:, 0, None]) & ( points[..., 0] <= boxes[:, 2, None] ) y_within = (points[..., 1] >= boxes[:, 1, None]) & ( points[..., 1] <= boxes[:, 3, None] ) return x_within & y_within class KeypointRCNNLossComputation(object): def __init__(self, proposal_matcher, fg_bg_sampler, discretization_size): """ Arguments: proposal_matcher (Matcher) fg_bg_sampler (BalancedPositiveNegativeSampler) discretization_size (int) """ self.proposal_matcher = proposal_matcher self.fg_bg_sampler = fg_bg_sampler self.discretization_size = discretization_size def match_targets_to_proposals(self, proposal, target): match_quality_matrix = boxlist_iou(target, proposal) matched_idxs = self.proposal_matcher(match_quality_matrix) # Keypoint RCNN needs "labels" and "keypoints "fields for creating the targets target = target.copy_with_fields(["labels", "keypoints"]) # get the targets corresponding GT for each proposal # NB: need to clamp the indices because we can have a single # GT in the image, and matched_idxs can be -2, which goes # out of bounds matched_targets = target[matched_idxs.clamp(min=0)] matched_targets.add_field("matched_idxs", matched_idxs) return matched_targets def prepare_targets(self, proposals, targets): labels = [] keypoints = [] for proposals_per_image, targets_per_image in zip(proposals, targets): matched_targets = self.match_targets_to_proposals( proposals_per_image, targets_per_image ) matched_idxs = matched_targets.get_field("matched_idxs") labels_per_image = matched_targets.get_field("labels") labels_per_image = labels_per_image.to(dtype=torch.int64) # this can probably be removed, but is left here for clarity # and completeness # TODO check if this is the right one, as BELOW_THRESHOLD neg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD labels_per_image[neg_inds] = 0 keypoints_per_image = matched_targets.get_field("keypoints") within_box = _within_box( keypoints_per_image.keypoints, matched_targets.bbox ) vis_kp = keypoints_per_image.keypoints[..., 2] > 0 is_visible = (within_box & vis_kp).sum(1) > 0 labels_per_image[~is_visible] = -1 labels.append(labels_per_image) keypoints.append(keypoints_per_image) return labels, keypoints def subsample(self, proposals, targets): """ This method performs the positive/negative sampling, and return the sampled proposals. Note: this function keeps a state. Arguments: proposals (list[BoxList]) targets (list[BoxList]) """ labels, keypoints = self.prepare_targets(proposals, targets) sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) proposals = list(proposals) # add corresponding label and regression_targets information to the bounding boxes for labels_per_image, keypoints_per_image, proposals_per_image in zip( labels, keypoints, proposals ): proposals_per_image.add_field("labels", labels_per_image) proposals_per_image.add_field("keypoints", keypoints_per_image) # distributed sampled proposals, that were obtained on all feature maps # concatenated via the fg_bg_sampler, into individual feature map levels for img_idx, (pos_inds_img, neg_inds_img) in enumerate( zip(sampled_pos_inds, sampled_neg_inds) ): img_sampled_inds = torch.nonzero(pos_inds_img).squeeze(1) proposals_per_image = proposals[img_idx][img_sampled_inds] proposals[img_idx] = proposals_per_image self._proposals = proposals return proposals def __call__(self, proposals, keypoint_logits): heatmaps = [] valid = [] for proposals_per_image in proposals: kp = proposals_per_image.get_field("keypoints") heatmaps_per_image, valid_per_image = project_keypoints_to_heatmap( kp, proposals_per_image, self.discretization_size ) heatmaps.append(heatmaps_per_image.view(-1)) valid.append(valid_per_image.view(-1)) keypoint_targets = cat(heatmaps, dim=0) valid = cat(valid, dim=0).to(dtype=torch.bool) valid = torch.nonzero(valid).squeeze(1) # torch.mean (in binary_cross_entropy_with_logits) does'nt # accept empty tensors, so handle it sepaartely if keypoint_targets.numel() == 0 or len(valid) == 0: return keypoint_logits.sum() * 0 N, K, H, W = keypoint_logits.shape keypoint_logits = keypoint_logits.view(N * K, H * W) keypoint_loss = F.cross_entropy(keypoint_logits[valid], keypoint_targets[valid]) return keypoint_loss def make_roi_keypoint_loss_evaluator(cfg): matcher = Matcher( cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, allow_low_quality_matches=False, ) fg_bg_sampler = BalancedPositiveNegativeSampler( cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION ) resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION loss_evaluator = KeypointRCNNLossComputation(matcher, fg_bg_sampler, resolution) return loss_evaluator ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_feature_extractors.py ================================================ from torch import nn from torch.nn import functional as F from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.modeling.poolers import Pooler from maskrcnn_benchmark.layers import Conv2d @registry.ROI_KEYPOINT_FEATURE_EXTRACTORS.register("KeypointRCNNFeatureExtractor") class KeypointRCNNFeatureExtractor(nn.Module): def __init__(self, cfg, in_channels): super(KeypointRCNNFeatureExtractor, self).__init__() resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION scales = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO pooler = Pooler( output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, ) self.pooler = pooler input_features = in_channels layers = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS next_feature = input_features self.blocks = [] for layer_idx, layer_features in enumerate(layers, 1): layer_name = "conv_fcn{}".format(layer_idx) module = Conv2d(next_feature, layer_features, 3, stride=1, padding=1) nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") nn.init.constant_(module.bias, 0) self.add_module(layer_name, module) next_feature = layer_features self.blocks.append(layer_name) self.out_channels = layer_features def forward(self, x, proposals): x = self.pooler(x, proposals) for layer_name in self.blocks: x = F.relu(getattr(self, layer_name)(x)) return x def make_roi_keypoint_feature_extractor(cfg, in_channels): func = registry.ROI_KEYPOINT_FEATURE_EXTRACTORS[ cfg.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR ] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_predictors.py ================================================ from torch import nn from maskrcnn_benchmark import layers from maskrcnn_benchmark.modeling import registry @registry.ROI_KEYPOINT_PREDICTOR.register("KeypointRCNNPredictor") class KeypointRCNNPredictor(nn.Module): def __init__(self, cfg, in_channels): super(KeypointRCNNPredictor, self).__init__() input_features = in_channels num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES deconv_kernel = 4 self.kps_score_lowres = layers.ConvTranspose2d( input_features, num_keypoints, deconv_kernel, stride=2, padding=deconv_kernel // 2 - 1, ) nn.init.kaiming_normal_( self.kps_score_lowres.weight, mode="fan_out", nonlinearity="relu" ) nn.init.constant_(self.kps_score_lowres.bias, 0) self.up_scale = 2 self.out_channels = num_keypoints def forward(self, x): x = self.kps_score_lowres(x) x = layers.interpolate( x, scale_factor=self.up_scale, mode="bilinear", align_corners=False ) return x def make_roi_keypoint_predictor(cfg, in_channels): func = registry.ROI_KEYPOINT_PREDICTOR[cfg.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import numpy as np import torch from torch import nn from maskrcnn_benchmark.layers.misc import interpolate from maskrcnn_benchmark.structures.bounding_box import BoxList # TODO check if want to return a single BoxList or a composite # object class MaskPostProcessor(nn.Module): """ From the results of the CNN, post process the masks by taking the mask corresponding to the class with max probability (which are of fixed size and directly output by the CNN) and return the masks in the mask field of the BoxList. If a masker object is passed, it will additionally project the masks in the image according to the locations in boxes, """ def __init__(self, masker=None): super(MaskPostProcessor, self).__init__() self.masker = masker def forward(self, x, boxes): """ Arguments: x (Tensor): the mask logits boxes (list[BoxList]): bounding boxes that are used as reference, one for each image Returns: results (list[BoxList]): one BoxList for each image, containing the extra field mask """ mask_prob = x.sigmoid() # select masks coresponding to the predicted classes num_masks = x.shape[0] labels = [bbox.get_field("labels") for bbox in boxes] labels = torch.cat(labels) index = torch.arange(num_masks, device=labels.device) mask_prob = mask_prob[index, labels][:, None] boxes_per_image = [len(box) for box in boxes] mask_prob = mask_prob.split(boxes_per_image, dim=0) if self.masker: mask_prob = self.masker(mask_prob, boxes) results = [] for prob, box in zip(mask_prob, boxes): bbox = BoxList(box.bbox, box.size, mode="xyxy") for field in box.fields(): bbox.add_field(field, box.get_field(field)) bbox.add_field("mask", prob) results.append(bbox) return results class MaskPostProcessorCOCOFormat(MaskPostProcessor): """ From the results of the CNN, post process the results so that the masks are pasted in the image, and additionally convert the results to COCO format. """ def forward(self, x, boxes): import pycocotools.mask as mask_util import numpy as np results = super(MaskPostProcessorCOCOFormat, self).forward(x, boxes) for result in results: masks = result.get_field("mask").cpu() rles = [ mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0] for mask in masks ] for rle in rles: rle["counts"] = rle["counts"].decode("utf-8") result.add_field("mask", rles) return results # the next two functions should be merged inside Masker # but are kept here for the moment while we need them # temporarily gor paste_mask_in_image def expand_boxes(boxes, scale): w_half = (boxes[:, 2] - boxes[:, 0]) * .5 h_half = (boxes[:, 3] - boxes[:, 1]) * .5 x_c = (boxes[:, 2] + boxes[:, 0]) * .5 y_c = (boxes[:, 3] + boxes[:, 1]) * .5 w_half *= scale h_half *= scale boxes_exp = torch.zeros_like(boxes) boxes_exp[:, 0] = x_c - w_half boxes_exp[:, 2] = x_c + w_half boxes_exp[:, 1] = y_c - h_half boxes_exp[:, 3] = y_c + h_half return boxes_exp def expand_masks(mask, padding): N = mask.shape[0] M = mask.shape[-1] pad2 = 2 * padding scale = float(M + pad2) / M padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2)) padded_mask[:, :, padding:-padding, padding:-padding] = mask return padded_mask, scale def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1): # Need to work on the CPU, where fp16 isn't supported - cast to float to avoid this mask = mask.float() box = box.float() padded_mask, scale = expand_masks(mask[None], padding=padding) mask = padded_mask[0, 0] box = expand_boxes(box[None], scale)[0] box = box.to(dtype=torch.int32) TO_REMOVE = 1 w = int(box[2] - box[0] + TO_REMOVE) h = int(box[3] - box[1] + TO_REMOVE) w = max(w, 1) h = max(h, 1) # Set shape to [batchxCxHxW] mask = mask.expand((1, 1, -1, -1)) # Resize mask mask = mask.to(torch.float32) mask = interpolate(mask, size=(h, w), mode='bilinear', align_corners=False) mask = mask[0][0] if thresh >= 0: mask = mask > thresh else: # for visualization and debugging, we also # allow it to return an unmodified mask mask = (mask * 255).to(torch.bool) im_mask = torch.zeros((im_h, im_w), dtype=torch.bool) x_0 = max(box[0], 0) x_1 = min(box[2] + 1, im_w) y_0 = max(box[1], 0) y_1 = min(box[3] + 1, im_h) im_mask[y_0:y_1, x_0:x_1] = mask[ (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0]) ] return im_mask class Masker(object): """ Projects a set of masks in an image on the locations specified by the bounding boxes """ def __init__(self, threshold=0.5, padding=1): self.threshold = threshold self.padding = padding def forward_single_image(self, masks, boxes): boxes = boxes.convert("xyxy") im_w, im_h = boxes.size res = [ paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding) for mask, box in zip(masks, boxes.bbox) ] if len(res) > 0: res = torch.stack(res, dim=0)[:, None] else: res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1])) return res def __call__(self, masks, boxes): if isinstance(boxes, BoxList): boxes = [boxes] # Make some sanity check assert len(boxes) == len(masks), "Masks and boxes should have the same length." # TODO: Is this JIT compatible? # If not we should make it compatible. results = [] for mask, box in zip(masks, boxes): assert mask.shape[0] == len(box), "Number of objects should be the same." result = self.forward_single_image(mask, box) results.append(result) return results def make_roi_mask_post_processor(cfg): if cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS: mask_threshold = cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD masker = Masker(threshold=mask_threshold, padding=1) else: masker = None mask_post_processor = MaskPostProcessor(masker) return mask_post_processor ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from torch.nn import functional as F from maskrcnn_benchmark.layers import smooth_l1_loss from maskrcnn_benchmark.modeling.matcher import Matcher from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou from maskrcnn_benchmark.modeling.utils import cat def project_masks_on_boxes(segmentation_masks, proposals, discretization_size): """ Given segmentation masks and the bounding boxes corresponding to the location of the masks in the image, this function crops and resizes the masks in the position defined by the boxes. This prepares the masks for them to be fed to the loss computation as the targets. Arguments: segmentation_masks: an instance of SegmentationMask proposals: an instance of BoxList """ masks = [] M = discretization_size device = proposals.bbox.device proposals = proposals.convert("xyxy") assert segmentation_masks.size == proposals.size, "{}, {}".format( segmentation_masks, proposals ) # FIXME: CPU computation bottleneck, this should be parallelized proposals = proposals.bbox.to(torch.device("cpu")) for segmentation_mask, proposal in zip(segmentation_masks, proposals): # crop the masks, resize them to the desired resolution and # then convert them to the tensor representation. cropped_mask = segmentation_mask.crop(proposal) scaled_mask = cropped_mask.resize((M, M)) mask = scaled_mask.get_mask_tensor() masks.append(mask) if len(masks) == 0: return torch.empty(0, dtype=torch.float32, device=device) return torch.stack(masks, dim=0).to(device, dtype=torch.float32) class MaskRCNNLossComputation(object): def __init__(self, proposal_matcher, discretization_size): """ Arguments: proposal_matcher (Matcher) discretization_size (int) """ self.proposal_matcher = proposal_matcher self.discretization_size = discretization_size def match_targets_to_proposals(self, proposal, target): match_quality_matrix = boxlist_iou(target, proposal) matched_idxs = self.proposal_matcher(match_quality_matrix) # Mask RCNN needs "labels" and "masks "fields for creating the targets target = target.copy_with_fields(["labels", "masks"]) # get the targets corresponding GT for each proposal # NB: need to clamp the indices because we can have a single # GT in the image, and matched_idxs can be -2, which goes # out of bounds matched_targets = target[matched_idxs.clamp(min=0)] matched_targets.add_field("matched_idxs", matched_idxs) return matched_targets def prepare_targets(self, proposals, targets): labels = [] masks = [] for proposals_per_image, targets_per_image in zip(proposals, targets): matched_targets = self.match_targets_to_proposals( proposals_per_image, targets_per_image ) matched_idxs = matched_targets.get_field("matched_idxs") labels_per_image = matched_targets.get_field("labels") labels_per_image = labels_per_image.to(dtype=torch.int64) # this can probably be removed, but is left here for clarity # and completeness neg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD labels_per_image[neg_inds] = 0 # mask scores are only computed on positive samples positive_inds = torch.nonzero(labels_per_image > 0).squeeze(1) segmentation_masks = matched_targets.get_field("masks") segmentation_masks = segmentation_masks[positive_inds] positive_proposals = proposals_per_image[positive_inds] masks_per_image = project_masks_on_boxes( segmentation_masks, positive_proposals, self.discretization_size ) labels.append(labels_per_image) masks.append(masks_per_image) return labels, masks def __call__(self, proposals, mask_logits, targets): """ Arguments: proposals (list[BoxList]) mask_logits (Tensor) targets (list[BoxList]) Return: mask_loss (Tensor): scalar tensor containing the loss """ labels, mask_targets = self.prepare_targets(proposals, targets) labels = cat(labels, dim=0) mask_targets = cat(mask_targets, dim=0) positive_inds = torch.nonzero(labels > 0).squeeze(1) labels_pos = labels[positive_inds] # torch.mean (in binary_cross_entropy_with_logits) doesn't # accept empty tensors, so handle it separately if mask_targets.numel() == 0: return mask_logits.sum() * 0 mask_loss = F.binary_cross_entropy_with_logits( mask_logits[positive_inds, labels_pos], mask_targets ) return mask_loss def make_roi_mask_loss_evaluator(cfg): matcher = Matcher( cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, allow_low_quality_matches=False, ) loss_evaluator = MaskRCNNLossComputation( matcher, cfg.MODEL.ROI_MASK_HEAD.RESOLUTION ) return loss_evaluator ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/mask_head.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from torch import nn from maskrcnn_benchmark.structures.bounding_box import BoxList from .roi_mask_feature_extractors import make_roi_mask_feature_extractor from .roi_mask_predictors import make_roi_mask_predictor from .inference import make_roi_mask_post_processor from .loss import make_roi_mask_loss_evaluator def keep_only_positive_boxes(boxes): """ Given a set of BoxList containing the `labels` field, return a set of BoxList for which `labels > 0`. Arguments: boxes (list of BoxList) """ assert isinstance(boxes, (list, tuple)) assert isinstance(boxes[0], BoxList) assert boxes[0].has_field("labels") positive_boxes = [] positive_inds = [] num_boxes = 0 for boxes_per_image in boxes: labels = boxes_per_image.get_field("labels") inds_mask = labels > 0 inds = inds_mask.nonzero().squeeze(1) positive_boxes.append(boxes_per_image[inds]) positive_inds.append(inds_mask) return positive_boxes, positive_inds class ROIMaskHead(torch.nn.Module): def __init__(self, cfg, in_channels): super(ROIMaskHead, self).__init__() self.cfg = cfg.clone() self.feature_extractor = make_roi_mask_feature_extractor(cfg, in_channels) self.predictor = make_roi_mask_predictor( cfg, self.feature_extractor.out_channels) self.post_processor = make_roi_mask_post_processor(cfg) self.loss_evaluator = make_roi_mask_loss_evaluator(cfg) def forward(self, features, proposals, targets=None): """ Arguments: features (list[Tensor]): feature-maps from possibly several levels proposals (list[BoxList]): proposal boxes targets (list[BoxList], optional): the ground-truth targets. Returns: x (Tensor): the result of the feature extractor proposals (list[BoxList]): during training, the original proposals are returned. During testing, the predicted boxlists are returned with the `mask` field set losses (dict[Tensor]): During training, returns the losses for the head. During testing, returns an empty dict. """ if self.training: # during training, only focus on positive boxes all_proposals = proposals proposals, positive_inds = keep_only_positive_boxes(proposals) if self.training and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: x = features x = x[torch.cat(positive_inds, dim=0)] else: x = self.feature_extractor(features, proposals) mask_logits = self.predictor(x) if not self.training: result = self.post_processor(mask_logits, proposals) return x, result, {} loss_mask = self.loss_evaluator(proposals, mask_logits, targets) return x, all_proposals, dict(loss_mask=loss_mask) def build_roi_mask_head(cfg, in_channels): return ROIMaskHead(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from torch import nn from torch.nn import functional as F from ..box_head.roi_box_feature_extractors import ResNet50Conv5ROIFeatureExtractor from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.modeling.poolers import Pooler from maskrcnn_benchmark.modeling.make_layers import make_conv3x3 registry.ROI_MASK_FEATURE_EXTRACTORS.register( "ResNet50Conv5ROIFeatureExtractor", ResNet50Conv5ROIFeatureExtractor ) @registry.ROI_MASK_FEATURE_EXTRACTORS.register("MaskRCNNFPNFeatureExtractor") class MaskRCNNFPNFeatureExtractor(nn.Module): """ Heads for FPN for classification """ def __init__(self, cfg, in_channels): """ Arguments: num_classes (int): number of output classes input_size (int): number of channels of the input once it's flattened representation_size (int): size of the intermediate representation """ super(MaskRCNNFPNFeatureExtractor, self).__init__() resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION scales = cfg.MODEL.ROI_MASK_HEAD.POOLER_SCALES sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO pooler = Pooler( output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, ) input_size = in_channels self.pooler = pooler use_gn = cfg.MODEL.ROI_MASK_HEAD.USE_GN layers = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS dilation = cfg.MODEL.ROI_MASK_HEAD.DILATION next_feature = input_size self.blocks = [] for layer_idx, layer_features in enumerate(layers, 1): layer_name = "mask_fcn{}".format(layer_idx) module = make_conv3x3( next_feature, layer_features, dilation=dilation, stride=1, use_gn=use_gn ) self.add_module(layer_name, module) next_feature = layer_features self.blocks.append(layer_name) self.out_channels = layer_features def forward(self, x, proposals): x = self.pooler(x, proposals) for layer_name in self.blocks: x = F.relu(getattr(self, layer_name)(x)) return x def make_roi_mask_feature_extractor(cfg, in_channels): func = registry.ROI_MASK_FEATURE_EXTRACTORS[ cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR ] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_predictors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from torch import nn from torch.nn import functional as F from maskrcnn_benchmark.layers import Conv2d from maskrcnn_benchmark.layers import ConvTranspose2d from maskrcnn_benchmark.modeling import registry @registry.ROI_MASK_PREDICTOR.register("MaskRCNNC4Predictor") class MaskRCNNC4Predictor(nn.Module): def __init__(self, cfg, in_channels): super(MaskRCNNC4Predictor, self).__init__() num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES dim_reduced = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS[-1] num_inputs = in_channels self.conv5_mask = ConvTranspose2d(num_inputs, dim_reduced, 2, 2, 0) self.mask_fcn_logits = Conv2d(dim_reduced, num_classes, 1, 1, 0) for name, param in self.named_parameters(): if "bias" in name: nn.init.constant_(param, 0) elif "weight" in name: # Caffe2 implementation uses MSRAFill, which in fact # corresponds to kaiming_normal_ in PyTorch nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") def forward(self, x): x = F.relu(self.conv5_mask(x)) return self.mask_fcn_logits(x) @registry.ROI_MASK_PREDICTOR.register("MaskRCNNConv1x1Predictor") class MaskRCNNConv1x1Predictor(nn.Module): def __init__(self, cfg, in_channels): super(MaskRCNNConv1x1Predictor, self).__init__() num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES num_inputs = in_channels self.mask_fcn_logits = Conv2d(num_inputs, num_classes, 1, 1, 0) for name, param in self.named_parameters(): if "bias" in name: nn.init.constant_(param, 0) elif "weight" in name: # Caffe2 implementation uses MSRAFill, which in fact # corresponds to kaiming_normal_ in PyTorch nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") def forward(self, x): return self.mask_fcn_logits(x) def make_roi_mask_predictor(cfg, in_channels): func = registry.ROI_MASK_PREDICTOR[cfg.MODEL.ROI_MASK_HEAD.PREDICTOR] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/roi_heads.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from .box_head.box_head import build_roi_box_head from .mask_head.mask_head import build_roi_mask_head from .keypoint_head.keypoint_head import build_roi_keypoint_head class CombinedROIHeads(torch.nn.ModuleDict): """ Combines a set of individual heads (for box prediction or masks) into a single head. """ def __init__(self, cfg, heads): super(CombinedROIHeads, self).__init__(heads) self.cfg = cfg.clone() if cfg.MODEL.MASK_ON and cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: self.mask.feature_extractor = self.box.feature_extractor if cfg.MODEL.KEYPOINT_ON and cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: self.keypoint.feature_extractor = self.box.feature_extractor def forward(self, features, proposals, targets=None): losses = {} # TODO rename x to roi_box_features, if it doesn't increase memory consumption x, detections, loss_box = self.box(features, proposals, targets) losses.update(loss_box) if self.cfg.MODEL.MASK_ON: mask_features = features # optimization: during training, if we share the feature extractor between # the box and the mask heads, then we can reuse the features already computed if ( self.training and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR ): mask_features = x # During training, self.box() will return the unaltered proposals as "detections" # this makes the API consistent during training and testing x, detections, loss_mask = self.mask(mask_features, detections, targets) losses.update(loss_mask) if self.cfg.MODEL.KEYPOINT_ON: keypoint_features = features # optimization: during training, if we share the feature extractor between # the box and the mask heads, then we can reuse the features already computed if ( self.training and self.cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR ): keypoint_features = x # During training, self.box() will return the unaltered proposals as "detections" # this makes the API consistent during training and testing x, detections, loss_keypoint = self.keypoint(keypoint_features, detections, targets) losses.update(loss_keypoint) return x, detections, losses def build_roi_heads(cfg, in_channels): # individually create the heads, that will be combined together # afterwards roi_heads = [] if cfg.MODEL.RETINANET_ON: return [] if not cfg.MODEL.RPN_ONLY: roi_heads.append(("box", build_roi_box_head(cfg, in_channels))) if cfg.MODEL.MASK_ON: roi_heads.append(("mask", build_roi_mask_head(cfg, in_channels))) if cfg.MODEL.KEYPOINT_ON: roi_heads.append(("keypoint", build_roi_keypoint_head(cfg, in_channels))) # combine individual heads in a single module if roi_heads: roi_heads = CombinedROIHeads(cfg, roi_heads) return roi_heads ================================================ FILE: maskrcnn_benchmark/modeling/rpn/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # from .rpn import build_rpn ================================================ FILE: maskrcnn_benchmark/modeling/rpn/anchor_generator.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import math import numpy as np import torch from torch import nn from maskrcnn_benchmark.structures.bounding_box import BoxList class BufferList(nn.Module): """ Similar to nn.ParameterList, but for buffers """ def __init__(self, buffers=None): super(BufferList, self).__init__() if buffers is not None: self.extend(buffers) def extend(self, buffers): offset = len(self) for i, buffer in enumerate(buffers): self.register_buffer(str(offset + i), buffer) return self def __len__(self): return len(self._buffers) def __iter__(self): return iter(self._buffers.values()) class AnchorGenerator(nn.Module): """ For a set of image sizes and feature maps, computes a set of anchors """ def __init__( self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0), anchor_strides=(8, 16, 32), straddle_thresh=0, ): super(AnchorGenerator, self).__init__() if len(anchor_strides) == 1: anchor_stride = anchor_strides[0] cell_anchors = [ generate_anchors(anchor_stride, sizes, aspect_ratios).float() ] else: if len(anchor_strides) != len(sizes): raise RuntimeError("FPN should have #anchor_strides == #sizes") cell_anchors = [ generate_anchors( anchor_stride, size if isinstance(size, (tuple, list)) else (size,), aspect_ratios ).float() for anchor_stride, size in zip(anchor_strides, sizes) ] self.strides = anchor_strides self.cell_anchors = BufferList(cell_anchors) self.straddle_thresh = straddle_thresh def num_anchors_per_location(self): return [len(cell_anchors) for cell_anchors in self.cell_anchors] def grid_anchors(self, grid_sizes): anchors = [] for size, stride, base_anchors in zip( grid_sizes, self.strides, self.cell_anchors ): grid_height, grid_width = size device = base_anchors.device shifts_x = torch.arange( 0, grid_width * stride, step=stride, dtype=torch.float32, device=device ) shifts_y = torch.arange( 0, grid_height * stride, step=stride, dtype=torch.float32, device=device ) shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) shift_x = shift_x.reshape(-1) shift_y = shift_y.reshape(-1) shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) anchors.append( (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4) ) return anchors def add_visibility_to(self, boxlist): image_width, image_height = boxlist.size anchors = boxlist.bbox if self.straddle_thresh >= 0: inds_inside = ( (anchors[..., 0] >= -self.straddle_thresh) & (anchors[..., 1] >= -self.straddle_thresh) & (anchors[..., 2] < image_width + self.straddle_thresh) & (anchors[..., 3] < image_height + self.straddle_thresh) ) else: device = anchors.device inds_inside = torch.ones(anchors.shape[0], dtype=torch.bool, device=device) boxlist.add_field("visibility", inds_inside) def forward(self, image_list, feature_maps): grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] anchors_over_all_feature_maps = self.grid_anchors(grid_sizes) anchors = [] for i, (image_height, image_width) in enumerate(image_list.image_sizes): anchors_in_image = [] for anchors_per_feature_map in anchors_over_all_feature_maps: boxlist = BoxList( anchors_per_feature_map, (image_width, image_height), mode="xyxy" ) self.add_visibility_to(boxlist) anchors_in_image.append(boxlist) anchors.append(anchors_in_image) return anchors def make_anchor_generator(config): anchor_sizes = config.MODEL.RPN.ANCHOR_SIZES aspect_ratios = config.MODEL.RPN.ASPECT_RATIOS anchor_stride = config.MODEL.RPN.ANCHOR_STRIDE straddle_thresh = config.MODEL.RPN.STRADDLE_THRESH if config.MODEL.RPN.USE_FPN: assert len(anchor_stride) == len( anchor_sizes ), "FPN should have len(ANCHOR_STRIDE) == len(ANCHOR_SIZES)" else: assert len(anchor_stride) == 1, "Non-FPN should have a single ANCHOR_STRIDE" anchor_generator = AnchorGenerator( anchor_sizes, aspect_ratios, anchor_stride, straddle_thresh ) return anchor_generator def make_anchor_generator_retinanet(config): anchor_sizes = config.MODEL.RETINANET.ANCHOR_SIZES aspect_ratios = config.MODEL.RETINANET.ASPECT_RATIOS anchor_strides = config.MODEL.RETINANET.ANCHOR_STRIDES straddle_thresh = config.MODEL.RETINANET.STRADDLE_THRESH octave = config.MODEL.RETINANET.OCTAVE scales_per_octave = config.MODEL.RETINANET.SCALES_PER_OCTAVE assert len(anchor_strides) == len(anchor_sizes), "Only support FPN now" new_anchor_sizes = [] for size in anchor_sizes: per_layer_anchor_sizes = [] for scale_per_octave in range(scales_per_octave): octave_scale = octave ** (scale_per_octave / float(scales_per_octave)) per_layer_anchor_sizes.append(octave_scale * size) new_anchor_sizes.append(tuple(per_layer_anchor_sizes)) anchor_generator = AnchorGenerator( tuple(new_anchor_sizes), aspect_ratios, anchor_strides, straddle_thresh ) return anchor_generator # Copyright (c) 2017-present, Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ############################################################################## # # Based on: # -------------------------------------------------------- # Faster R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick and Sean Bell # -------------------------------------------------------- # Verify that we compute the same anchors as Shaoqing's matlab implementation: # # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat # >> anchors # # anchors = # # -83 -39 100 56 # -175 -87 192 104 # -359 -183 376 200 # -55 -55 72 72 # -119 -119 136 136 # -247 -247 264 264 # -35 -79 52 96 # -79 -167 96 184 # -167 -343 184 360 # array([[ -83., -39., 100., 56.], # [-175., -87., 192., 104.], # [-359., -183., 376., 200.], # [ -55., -55., 72., 72.], # [-119., -119., 136., 136.], # [-247., -247., 264., 264.], # [ -35., -79., 52., 96.], # [ -79., -167., 96., 184.], # [-167., -343., 184., 360.]]) def generate_anchors( stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2) ): """Generates a matrix of anchor boxes in (x1, y1, x2, y2) format. Anchors are centered on stride / 2, have (approximate) sqrt areas of the specified sizes, and aspect ratios as given. """ return _generate_anchors( stride, np.array(sizes, dtype=np.float) / stride, np.array(aspect_ratios, dtype=np.float), ) def _generate_anchors(base_size, scales, aspect_ratios): """Generate anchor (reference) windows by enumerating aspect ratios X scales wrt a reference (0, 0, base_size - 1, base_size - 1) window. """ anchor = np.array([1, 1, base_size, base_size], dtype=np.float) - 1 anchors = _ratio_enum(anchor, aspect_ratios) anchors = np.vstack( [_scale_enum(anchors[i, :], scales) for i in range(anchors.shape[0])] ) return torch.from_numpy(anchors) def _whctrs(anchor): """Return width, height, x center, and y center for an anchor (window).""" w = anchor[2] - anchor[0] + 1 h = anchor[3] - anchor[1] + 1 x_ctr = anchor[0] + 0.5 * (w - 1) y_ctr = anchor[1] + 0.5 * (h - 1) return w, h, x_ctr, y_ctr def _mkanchors(ws, hs, x_ctr, y_ctr): """Given a vector of widths (ws) and heights (hs) around a center (x_ctr, y_ctr), output a set of anchors (windows). """ ws = ws[:, np.newaxis] hs = hs[:, np.newaxis] anchors = np.hstack( ( x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1), ) ) return anchors def _ratio_enum(anchor, ratios): """Enumerate a set of anchors for each aspect ratio wrt an anchor.""" w, h, x_ctr, y_ctr = _whctrs(anchor) size = w * h size_ratios = size / ratios ws = np.round(np.sqrt(size_ratios)) hs = np.round(ws * ratios) anchors = _mkanchors(ws, hs, x_ctr, y_ctr) return anchors def _scale_enum(anchor, scales): """Enumerate a set of anchors for each scale wrt an anchor.""" w, h, x_ctr, y_ctr = _whctrs(anchor) ws = w * scales hs = h * scales anchors = _mkanchors(ws, hs, x_ctr, y_ctr) return anchors ================================================ FILE: maskrcnn_benchmark/modeling/rpn/inference.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from maskrcnn_benchmark.modeling.box_coder import BoxCoder from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes from ..utils import cat from .utils import permute_and_flatten class RPNPostProcessor(torch.nn.Module): """ Performs post-processing on the outputs of the RPN boxes, before feeding the proposals to the heads """ def __init__( self, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, box_coder=None, fpn_post_nms_top_n=None, fpn_post_nms_per_batch=True, ): """ Arguments: pre_nms_top_n (int) post_nms_top_n (int) nms_thresh (float) min_size (int) box_coder (BoxCoder) fpn_post_nms_top_n (int) """ super(RPNPostProcessor, self).__init__() self.pre_nms_top_n = pre_nms_top_n self.post_nms_top_n = post_nms_top_n self.nms_thresh = nms_thresh self.min_size = min_size if box_coder is None: box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) self.box_coder = box_coder if fpn_post_nms_top_n is None: fpn_post_nms_top_n = post_nms_top_n self.fpn_post_nms_top_n = fpn_post_nms_top_n self.fpn_post_nms_per_batch = fpn_post_nms_per_batch def add_gt_proposals(self, proposals, targets): """ Arguments: proposals: list[BoxList] targets: list[BoxList] """ # Get the device we're operating on device = proposals[0].bbox.device gt_boxes = [target.copy_with_fields([]) for target in targets] # later cat of bbox requires all fields to be present for all bbox # so we need to add a dummy for objectness that's missing for gt_box in gt_boxes: gt_box.add_field("objectness", torch.ones(len(gt_box), device=device)) proposals = [ cat_boxlist((proposal, gt_box)) for proposal, gt_box in zip(proposals, gt_boxes) ] return proposals def forward_for_single_feature_map(self, anchors, objectness, box_regression): """ Arguments: anchors: list[BoxList] objectness: tensor of size N, A, H, W box_regression: tensor of size N, A * 4, H, W """ device = objectness.device N, A, H, W = objectness.shape # put in the same format as anchors objectness = permute_and_flatten(objectness, N, A, 1, H, W).view(N, -1) objectness = objectness.sigmoid() box_regression = permute_and_flatten(box_regression, N, A, 4, H, W) num_anchors = A * H * W pre_nms_top_n = min(self.pre_nms_top_n, num_anchors) objectness, topk_idx = objectness.topk(pre_nms_top_n, dim=1, sorted=True) batch_idx = torch.arange(N, device=device)[:, None] box_regression = box_regression[batch_idx, topk_idx] image_shapes = [box.size for box in anchors] concat_anchors = torch.cat([a.bbox for a in anchors], dim=0) concat_anchors = concat_anchors.reshape(N, -1, 4)[batch_idx, topk_idx] proposals = self.box_coder.decode( box_regression.view(-1, 4), concat_anchors.view(-1, 4) ) proposals = proposals.view(N, -1, 4) result = [] for proposal, score, im_shape in zip(proposals, objectness, image_shapes): boxlist = BoxList(proposal, im_shape, mode="xyxy") boxlist.add_field("objectness", score) boxlist = boxlist.clip_to_image(remove_empty=False) boxlist = remove_small_boxes(boxlist, self.min_size) boxlist = boxlist_nms( boxlist, self.nms_thresh, max_proposals=self.post_nms_top_n, score_field="objectness", ) result.append(boxlist) return result def forward(self, anchors, objectness, box_regression, targets=None): """ Arguments: anchors: list[list[BoxList]] objectness: list[tensor] box_regression: list[tensor] Returns: boxlists (list[BoxList]): the post-processed anchors, after applying box decoding and NMS """ sampled_boxes = [] num_levels = len(objectness) anchors = list(zip(*anchors)) for a, o, b in zip(anchors, objectness, box_regression): sampled_boxes.append(self.forward_for_single_feature_map(a, o, b)) boxlists = list(zip(*sampled_boxes)) boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] if num_levels > 1: boxlists = self.select_over_all_levels(boxlists) # append ground-truth bboxes to proposals if self.training and targets is not None: boxlists = self.add_gt_proposals(boxlists, targets) return boxlists def select_over_all_levels(self, boxlists): num_images = len(boxlists) # different behavior during training and during testing: # during training, post_nms_top_n is over *all* the proposals combined, while # during testing, it is over the proposals for each image # NOTE: it should be per image, and not per batch. However, to be consistent # with Detectron, the default is per batch (see Issue #672) if self.training and self.fpn_post_nms_per_batch: objectness = torch.cat( [boxlist.get_field("objectness") for boxlist in boxlists], dim=0 ) box_sizes = [len(boxlist) for boxlist in boxlists] post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness)) _, inds_sorted = torch.topk(objectness, post_nms_top_n, dim=0, sorted=True) inds_mask = torch.zeros_like(objectness, dtype=torch.bool) inds_mask[inds_sorted] = 1 inds_mask = inds_mask.split(box_sizes) for i in range(num_images): boxlists[i] = boxlists[i][inds_mask[i]] else: for i in range(num_images): objectness = boxlists[i].get_field("objectness") post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness)) _, inds_sorted = torch.topk( objectness, post_nms_top_n, dim=0, sorted=True ) boxlists[i] = boxlists[i][inds_sorted] return boxlists def make_rpn_postprocessor(config, rpn_box_coder, is_train): fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN if not is_train: fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TRAIN post_nms_top_n = config.MODEL.RPN.POST_NMS_TOP_N_TRAIN if not is_train: pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TEST post_nms_top_n = config.MODEL.RPN.POST_NMS_TOP_N_TEST fpn_post_nms_per_batch = config.MODEL.RPN.FPN_POST_NMS_PER_BATCH nms_thresh = config.MODEL.RPN.NMS_THRESH min_size = config.MODEL.RPN.MIN_SIZE box_selector = RPNPostProcessor( pre_nms_top_n=pre_nms_top_n, post_nms_top_n=post_nms_top_n, nms_thresh=nms_thresh, min_size=min_size, box_coder=rpn_box_coder, fpn_post_nms_top_n=fpn_post_nms_top_n, fpn_post_nms_per_batch=fpn_post_nms_per_batch, ) return box_selector ================================================ FILE: maskrcnn_benchmark/modeling/rpn/loss.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ This file contains specific functions for computing losses on the RPN file """ import torch from torch.nn import functional as F from .utils import concat_box_prediction_layers from ..balanced_positive_negative_sampler import BalancedPositiveNegativeSampler from ..utils import cat from maskrcnn_benchmark.layers import smooth_l1_loss from maskrcnn_benchmark.modeling.matcher import Matcher from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist class RPNLossComputation(object): """ This class computes the RPN loss. """ def __init__(self, proposal_matcher, fg_bg_sampler, box_coder, generate_labels_func): """ Arguments: proposal_matcher (Matcher) fg_bg_sampler (BalancedPositiveNegativeSampler) box_coder (BoxCoder) """ # self.target_preparator = target_preparator self.proposal_matcher = proposal_matcher self.fg_bg_sampler = fg_bg_sampler self.box_coder = box_coder self.copied_fields = [] self.generate_labels_func = generate_labels_func self.discard_cases = ['not_visibility', 'between_thresholds'] def match_targets_to_anchors(self, anchor, target, copied_fields=[]): match_quality_matrix = boxlist_iou(target, anchor) matched_idxs = self.proposal_matcher(match_quality_matrix) # RPN doesn't need any fields from target # for creating the labels, so clear them all target = target.copy_with_fields(copied_fields) # get the targets corresponding GT for each anchor # NB: need to clamp the indices because we can have a single # GT in the image, and matched_idxs can be -2, which goes # out of bounds matched_targets = target[matched_idxs.clamp(min=0)] matched_targets.add_field("matched_idxs", matched_idxs) return matched_targets def prepare_targets(self, anchors, targets): labels = [] regression_targets = [] for anchors_per_image, targets_per_image in zip(anchors, targets): matched_targets = self.match_targets_to_anchors( anchors_per_image, targets_per_image, self.copied_fields ) matched_idxs = matched_targets.get_field("matched_idxs") labels_per_image = self.generate_labels_func(matched_targets) labels_per_image = labels_per_image.to(dtype=torch.float32) # Background (negative examples) bg_indices = matched_idxs == Matcher.BELOW_LOW_THRESHOLD labels_per_image[bg_indices] = 0 # discard anchors that go out of the boundaries of the image if "not_visibility" in self.discard_cases: labels_per_image[~anchors_per_image.get_field("visibility")] = -1 # discard indices that are between thresholds if "between_thresholds" in self.discard_cases: inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS labels_per_image[inds_to_discard] = -1 # compute regression targets regression_targets_per_image = self.box_coder.encode( matched_targets.bbox, anchors_per_image.bbox ) labels.append(labels_per_image) regression_targets.append(regression_targets_per_image) return labels, regression_targets def __call__(self, anchors, objectness, box_regression, targets): """ Arguments: anchors (list[list[BoxList]]) objectness (list[Tensor]) box_regression (list[Tensor]) targets (list[BoxList]) Returns: objectness_loss (Tensor) box_loss (Tensor) """ anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] labels, regression_targets = self.prepare_targets(anchors, targets) sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1) sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1) sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0) objectness, box_regression = \ concat_box_prediction_layers(objectness, box_regression) objectness = objectness.squeeze() labels = torch.cat(labels, dim=0) regression_targets = torch.cat(regression_targets, dim=0) box_loss = smooth_l1_loss( box_regression[sampled_pos_inds], regression_targets[sampled_pos_inds], beta=1.0 / 9, size_average=False, ) / (sampled_inds.numel()) objectness_loss = F.binary_cross_entropy_with_logits( objectness[sampled_inds], labels[sampled_inds] ) return objectness_loss, box_loss # This function should be overwritten in RetinaNet def generate_rpn_labels(matched_targets): matched_idxs = matched_targets.get_field("matched_idxs") labels_per_image = matched_idxs >= 0 return labels_per_image def make_rpn_loss_evaluator(cfg, box_coder): matcher = Matcher( cfg.MODEL.RPN.FG_IOU_THRESHOLD, cfg.MODEL.RPN.BG_IOU_THRESHOLD, allow_low_quality_matches=True, ) fg_bg_sampler = BalancedPositiveNegativeSampler( cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE, cfg.MODEL.RPN.POSITIVE_FRACTION ) loss_evaluator = RPNLossComputation( matcher, fg_bg_sampler, box_coder, generate_rpn_labels ) return loss_evaluator ================================================ FILE: maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/rpn/retinanet/inference.py ================================================ import torch from ..inference import RPNPostProcessor from ..utils import permute_and_flatten from maskrcnn_benchmark.modeling.box_coder import BoxCoder from maskrcnn_benchmark.modeling.utils import cat from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes class RetinaNetPostProcessor(RPNPostProcessor): """ Performs post-processing on the outputs of the RetinaNet boxes. This is only used in the testing. """ def __init__( self, pre_nms_thresh, pre_nms_top_n, nms_thresh, fpn_post_nms_top_n, min_size, num_classes, box_coder=None, ): """ Arguments: pre_nms_thresh (float) pre_nms_top_n (int) nms_thresh (float) fpn_post_nms_top_n (int) min_size (int) num_classes (int) box_coder (BoxCoder) """ super(RetinaNetPostProcessor, self).__init__( pre_nms_thresh, 0, nms_thresh, min_size ) self.pre_nms_thresh = pre_nms_thresh self.pre_nms_top_n = pre_nms_top_n self.nms_thresh = nms_thresh self.fpn_post_nms_top_n = fpn_post_nms_top_n self.min_size = min_size self.num_classes = num_classes if box_coder is None: box_coder = BoxCoder(weights=(10., 10., 5., 5.)) self.box_coder = box_coder def add_gt_proposals(self, proposals, targets): """ This function is not used in RetinaNet """ pass def forward_for_single_feature_map( self, anchors, box_cls, box_regression): """ Arguments: anchors: list[BoxList] box_cls: tensor of size N, A * C, H, W box_regression: tensor of size N, A * 4, H, W """ device = box_cls.device N, _, H, W = box_cls.shape A = box_regression.size(1) // 4 C = box_cls.size(1) // A # put in the same format as anchors box_cls = permute_and_flatten(box_cls, N, A, C, H, W) box_cls = box_cls.sigmoid() box_regression = permute_and_flatten(box_regression, N, A, 4, H, W) num_anchors = A * H * W candidate_inds = box_cls > self.pre_nms_thresh pre_nms_top_n = candidate_inds.view(N, -1).sum(1) pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) results = [] for per_box_cls, per_box_regression, per_pre_nms_top_n, \ per_candidate_inds, per_anchors in zip( box_cls, box_regression, pre_nms_top_n, candidate_inds, anchors): # Sort and select TopN # TODO most of this can be made out of the loop for # all images. # TODO:Yang: Not easy to do. Because the numbers of detections are # different in each image. Therefore, this part needs to be done # per image. per_box_cls = per_box_cls[per_candidate_inds] per_box_cls, top_k_indices = \ per_box_cls.topk(per_pre_nms_top_n, sorted=False) per_candidate_nonzeros = \ per_candidate_inds.nonzero()[top_k_indices, :] per_box_loc = per_candidate_nonzeros[:, 0] per_class = per_candidate_nonzeros[:, 1] per_class += 1 detections = self.box_coder.decode( per_box_regression[per_box_loc, :].view(-1, 4), per_anchors.bbox[per_box_loc, :].view(-1, 4) ) boxlist = BoxList(detections, per_anchors.size, mode="xyxy") boxlist.add_field("labels", per_class) boxlist.add_field("scores", per_box_cls) boxlist = boxlist.clip_to_image(remove_empty=False) boxlist = remove_small_boxes(boxlist, self.min_size) results.append(boxlist) return results # TODO very similar to filter_results from PostProcessor # but filter_results is per image # TODO Yang: solve this issue in the future. No good solution # right now. def select_over_all_levels(self, boxlists): num_images = len(boxlists) results = [] for i in range(num_images): scores = boxlists[i].get_field("scores") labels = boxlists[i].get_field("labels") boxes = boxlists[i].bbox boxlist = boxlists[i] result = [] # skip the background for j in range(1, self.num_classes): inds = (labels == j).nonzero().view(-1) scores_j = scores[inds] boxes_j = boxes[inds, :].view(-1, 4) boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") boxlist_for_class.add_field("scores", scores_j) boxlist_for_class = boxlist_nms( boxlist_for_class, self.nms_thresh, score_field="scores" ) num_labels = len(boxlist_for_class) boxlist_for_class.add_field( "labels", torch.full((num_labels,), j, dtype=torch.int64, device=scores.device) ) result.append(boxlist_for_class) result = cat_boxlist(result) number_of_detections = len(result) # Limit to max_per_image detections **over all classes** if number_of_detections > self.fpn_post_nms_top_n > 0: cls_scores = result.get_field("scores") image_thresh, _ = torch.kthvalue( cls_scores.cpu(), number_of_detections - self.fpn_post_nms_top_n + 1 ) keep = cls_scores >= image_thresh.item() keep = torch.nonzero(keep).squeeze(1) result = result[keep] results.append(result) return results def make_retinanet_postprocessor(config, rpn_box_coder, is_train): pre_nms_thresh = config.MODEL.RETINANET.INFERENCE_TH pre_nms_top_n = config.MODEL.RETINANET.PRE_NMS_TOP_N nms_thresh = config.MODEL.RETINANET.NMS_TH fpn_post_nms_top_n = config.TEST.DETECTIONS_PER_IMG min_size = 0 box_selector = RetinaNetPostProcessor( pre_nms_thresh=pre_nms_thresh, pre_nms_top_n=pre_nms_top_n, nms_thresh=nms_thresh, fpn_post_nms_top_n=fpn_post_nms_top_n, min_size=min_size, num_classes=config.MODEL.RETINANET.NUM_CLASSES, box_coder=rpn_box_coder, ) return box_selector ================================================ FILE: maskrcnn_benchmark/modeling/rpn/retinanet/loss.py ================================================ """ This file contains specific functions for computing losses on the RetinaNet file """ import torch from torch.nn import functional as F from ..utils import concat_box_prediction_layers from maskrcnn_benchmark.layers import smooth_l1_loss from maskrcnn_benchmark.layers import SigmoidFocalLoss from maskrcnn_benchmark.modeling.matcher import Matcher from maskrcnn_benchmark.modeling.utils import cat from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist from maskrcnn_benchmark.modeling.rpn.loss import RPNLossComputation class RetinaNetLossComputation(RPNLossComputation): """ This class computes the RetinaNet loss. """ def __init__(self, proposal_matcher, box_coder, generate_labels_func, sigmoid_focal_loss, bbox_reg_beta=0.11, regress_norm=1.0): """ Arguments: proposal_matcher (Matcher) box_coder (BoxCoder) """ self.proposal_matcher = proposal_matcher self.box_coder = box_coder self.box_cls_loss_func = sigmoid_focal_loss self.bbox_reg_beta = bbox_reg_beta self.copied_fields = ['labels'] self.generate_labels_func = generate_labels_func self.discard_cases = ['between_thresholds'] self.regress_norm = regress_norm def __call__(self, anchors, box_cls, box_regression, targets): """ Arguments: anchors (list[BoxList]) box_cls (list[Tensor]) box_regression (list[Tensor]) targets (list[BoxList]) Returns: retinanet_cls_loss (Tensor) retinanet_regression_loss (Tensor """ anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] labels, regression_targets = self.prepare_targets(anchors, targets) N = len(labels) box_cls, box_regression = \ concat_box_prediction_layers(box_cls, box_regression) labels = torch.cat(labels, dim=0) regression_targets = torch.cat(regression_targets, dim=0) pos_inds = torch.nonzero(labels > 0).squeeze(1) retinanet_regression_loss = smooth_l1_loss( box_regression[pos_inds], regression_targets[pos_inds], beta=self.bbox_reg_beta, size_average=False, ) / (max(1, pos_inds.numel() * self.regress_norm)) labels = labels.int() retinanet_cls_loss = self.box_cls_loss_func( box_cls, labels ) / (pos_inds.numel() + N) return retinanet_cls_loss, retinanet_regression_loss def generate_retinanet_labels(matched_targets): labels_per_image = matched_targets.get_field("labels") return labels_per_image def make_retinanet_loss_evaluator(cfg, box_coder): matcher = Matcher( cfg.MODEL.RETINANET.FG_IOU_THRESHOLD, cfg.MODEL.RETINANET.BG_IOU_THRESHOLD, allow_low_quality_matches=True, ) sigmoid_focal_loss = SigmoidFocalLoss( cfg.MODEL.RETINANET.LOSS_GAMMA, cfg.MODEL.RETINANET.LOSS_ALPHA ) loss_evaluator = RetinaNetLossComputation( matcher, box_coder, generate_retinanet_labels, sigmoid_focal_loss, bbox_reg_beta = cfg.MODEL.RETINANET.BBOX_REG_BETA, regress_norm = cfg.MODEL.RETINANET.BBOX_REG_WEIGHT, ) return loss_evaluator ================================================ FILE: maskrcnn_benchmark/modeling/rpn/retinanet/retinanet.py ================================================ import math import torch import torch.nn.functional as F from torch import nn from .inference import make_retinanet_postprocessor from .loss import make_retinanet_loss_evaluator from ..anchor_generator import make_anchor_generator_retinanet from maskrcnn_benchmark.modeling.box_coder import BoxCoder class RetinaNetHead(torch.nn.Module): """ Adds a RetinNet head with classification and regression heads """ def __init__(self, cfg, in_channels): """ Arguments: in_channels (int): number of channels of the input feature num_anchors (int): number of anchors to be predicted """ super(RetinaNetHead, self).__init__() # TODO: Implement the sigmoid version first. num_classes = cfg.MODEL.RETINANET.NUM_CLASSES - 1 num_anchors = len(cfg.MODEL.RETINANET.ASPECT_RATIOS) \ * cfg.MODEL.RETINANET.SCALES_PER_OCTAVE cls_tower = [] bbox_tower = [] for i in range(cfg.MODEL.RETINANET.NUM_CONVS): cls_tower.append( nn.Conv2d( in_channels, in_channels, kernel_size=3, stride=1, padding=1 ) ) cls_tower.append(nn.ReLU()) bbox_tower.append( nn.Conv2d( in_channels, in_channels, kernel_size=3, stride=1, padding=1 ) ) bbox_tower.append(nn.ReLU()) self.add_module('cls_tower', nn.Sequential(*cls_tower)) self.add_module('bbox_tower', nn.Sequential(*bbox_tower)) self.cls_logits = nn.Conv2d( in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1 ) self.bbox_pred = nn.Conv2d( in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1 ) # Initialization for modules in [self.cls_tower, self.bbox_tower, self.cls_logits, self.bbox_pred]: for l in modules.modules(): if isinstance(l, nn.Conv2d): torch.nn.init.normal_(l.weight, std=0.01) torch.nn.init.constant_(l.bias, 0) # retinanet_bias_init prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB bias_value = -math.log((1 - prior_prob) / prior_prob) torch.nn.init.constant_(self.cls_logits.bias, bias_value) def forward(self, x): logits = [] bbox_reg = [] for feature in x: logits.append(self.cls_logits(self.cls_tower(feature))) bbox_reg.append(self.bbox_pred(self.bbox_tower(feature))) return logits, bbox_reg class RetinaNetModule(torch.nn.Module): """ Module for RetinaNet computation. Takes feature maps from the backbone and RetinaNet outputs and losses. Only Test on FPN now. """ def __init__(self, cfg, in_channels): super(RetinaNetModule, self).__init__() self.cfg = cfg.clone() anchor_generator = make_anchor_generator_retinanet(cfg) head = RetinaNetHead(cfg, in_channels) box_coder = BoxCoder(weights=(10., 10., 5., 5.)) box_selector_test = make_retinanet_postprocessor(cfg, box_coder, is_train=False) loss_evaluator = make_retinanet_loss_evaluator(cfg, box_coder) self.anchor_generator = anchor_generator self.head = head self.box_selector_test = box_selector_test self.loss_evaluator = loss_evaluator def forward(self, images, features, targets=None): """ Arguments: images (ImageList): images for which we want to compute the predictions features (list[Tensor]): features computed from the images that are used for computing the predictions. Each tensor in the list correspond to different feature levels targets (list[BoxList): ground-truth boxes present in the image (optional) Returns: boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per image. losses (dict[Tensor]): the losses for the model during training. During testing, it is an empty dict. """ box_cls, box_regression = self.head(features) anchors = self.anchor_generator(images, features) if self.training: return self._forward_train(anchors, box_cls, box_regression, targets) else: return self._forward_test(anchors, box_cls, box_regression) def _forward_train(self, anchors, box_cls, box_regression, targets): loss_box_cls, loss_box_reg = self.loss_evaluator( anchors, box_cls, box_regression, targets ) losses = { "loss_retina_cls": loss_box_cls, "loss_retina_reg": loss_box_reg, } return anchors, losses def _forward_test(self, anchors, box_cls, box_regression): boxes = self.box_selector_test(anchors, box_cls, box_regression) return boxes, {} def build_retinanet(cfg, in_channels): return RetinaNetModule(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/rpn/rpn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch import torch.nn.functional as F from torch import nn from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.modeling.box_coder import BoxCoder from maskrcnn_benchmark.modeling.rpn.retinanet.retinanet import build_retinanet from .loss import make_rpn_loss_evaluator from .anchor_generator import make_anchor_generator from .inference import make_rpn_postprocessor class RPNHeadConvRegressor(nn.Module): """ A simple RPN Head for classification and bbox regression """ def __init__(self, cfg, in_channels, num_anchors): """ Arguments: cfg : config in_channels (int): number of channels of the input feature num_anchors (int): number of anchors to be predicted """ super(RPNHeadConvRegressor, self).__init__() self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1) self.bbox_pred = nn.Conv2d( in_channels, num_anchors * 4, kernel_size=1, stride=1 ) for l in [self.cls_logits, self.bbox_pred]: torch.nn.init.normal_(l.weight, std=0.01) torch.nn.init.constant_(l.bias, 0) def forward(self, x): assert isinstance(x, (list, tuple)) logits = [self.cls_logits(y) for y in x] bbox_reg = [self.bbox_pred(y) for y in x] return logits, bbox_reg class RPNHeadFeatureSingleConv(nn.Module): """ Adds a simple RPN Head with one conv to extract the feature """ def __init__(self, cfg, in_channels): """ Arguments: cfg : config in_channels (int): number of channels of the input feature """ super(RPNHeadFeatureSingleConv, self).__init__() self.conv = nn.Conv2d( in_channels, in_channels, kernel_size=3, stride=1, padding=1 ) for l in [self.conv]: torch.nn.init.normal_(l.weight, std=0.01) torch.nn.init.constant_(l.bias, 0) self.out_channels = in_channels def forward(self, x): assert isinstance(x, (list, tuple)) x = [F.relu(self.conv(z)) for z in x] return x @registry.RPN_HEADS.register("SingleConvRPNHead") class RPNHead(nn.Module): """ Adds a simple RPN Head with classification and regression heads """ def __init__(self, cfg, in_channels, num_anchors): """ Arguments: cfg : config in_channels (int): number of channels of the input feature num_anchors (int): number of anchors to be predicted """ super(RPNHead, self).__init__() self.conv = nn.Conv2d( in_channels, in_channels, kernel_size=3, stride=1, padding=1 ) self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1) self.bbox_pred = nn.Conv2d( in_channels, num_anchors * 4, kernel_size=1, stride=1 ) for l in [self.conv, self.cls_logits, self.bbox_pred]: torch.nn.init.normal_(l.weight, std=0.01) torch.nn.init.constant_(l.bias, 0) def forward(self, x): logits = [] bbox_reg = [] for feature in x: t = F.relu(self.conv(feature)) logits.append(self.cls_logits(t)) bbox_reg.append(self.bbox_pred(t)) return logits, bbox_reg class RPNModule(torch.nn.Module): """ Module for RPN computation. Takes feature maps from the backbone and outputs RPN proposals and losses. Works for both FPN and non-FPN. """ def __init__(self, cfg, in_channels): super(RPNModule, self).__init__() self.cfg = cfg.clone() anchor_generator = make_anchor_generator(cfg) rpn_head = registry.RPN_HEADS[cfg.MODEL.RPN.RPN_HEAD] head = rpn_head( cfg, in_channels, anchor_generator.num_anchors_per_location()[0] ) rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) box_selector_train = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=True) box_selector_test = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=False) loss_evaluator = make_rpn_loss_evaluator(cfg, rpn_box_coder) self.anchor_generator = anchor_generator self.head = head self.box_selector_train = box_selector_train self.box_selector_test = box_selector_test self.loss_evaluator = loss_evaluator def forward(self, images, features, targets=None): """ Arguments: images (ImageList): images for which we want to compute the predictions features (list[Tensor]): features computed from the images that are used for computing the predictions. Each tensor in the list correspond to different feature levels targets (list[BoxList): ground-truth boxes present in the image (optional) Returns: boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per image. losses (dict[Tensor]): the losses for the model during training. During testing, it is an empty dict. """ objectness, rpn_box_regression = self.head(features) anchors = self.anchor_generator(images, features) if self.training: return self._forward_train(anchors, objectness, rpn_box_regression, targets) else: return self._forward_test(anchors, objectness, rpn_box_regression) def _forward_train(self, anchors, objectness, rpn_box_regression, targets): if self.cfg.MODEL.RPN_ONLY: # When training an RPN-only model, the loss is determined by the # predicted objectness and rpn_box_regression values and there is # no need to transform the anchors into predicted boxes; this is an # optimization that avoids the unnecessary transformation. boxes = anchors else: # For end-to-end models, anchors must be transformed into boxes and # sampled into a training batch. with torch.no_grad(): boxes = self.box_selector_train( anchors, objectness, rpn_box_regression, targets ) loss_objectness, loss_rpn_box_reg = self.loss_evaluator( anchors, objectness, rpn_box_regression, targets ) losses = { "loss_objectness": loss_objectness, "loss_rpn_box_reg": loss_rpn_box_reg, } return boxes, losses def _forward_test(self, anchors, objectness, rpn_box_regression): boxes = self.box_selector_test(anchors, objectness, rpn_box_regression) if self.cfg.MODEL.RPN_ONLY: # For end-to-end models, the RPN proposals are an intermediate state # and don't bother to sort them in decreasing score order. For RPN-only # models, the proposals are the final output and we return them in # high-to-low confidence order. inds = [ box.get_field("objectness").sort(descending=True)[1] for box in boxes ] boxes = [box[ind] for box, ind in zip(boxes, inds)] return boxes, {} def build_rpn(cfg, in_channels): """ This gives the gist of it. Not super important because it doesn't change as much """ if cfg.MODEL.RETINANET_ON: return build_retinanet(cfg, in_channels) return RPNModule(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/rpn/utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ Utility functions minipulating the prediction layers """ from ..utils import cat import torch def permute_and_flatten(layer, N, A, C, H, W): layer = layer.view(N, -1, C, H, W) layer = layer.permute(0, 3, 4, 1, 2) layer = layer.reshape(N, -1, C) return layer def concat_box_prediction_layers(box_cls, box_regression): box_cls_flattened = [] box_regression_flattened = [] # for each feature level, permute the outputs to make them be in the # same format as the labels. Note that the labels are computed for # all feature levels concatenated, so we keep the same representation # for the objectness and the box_regression for box_cls_per_level, box_regression_per_level in zip( box_cls, box_regression ): N, AxC, H, W = box_cls_per_level.shape Ax4 = box_regression_per_level.shape[1] A = Ax4 // 4 C = AxC // A box_cls_per_level = permute_and_flatten( box_cls_per_level, N, A, C, H, W ) box_cls_flattened.append(box_cls_per_level) box_regression_per_level = permute_and_flatten( box_regression_per_level, N, A, 4, H, W ) box_regression_flattened.append(box_regression_per_level) # concatenate on the first dimension (representing the feature levels), to # take into account the way the labels were generated (with all feature maps # being concatenated as well) box_cls = cat(box_cls_flattened, dim=1).reshape(-1, C) box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4) return box_cls, box_regression ================================================ FILE: maskrcnn_benchmark/modeling/utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ Miscellaneous utility functions """ import torch def cat(tensors, dim=0): """ Efficient version of torch.cat that avoids a copy if there is only a single element in a list """ assert isinstance(tensors, (list, tuple)) if len(tensors) == 1: return tensors[0] return torch.cat(tensors, dim) ================================================ FILE: maskrcnn_benchmark/solver/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .build import make_optimizer from .build import make_lr_scheduler from .lr_scheduler import WarmupMultiStepLR ================================================ FILE: maskrcnn_benchmark/solver/build.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from .lr_scheduler import WarmupMultiStepLR def make_optimizer(cfg, model): params = [] for key, value in model.named_parameters(): if not value.requires_grad: continue lr = cfg.SOLVER.BASE_LR weight_decay = cfg.SOLVER.WEIGHT_DECAY if "bias" in key: lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] optimizer = torch.optim.SGD(params, lr, momentum=cfg.SOLVER.MOMENTUM) return optimizer def make_lr_scheduler(cfg, optimizer): return WarmupMultiStepLR( optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA, warmup_factor=cfg.SOLVER.WARMUP_FACTOR, warmup_iters=cfg.SOLVER.WARMUP_ITERS, warmup_method=cfg.SOLVER.WARMUP_METHOD, ) ================================================ FILE: maskrcnn_benchmark/solver/lr_scheduler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from bisect import bisect_right import torch # FIXME ideally this would be achieved with a CombinedLRScheduler, # separating MultiStepLR with WarmupLR # but the current LRScheduler design doesn't allow it class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): def __init__( self, optimizer, milestones, gamma=0.1, warmup_factor=1.0 / 3, warmup_iters=500, warmup_method="linear", last_epoch=-1, ): if not list(milestones) == sorted(milestones): raise ValueError( "Milestones should be a list of" " increasing integers. Got {}", milestones, ) if warmup_method not in ("constant", "linear"): raise ValueError( "Only 'constant' or 'linear' warmup_method accepted" "got {}".format(warmup_method) ) self.milestones = milestones self.gamma = gamma self.warmup_factor = warmup_factor self.warmup_iters = warmup_iters self.warmup_method = warmup_method super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) def get_lr(self): warmup_factor = 1 if self.last_epoch < self.warmup_iters: if self.warmup_method == "constant": warmup_factor = self.warmup_factor elif self.warmup_method == "linear": alpha = float(self.last_epoch) / self.warmup_iters warmup_factor = self.warmup_factor * (1 - alpha) + alpha return [ base_lr * warmup_factor * self.gamma ** bisect_right(self.milestones, self.last_epoch) for base_lr in self.base_lrs ] ================================================ FILE: maskrcnn_benchmark/structures/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/structures/bounding_box.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch # transpose FLIP_LEFT_RIGHT = 0 FLIP_TOP_BOTTOM = 1 class BoxList(object): """ This class represents a set of bounding boxes. The bounding boxes are represented as a Nx4 Tensor. In order to uniquely determine the bounding boxes with respect to an image, we also store the corresponding image dimensions. They can contain extra information that is specific to each bounding box, such as labels. """ def __init__(self, bbox, image_size, mode="xyxy"): device = bbox.device if isinstance(bbox, torch.Tensor) else torch.device("cpu") bbox = torch.as_tensor(bbox, dtype=torch.float32, device=device) if bbox.ndimension() != 2: raise ValueError( "bbox should have 2 dimensions, got {}".format(bbox.ndimension()) ) if bbox.size(-1) != 4: raise ValueError( "last dimension of bbox should have a " "size of 4, got {}".format(bbox.size(-1)) ) if mode not in ("xyxy", "xywh"): raise ValueError("mode should be 'xyxy' or 'xywh'") self.bbox = bbox self.size = image_size # (image_width, image_height) self.mode = mode self.extra_fields = {} def add_field(self, field, field_data): self.extra_fields[field] = field_data def get_field(self, field): return self.extra_fields[field] def has_field(self, field): return field in self.extra_fields def fields(self): return list(self.extra_fields.keys()) def _copy_extra_fields(self, bbox): for k, v in bbox.extra_fields.items(): self.extra_fields[k] = v def convert(self, mode): if mode not in ("xyxy", "xywh"): raise ValueError("mode should be 'xyxy' or 'xywh'") if mode == self.mode: return self # we only have two modes, so don't need to check # self.mode xmin, ymin, xmax, ymax = self._split_into_xyxy() if mode == "xyxy": bbox = torch.cat((xmin, ymin, xmax, ymax), dim=-1) bbox = BoxList(bbox, self.size, mode=mode) else: TO_REMOVE = 1 bbox = torch.cat( (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE), dim=-1 ) bbox = BoxList(bbox, self.size, mode=mode) bbox._copy_extra_fields(self) return bbox def _split_into_xyxy(self): if self.mode == "xyxy": xmin, ymin, xmax, ymax = self.bbox.split(1, dim=-1) return xmin, ymin, xmax, ymax elif self.mode == "xywh": TO_REMOVE = 1 xmin, ymin, w, h = self.bbox.split(1, dim=-1) return ( xmin, ymin, xmin + (w - TO_REMOVE).clamp(min=0), ymin + (h - TO_REMOVE).clamp(min=0), ) else: raise RuntimeError("Should not be here") def resize(self, size, *args, **kwargs): """ Returns a resized copy of this bounding box :param size: The requested size in pixels, as a 2-tuple: (width, height). """ ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) if ratios[0] == ratios[1]: ratio = ratios[0] scaled_box = self.bbox * ratio bbox = BoxList(scaled_box, size, mode=self.mode) # bbox._copy_extra_fields(self) for k, v in self.extra_fields.items(): if not isinstance(v, torch.Tensor): v = v.resize(size, *args, **kwargs) bbox.add_field(k, v) return bbox ratio_width, ratio_height = ratios xmin, ymin, xmax, ymax = self._split_into_xyxy() scaled_xmin = xmin * ratio_width scaled_xmax = xmax * ratio_width scaled_ymin = ymin * ratio_height scaled_ymax = ymax * ratio_height scaled_box = torch.cat( (scaled_xmin, scaled_ymin, scaled_xmax, scaled_ymax), dim=-1 ) bbox = BoxList(scaled_box, size, mode="xyxy") # bbox._copy_extra_fields(self) for k, v in self.extra_fields.items(): if not isinstance(v, torch.Tensor): v = v.resize(size, *args, **kwargs) bbox.add_field(k, v) return bbox.convert(self.mode) def transpose(self, method): """ Transpose bounding box (flip or rotate in 90 degree steps) :param method: One of :py:attr:`PIL.Image.FLIP_LEFT_RIGHT`, :py:attr:`PIL.Image.FLIP_TOP_BOTTOM`, :py:attr:`PIL.Image.ROTATE_90`, :py:attr:`PIL.Image.ROTATE_180`, :py:attr:`PIL.Image.ROTATE_270`, :py:attr:`PIL.Image.TRANSPOSE` or :py:attr:`PIL.Image.TRANSVERSE`. """ if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): raise NotImplementedError( "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" ) image_width, image_height = self.size xmin, ymin, xmax, ymax = self._split_into_xyxy() if method == FLIP_LEFT_RIGHT: TO_REMOVE = 1 transposed_xmin = image_width - xmax - TO_REMOVE transposed_xmax = image_width - xmin - TO_REMOVE transposed_ymin = ymin transposed_ymax = ymax elif method == FLIP_TOP_BOTTOM: transposed_xmin = xmin transposed_xmax = xmax transposed_ymin = image_height - ymax transposed_ymax = image_height - ymin transposed_boxes = torch.cat( (transposed_xmin, transposed_ymin, transposed_xmax, transposed_ymax), dim=-1 ) bbox = BoxList(transposed_boxes, self.size, mode="xyxy") # bbox._copy_extra_fields(self) for k, v in self.extra_fields.items(): if not isinstance(v, torch.Tensor): v = v.transpose(method) bbox.add_field(k, v) return bbox.convert(self.mode) def crop(self, box): """ Crops a rectangular region from this bounding box. The box is a 4-tuple defining the left, upper, right, and lower pixel coordinate. """ xmin, ymin, xmax, ymax = self._split_into_xyxy() w, h = box[2] - box[0], box[3] - box[1] cropped_xmin = (xmin - box[0]).clamp(min=0, max=w) cropped_ymin = (ymin - box[1]).clamp(min=0, max=h) cropped_xmax = (xmax - box[0]).clamp(min=0, max=w) cropped_ymax = (ymax - box[1]).clamp(min=0, max=h) # TODO should I filter empty boxes here? if False: is_empty = (cropped_xmin == cropped_xmax) | (cropped_ymin == cropped_ymax) cropped_box = torch.cat( (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1 ) bbox = BoxList(cropped_box, (w, h), mode="xyxy") # bbox._copy_extra_fields(self) for k, v in self.extra_fields.items(): if not isinstance(v, torch.Tensor): v = v.crop(box) bbox.add_field(k, v) return bbox.convert(self.mode) # Tensor-like methods def to(self, device): bbox = BoxList(self.bbox.to(device), self.size, self.mode) for k, v in self.extra_fields.items(): if hasattr(v, "to"): v = v.to(device) bbox.add_field(k, v) return bbox def __getitem__(self, item): bbox = BoxList(self.bbox[item], self.size, self.mode) for k, v in self.extra_fields.items(): bbox.add_field(k, v[item]) return bbox def __len__(self): return self.bbox.shape[0] def clip_to_image(self, remove_empty=True): TO_REMOVE = 1 self.bbox[:, 0].clamp_(min=0, max=self.size[0] - TO_REMOVE) self.bbox[:, 1].clamp_(min=0, max=self.size[1] - TO_REMOVE) self.bbox[:, 2].clamp_(min=0, max=self.size[0] - TO_REMOVE) self.bbox[:, 3].clamp_(min=0, max=self.size[1] - TO_REMOVE) if remove_empty: box = self.bbox keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0]) return self[keep] return self def area(self): box = self.bbox if self.mode == "xyxy": TO_REMOVE = 1 area = (box[:, 2] - box[:, 0] + TO_REMOVE) * (box[:, 3] - box[:, 1] + TO_REMOVE) elif self.mode == "xywh": area = box[:, 2] * box[:, 3] else: raise RuntimeError("Should not be here") return area def copy_with_fields(self, fields, skip_missing=False): bbox = BoxList(self.bbox, self.size, self.mode) if not isinstance(fields, (list, tuple)): fields = [fields] for field in fields: if self.has_field(field): bbox.add_field(field, self.get_field(field)) elif not skip_missing: raise KeyError("Field '{}' not found in {}".format(field, self)) return bbox def __repr__(self): s = self.__class__.__name__ + "(" s += "num_boxes={}, ".format(len(self)) s += "image_width={}, ".format(self.size[0]) s += "image_height={}, ".format(self.size[1]) s += "mode={})".format(self.mode) return s if __name__ == "__main__": bbox = BoxList([[0, 0, 10, 10], [0, 0, 5, 5]], (10, 10)) s_bbox = bbox.resize((5, 5)) print(s_bbox) print(s_bbox.bbox) t_bbox = bbox.transpose(0) print(t_bbox) print(t_bbox.bbox) ================================================ FILE: maskrcnn_benchmark/structures/boxlist_ops.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from .bounding_box import BoxList from maskrcnn_benchmark.layers import nms as _box_nms def boxlist_nms(boxlist, nms_thresh, max_proposals=-1, score_field="scores"): """ Performs non-maximum suppression on a boxlist, with scores specified in a boxlist field via score_field. Arguments: boxlist(BoxList) nms_thresh (float) max_proposals (int): if > 0, then only the top max_proposals are kept after non-maximum suppression score_field (str) """ if nms_thresh <= 0: return boxlist mode = boxlist.mode boxlist = boxlist.convert("xyxy") boxes = boxlist.bbox score = boxlist.get_field(score_field) keep = _box_nms(boxes, score, nms_thresh) if max_proposals > 0: keep = keep[: max_proposals] boxlist = boxlist[keep] return boxlist.convert(mode) def remove_small_boxes(boxlist, min_size): """ Only keep boxes with both sides >= min_size Arguments: boxlist (Boxlist) min_size (int) """ # TODO maybe add an API for querying the ws / hs xywh_boxes = boxlist.convert("xywh").bbox _, _, ws, hs = xywh_boxes.unbind(dim=1) keep = ( (ws >= min_size) & (hs >= min_size) ).nonzero().squeeze(1) return boxlist[keep] # implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py # with slight modifications def boxlist_iou(boxlist1, boxlist2): """Compute the intersection over union of two set of boxes. The box order must be (xmin, ymin, xmax, ymax). Arguments: box1: (BoxList) bounding boxes, sized [N,4]. box2: (BoxList) bounding boxes, sized [M,4]. Returns: (tensor) iou, sized [N,M]. Reference: https://github.com/chainer/chainercv/blob/master/chainercv/utils/bbox/bbox_iou.py """ if boxlist1.size != boxlist2.size: raise RuntimeError( "boxlists should have same image size, got {}, {}".format(boxlist1, boxlist2)) boxlist1 = boxlist1.convert("xyxy") boxlist2 = boxlist2.convert("xyxy") N = len(boxlist1) M = len(boxlist2) area1 = boxlist1.area() area2 = boxlist2.area() box1, box2 = boxlist1.bbox, boxlist2.bbox lt = torch.max(box1[:, None, :2], box2[:, :2]) # [N,M,2] rb = torch.min(box1[:, None, 2:], box2[:, 2:]) # [N,M,2] TO_REMOVE = 1 wh = (rb - lt + TO_REMOVE).clamp(min=0) # [N,M,2] inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] iou = inter / (area1[:, None] + area2 - inter) return iou # TODO redundant, remove def _cat(tensors, dim=0): """ Efficient version of torch.cat that avoids a copy if there is only a single element in a list """ assert isinstance(tensors, (list, tuple)) if len(tensors) == 1: return tensors[0] return torch.cat(tensors, dim) def cat_boxlist(bboxes): """ Concatenates a list of BoxList (having the same image size) into a single BoxList Arguments: bboxes (list[BoxList]) """ assert isinstance(bboxes, (list, tuple)) assert all(isinstance(bbox, BoxList) for bbox in bboxes) size = bboxes[0].size assert all(bbox.size == size for bbox in bboxes) mode = bboxes[0].mode assert all(bbox.mode == mode for bbox in bboxes) fields = set(bboxes[0].fields()) assert all(set(bbox.fields()) == fields for bbox in bboxes) cat_boxes = BoxList(_cat([bbox.bbox for bbox in bboxes], dim=0), size, mode) for field in fields: data = _cat([bbox.get_field(field) for bbox in bboxes], dim=0) cat_boxes.add_field(field, data) return cat_boxes ================================================ FILE: maskrcnn_benchmark/structures/image_list.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from __future__ import division import torch class ImageList(object): """ Structure that holds a list of images (of possibly varying sizes) as a single tensor. This works by padding the images to the same size, and storing in a field the original sizes of each image """ def __init__(self, tensors, image_sizes): """ Arguments: tensors (tensor) image_sizes (list[tuple[int, int]]) """ self.tensors = tensors self.image_sizes = image_sizes def to(self, *args, **kwargs): cast_tensor = self.tensors.to(*args, **kwargs) return ImageList(cast_tensor, self.image_sizes) def to_image_list(tensors, size_divisible=0): """ tensors can be an ImageList, a torch.Tensor or an iterable of Tensors. It can't be a numpy array. When tensors is an iterable of Tensors, it pads the Tensors with zeros so that they have the same shape """ if isinstance(tensors, torch.Tensor) and size_divisible > 0: tensors = [tensors] if isinstance(tensors, ImageList): return tensors elif isinstance(tensors, torch.Tensor): # single tensor shape can be inferred if tensors.dim() == 3: tensors = tensors[None] assert tensors.dim() == 4 image_sizes = [tensor.shape[-2:] for tensor in tensors] return ImageList(tensors, image_sizes) elif isinstance(tensors, (tuple, list)): max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) # TODO Ideally, just remove this and let me model handle arbitrary # input sizs if size_divisible > 0: import math stride = size_divisible max_size = list(max_size) max_size[1] = int(math.ceil(max_size[1] / stride) * stride) max_size[2] = int(math.ceil(max_size[2] / stride) * stride) max_size = tuple(max_size) batch_shape = (len(tensors),) + max_size batched_imgs = tensors[0].new(*batch_shape).zero_() for img, pad_img in zip(tensors, batched_imgs): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) image_sizes = [im.shape[-2:] for im in tensors] return ImageList(batched_imgs, image_sizes) else: raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) ================================================ FILE: maskrcnn_benchmark/structures/keypoint.py ================================================ import torch # transpose FLIP_LEFT_RIGHT = 0 FLIP_TOP_BOTTOM = 1 class Keypoints(object): def __init__(self, keypoints, size, mode=None): # FIXME remove check once we have better integration with device # in my version this would consistently return a CPU tensor device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device('cpu') keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device) num_keypoints = keypoints.shape[0] if num_keypoints: keypoints = keypoints.view(num_keypoints, -1, 3) # TODO should I split them? # self.visibility = keypoints[..., 2] self.keypoints = keypoints# [..., :2] self.size = size self.mode = mode self.extra_fields = {} def crop(self, box): raise NotImplementedError() def resize(self, size, *args, **kwargs): ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) ratio_w, ratio_h = ratios resized_data = self.keypoints.clone() resized_data[..., 0] *= ratio_w resized_data[..., 1] *= ratio_h keypoints = type(self)(resized_data, size, self.mode) for k, v in self.extra_fields.items(): keypoints.add_field(k, v) return keypoints def transpose(self, method): if method not in (FLIP_LEFT_RIGHT,): raise NotImplementedError( "Only FLIP_LEFT_RIGHT implemented") flip_inds = type(self).FLIP_INDS flipped_data = self.keypoints[:, flip_inds] width = self.size[0] TO_REMOVE = 1 # Flip x coordinates flipped_data[..., 0] = width - flipped_data[..., 0] - TO_REMOVE # Maintain COCO convention that if visibility == 0, then x, y = 0 inds = flipped_data[..., 2] == 0 flipped_data[inds] = 0 keypoints = type(self)(flipped_data, self.size, self.mode) for k, v in self.extra_fields.items(): keypoints.add_field(k, v) return keypoints def to(self, *args, **kwargs): keypoints = type(self)(self.keypoints.to(*args, **kwargs), self.size, self.mode) for k, v in self.extra_fields.items(): if hasattr(v, "to"): v = v.to(*args, **kwargs) keypoints.add_field(k, v) return keypoints def __getitem__(self, item): keypoints = type(self)(self.keypoints[item], self.size, self.mode) for k, v in self.extra_fields.items(): keypoints.add_field(k, v[item]) return keypoints def add_field(self, field, field_data): self.extra_fields[field] = field_data def get_field(self, field): return self.extra_fields[field] def __repr__(self): s = self.__class__.__name__ + '(' s += 'num_instances={}, '.format(len(self.keypoints)) s += 'image_width={}, '.format(self.size[0]) s += 'image_height={})'.format(self.size[1]) return s def _create_flip_indices(names, flip_map): full_flip_map = flip_map.copy() full_flip_map.update({v: k for k, v in flip_map.items()}) flipped_names = [i if i not in full_flip_map else full_flip_map[i] for i in names] flip_indices = [names.index(i) for i in flipped_names] return torch.tensor(flip_indices) class PersonKeypoints(Keypoints): NAMES = [ 'nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear', 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', 'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', 'right_knee', 'left_ankle', 'right_ankle' ] FLIP_MAP = { 'left_eye': 'right_eye', 'left_ear': 'right_ear', 'left_shoulder': 'right_shoulder', 'left_elbow': 'right_elbow', 'left_wrist': 'right_wrist', 'left_hip': 'right_hip', 'left_knee': 'right_knee', 'left_ankle': 'right_ankle' } # TODO this doesn't look great PersonKeypoints.FLIP_INDS = _create_flip_indices(PersonKeypoints.NAMES, PersonKeypoints.FLIP_MAP) def kp_connections(keypoints): kp_lines = [ [keypoints.index('left_eye'), keypoints.index('right_eye')], [keypoints.index('left_eye'), keypoints.index('nose')], [keypoints.index('right_eye'), keypoints.index('nose')], [keypoints.index('right_eye'), keypoints.index('right_ear')], [keypoints.index('left_eye'), keypoints.index('left_ear')], [keypoints.index('right_shoulder'), keypoints.index('right_elbow')], [keypoints.index('right_elbow'), keypoints.index('right_wrist')], [keypoints.index('left_shoulder'), keypoints.index('left_elbow')], [keypoints.index('left_elbow'), keypoints.index('left_wrist')], [keypoints.index('right_hip'), keypoints.index('right_knee')], [keypoints.index('right_knee'), keypoints.index('right_ankle')], [keypoints.index('left_hip'), keypoints.index('left_knee')], [keypoints.index('left_knee'), keypoints.index('left_ankle')], [keypoints.index('right_shoulder'), keypoints.index('left_shoulder')], [keypoints.index('right_hip'), keypoints.index('left_hip')], ] return kp_lines PersonKeypoints.CONNECTIONS = kp_connections(PersonKeypoints.NAMES) # TODO make this nicer, this is a direct translation from C2 (but removing the inner loop) def keypoints_to_heat_map(keypoints, rois, heatmap_size): if rois.numel() == 0: return rois.new().long(), rois.new().long() offset_x = rois[:, 0] offset_y = rois[:, 1] scale_x = heatmap_size / (rois[:, 2] - rois[:, 0]) scale_y = heatmap_size / (rois[:, 3] - rois[:, 1]) offset_x = offset_x[:, None] offset_y = offset_y[:, None] scale_x = scale_x[:, None] scale_y = scale_y[:, None] x = keypoints[..., 0] y = keypoints[..., 1] x_boundary_inds = x == rois[:, 2][:, None] y_boundary_inds = y == rois[:, 3][:, None] x = (x - offset_x) * scale_x x = x.floor().long() y = (y - offset_y) * scale_y y = y.floor().long() x[x_boundary_inds] = heatmap_size - 1 y[y_boundary_inds] = heatmap_size - 1 valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size) vis = keypoints[..., 2] > 0 valid = (valid_loc & vis).long() lin_ind = y * heatmap_size + x heatmaps = lin_ind * valid return heatmaps, valid ================================================ FILE: maskrcnn_benchmark/structures/segmentation_mask.py ================================================ import cv2 import copy import torch import numpy as np from maskrcnn_benchmark.layers.misc import interpolate from maskrcnn_benchmark.utils import cv2_util import pycocotools.mask as mask_utils # transpose FLIP_LEFT_RIGHT = 0 FLIP_TOP_BOTTOM = 1 """ ABSTRACT Segmentations come in either: 1) Binary masks 2) Polygons Binary masks can be represented in a contiguous array and operations can be carried out more efficiently, therefore BinaryMaskList handles them together. Polygons are handled separately for each instance, by PolygonInstance and instances are handled by PolygonList. SegmentationList is supposed to represent both, therefore it wraps the functions of BinaryMaskList and PolygonList to make it transparent. """ class BinaryMaskList(object): """ This class handles binary masks for all objects in the image """ def __init__(self, masks, size): """ Arguments: masks: Either torch.tensor of [num_instances, H, W] or list of torch.tensors of [H, W] with num_instances elems, or RLE (Run Length Encoding) - interpreted as list of dicts, or BinaryMaskList. size: absolute image size, width first After initialization, a hard copy will be made, to leave the initializing source data intact. """ assert isinstance(size, (list, tuple)) assert len(size) == 2 if isinstance(masks, torch.Tensor): # The raw data representation is passed as argument masks = masks.clone() elif isinstance(masks, (list, tuple)): if len(masks) == 0: masks = torch.empty([0, size[1], size[0]]) # num_instances = 0! elif isinstance(masks[0], torch.Tensor): masks = torch.stack(masks, dim=0).clone() elif isinstance(masks[0], dict) and "counts" in masks[0]: if(isinstance(masks[0]["counts"], (list, tuple))): masks = mask_utils.frPyObjects(masks, size[1], size[0]) # RLE interpretation rle_sizes = [tuple(inst["size"]) for inst in masks] masks = mask_utils.decode(masks) # [h, w, n] masks = torch.tensor(masks).permute(2, 0, 1) # [n, h, w] assert rle_sizes.count(rle_sizes[0]) == len(rle_sizes), ( "All the sizes must be the same size: %s" % rle_sizes ) # in RLE, height come first in "size" rle_height, rle_width = rle_sizes[0] assert masks.shape[1] == rle_height assert masks.shape[2] == rle_width width, height = size if width != rle_width or height != rle_height: masks = interpolate( input=masks[None].float(), size=(height, width), mode="bilinear", align_corners=False, )[0].type_as(masks) else: RuntimeError( "Type of `masks[0]` could not be interpreted: %s" % type(masks) ) elif isinstance(masks, BinaryMaskList): # just hard copy the BinaryMaskList instance's underlying data masks = masks.masks.clone() else: RuntimeError( "Type of `masks` argument could not be interpreted:%s" % type(masks) ) if len(masks.shape) == 2: # if only a single instance mask is passed masks = masks[None] assert len(masks.shape) == 3 assert masks.shape[1] == size[1], "%s != %s" % (masks.shape[1], size[1]) assert masks.shape[2] == size[0], "%s != %s" % (masks.shape[2], size[0]) self.masks = masks self.size = tuple(size) def transpose(self, method): dim = 1 if method == FLIP_TOP_BOTTOM else 2 flipped_masks = self.masks.flip(dim) return BinaryMaskList(flipped_masks, self.size) def crop(self, box): assert isinstance(box, (list, tuple, torch.Tensor)), str(type(box)) # box is assumed to be xyxy current_width, current_height = self.size xmin, ymin, xmax, ymax = [round(float(b)) for b in box] assert xmin <= xmax and ymin <= ymax, str(box) xmin = min(max(xmin, 0), current_width - 1) ymin = min(max(ymin, 0), current_height - 1) xmax = min(max(xmax, 0), current_width) ymax = min(max(ymax, 0), current_height) xmax = max(xmax, xmin + 1) ymax = max(ymax, ymin + 1) width, height = xmax - xmin, ymax - ymin cropped_masks = self.masks[:, ymin:ymax, xmin:xmax] cropped_size = width, height return BinaryMaskList(cropped_masks, cropped_size) def resize(self, size): try: iter(size) except TypeError: assert isinstance(size, (int, float)) size = size, size width, height = map(int, size) assert width > 0 assert height > 0 # Height comes first here! resized_masks = interpolate( input=self.masks[None].float(), size=(height, width), mode="bilinear", align_corners=False, )[0].type_as(self.masks) resized_size = width, height return BinaryMaskList(resized_masks, resized_size) def convert_to_polygon(self): if self.masks.numel() == 0: return PolygonList([], self.size) contours = self._findContours() return PolygonList(contours, self.size) def to(self, *args, **kwargs): return self def _findContours(self): contours = [] masks = self.masks.detach().numpy() for mask in masks: mask = cv2.UMat(mask) contour, hierarchy = cv2_util.findContours( mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_L1 ) reshaped_contour = [] for entity in contour: assert len(entity.shape) == 3 assert ( entity.shape[1] == 1 ), "Hierarchical contours are not allowed" reshaped_contour.append(entity.reshape(-1).tolist()) contours.append(reshaped_contour) return contours def __len__(self): return len(self.masks) def __getitem__(self, index): if self.masks.numel() == 0: raise RuntimeError("Indexing empty BinaryMaskList") return BinaryMaskList(self.masks[index], self.size) def __iter__(self): return iter(self.masks) def __repr__(self): s = self.__class__.__name__ + "(" s += "num_instances={}, ".format(len(self.masks)) s += "image_width={}, ".format(self.size[0]) s += "image_height={})".format(self.size[1]) return s class PolygonInstance(object): """ This class holds a set of polygons that represents a single instance of an object mask. The object can be represented as a set of polygons """ def __init__(self, polygons, size): """ Arguments: a list of lists of numbers. The first level refers to all the polygons that compose the object, and the second level to the polygon coordinates. """ if isinstance(polygons, (list, tuple)): valid_polygons = [] for p in polygons: p = torch.as_tensor(p, dtype=torch.float32) if len(p) >= 6: # 3 * 2 coordinates valid_polygons.append(p) polygons = valid_polygons elif isinstance(polygons, PolygonInstance): polygons = copy.copy(polygons.polygons) else: RuntimeError( "Type of argument `polygons` is not allowed:%s" % (type(polygons)) ) """ This crashes the training way too many times... for p in polygons: assert p[::2].min() >= 0 assert p[::2].max() < size[0] assert p[1::2].min() >= 0 assert p[1::2].max() , size[1] """ self.polygons = polygons self.size = tuple(size) def transpose(self, method): if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): raise NotImplementedError( "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" ) flipped_polygons = [] width, height = self.size if method == FLIP_LEFT_RIGHT: dim = width idx = 0 elif method == FLIP_TOP_BOTTOM: dim = height idx = 1 for poly in self.polygons: p = poly.clone() TO_REMOVE = 1 p[idx::2] = dim - poly[idx::2] - TO_REMOVE flipped_polygons.append(p) return PolygonInstance(flipped_polygons, size=self.size) def crop(self, box): assert isinstance(box, (list, tuple, torch.Tensor)), str(type(box)) # box is assumed to be xyxy current_width, current_height = self.size xmin, ymin, xmax, ymax = map(float, box) assert xmin <= xmax and ymin <= ymax, str(box) xmin = min(max(xmin, 0), current_width - 1) ymin = min(max(ymin, 0), current_height - 1) xmax = min(max(xmax, 0), current_width) ymax = min(max(ymax, 0), current_height) xmax = max(xmax, xmin + 1) ymax = max(ymax, ymin + 1) w, h = xmax - xmin, ymax - ymin cropped_polygons = [] for poly in self.polygons: p = poly.clone() p[0::2] = p[0::2] - xmin # .clamp(min=0, max=w) p[1::2] = p[1::2] - ymin # .clamp(min=0, max=h) cropped_polygons.append(p) return PolygonInstance(cropped_polygons, size=(w, h)) def resize(self, size): try: iter(size) except TypeError: assert isinstance(size, (int, float)) size = size, size ratios = tuple( float(s) / float(s_orig) for s, s_orig in zip(size, self.size) ) if ratios[0] == ratios[1]: ratio = ratios[0] scaled_polys = [p * ratio for p in self.polygons] return PolygonInstance(scaled_polys, size) ratio_w, ratio_h = ratios scaled_polygons = [] for poly in self.polygons: p = poly.clone() p[0::2] *= ratio_w p[1::2] *= ratio_h scaled_polygons.append(p) return PolygonInstance(scaled_polygons, size=size) def convert_to_binarymask(self): width, height = self.size # formatting for COCO PythonAPI polygons = [p.numpy() for p in self.polygons] rles = mask_utils.frPyObjects(polygons, height, width) rle = mask_utils.merge(rles) mask = mask_utils.decode(rle) mask = torch.from_numpy(mask) return mask def __len__(self): return len(self.polygons) def __repr__(self): s = self.__class__.__name__ + "(" s += "num_groups={}, ".format(len(self.polygons)) s += "image_width={}, ".format(self.size[0]) s += "image_height={})".format(self.size[1]) return s class PolygonList(object): """ This class handles PolygonInstances for all objects in the image """ def __init__(self, polygons, size): """ Arguments: polygons: a list of list of lists of numbers. The first level of the list correspond to individual instances, the second level to all the polygons that compose the object, and the third level to the polygon coordinates. OR a list of PolygonInstances. OR a PolygonList size: absolute image size """ if isinstance(polygons, (list, tuple)): if len(polygons) == 0: polygons = [[[]]] if isinstance(polygons[0], (list, tuple)): assert isinstance(polygons[0][0], (list, tuple)), str( type(polygons[0][0]) ) else: assert isinstance(polygons[0], PolygonInstance), str( type(polygons[0]) ) elif isinstance(polygons, PolygonList): size = polygons.size polygons = polygons.polygons else: RuntimeError( "Type of argument `polygons` is not allowed:%s" % (type(polygons)) ) assert isinstance(size, (list, tuple)), str(type(size)) self.polygons = [] for p in polygons: p = PolygonInstance(p, size) if len(p) > 0: self.polygons.append(p) self.size = tuple(size) def transpose(self, method): if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): raise NotImplementedError( "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" ) flipped_polygons = [] for polygon in self.polygons: flipped_polygons.append(polygon.transpose(method)) return PolygonList(flipped_polygons, size=self.size) def crop(self, box): w, h = box[2] - box[0], box[3] - box[1] cropped_polygons = [] for polygon in self.polygons: cropped_polygons.append(polygon.crop(box)) cropped_size = w, h return PolygonList(cropped_polygons, cropped_size) def resize(self, size): resized_polygons = [] for polygon in self.polygons: resized_polygons.append(polygon.resize(size)) resized_size = size return PolygonList(resized_polygons, resized_size) def to(self, *args, **kwargs): return self def convert_to_binarymask(self): if len(self) > 0: masks = torch.stack( [p.convert_to_binarymask() for p in self.polygons] ) else: size = self.size masks = torch.empty([0, size[1], size[0]], dtype=torch.bool) return BinaryMaskList(masks, size=self.size) def __len__(self): return len(self.polygons) def __getitem__(self, item): if isinstance(item, int): selected_polygons = [self.polygons[item]] elif isinstance(item, slice): selected_polygons = self.polygons[item] else: # advanced indexing on a single dimension selected_polygons = [] if isinstance(item, torch.Tensor) and item.dtype == torch.bool: item = item.nonzero() item = item.squeeze(1) if item.numel() > 0 else item item = item.tolist() for i in item: selected_polygons.append(self.polygons[i]) return PolygonList(selected_polygons, size=self.size) def __iter__(self): return iter(self.polygons) def __repr__(self): s = self.__class__.__name__ + "(" s += "num_instances={}, ".format(len(self.polygons)) s += "image_width={}, ".format(self.size[0]) s += "image_height={})".format(self.size[1]) return s class SegmentationMask(object): """ This class stores the segmentations for all objects in the image. It wraps BinaryMaskList and PolygonList conveniently. """ def __init__(self, instances, size, mode="poly"): """ Arguments: instances: two types (1) polygon (2) binary mask size: (width, height) mode: 'poly', 'mask'. if mode is 'mask', convert mask of any format to binary mask """ assert isinstance(size, (list, tuple)) assert len(size) == 2 if isinstance(size[0], torch.Tensor): assert isinstance(size[1], torch.Tensor) size = size[0].item(), size[1].item() assert isinstance(size[0], (int, float)) assert isinstance(size[1], (int, float)) if mode == "poly": self.instances = PolygonList(instances, size) elif mode == "mask": self.instances = BinaryMaskList(instances, size) else: raise NotImplementedError("Unknown mode: %s" % str(mode)) self.mode = mode self.size = tuple(size) def transpose(self, method): flipped_instances = self.instances.transpose(method) return SegmentationMask(flipped_instances, self.size, self.mode) def crop(self, box): cropped_instances = self.instances.crop(box) cropped_size = cropped_instances.size return SegmentationMask(cropped_instances, cropped_size, self.mode) def resize(self, size, *args, **kwargs): resized_instances = self.instances.resize(size) resized_size = size return SegmentationMask(resized_instances, resized_size, self.mode) def to(self, *args, **kwargs): return self def convert(self, mode): if mode == self.mode: return self if mode == "poly": converted_instances = self.instances.convert_to_polygon() elif mode == "mask": converted_instances = self.instances.convert_to_binarymask() else: raise NotImplementedError("Unknown mode: %s" % str(mode)) return SegmentationMask(converted_instances, self.size, mode) def get_mask_tensor(self): instances = self.instances if self.mode == "poly": instances = instances.convert_to_binarymask() # If there is only 1 instance return instances.masks.squeeze(0) def __len__(self): return len(self.instances) def __getitem__(self, item): selected_instances = self.instances.__getitem__(item) return SegmentationMask(selected_instances, self.size, self.mode) def __iter__(self): self.iter_idx = 0 return self def __next__(self): if self.iter_idx < self.__len__(): next_segmentation = self.__getitem__(self.iter_idx) self.iter_idx += 1 return next_segmentation raise StopIteration() next = __next__ # Python 2 compatibility def __repr__(self): s = self.__class__.__name__ + "(" s += "num_instances={}, ".format(len(self.instances)) s += "image_width={}, ".format(self.size[0]) s += "image_height={}, ".format(self.size[1]) s += "mode={})".format(self.mode) return s ================================================ FILE: maskrcnn_benchmark/utils/README.md ================================================ # Utility functions This folder contain utility functions that are not used in the core library, but are useful for building models or training code using the config system. ================================================ FILE: maskrcnn_benchmark/utils/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/utils/c2_model_loading.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import logging import pickle from collections import OrderedDict import torch from maskrcnn_benchmark.utils.model_serialization import load_state_dict from maskrcnn_benchmark.utils.registry import Registry def _rename_basic_resnet_weights(layer_keys): layer_keys = [k.replace("_", ".") for k in layer_keys] layer_keys = [k.replace(".w", ".weight") for k in layer_keys] layer_keys = [k.replace(".bn", "_bn") for k in layer_keys] layer_keys = [k.replace(".b", ".bias") for k in layer_keys] layer_keys = [k.replace("_bn.s", "_bn.scale") for k in layer_keys] layer_keys = [k.replace(".biasranch", ".branch") for k in layer_keys] layer_keys = [k.replace("bbox.pred", "bbox_pred") for k in layer_keys] layer_keys = [k.replace("cls.score", "cls_score") for k in layer_keys] layer_keys = [k.replace("res.conv1_", "conv1_") for k in layer_keys] # RPN / Faster RCNN layer_keys = [k.replace(".biasbox", ".bbox") for k in layer_keys] layer_keys = [k.replace("conv.rpn", "rpn.conv") for k in layer_keys] layer_keys = [k.replace("rpn.bbox.pred", "rpn.bbox_pred") for k in layer_keys] layer_keys = [k.replace("rpn.cls.logits", "rpn.cls_logits") for k in layer_keys] # Affine-Channel -> BatchNorm enaming layer_keys = [k.replace("_bn.scale", "_bn.weight") for k in layer_keys] # Make torchvision-compatible layer_keys = [k.replace("conv1_bn.", "bn1.") for k in layer_keys] layer_keys = [k.replace("res2.", "layer1.") for k in layer_keys] layer_keys = [k.replace("res3.", "layer2.") for k in layer_keys] layer_keys = [k.replace("res4.", "layer3.") for k in layer_keys] layer_keys = [k.replace("res5.", "layer4.") for k in layer_keys] layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys] layer_keys = [k.replace(".branch2a_bn.", ".bn1.") for k in layer_keys] layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys] layer_keys = [k.replace(".branch2b_bn.", ".bn2.") for k in layer_keys] layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys] layer_keys = [k.replace(".branch2c_bn.", ".bn3.") for k in layer_keys] layer_keys = [k.replace(".branch1.", ".downsample.0.") for k in layer_keys] layer_keys = [k.replace(".branch1_bn.", ".downsample.1.") for k in layer_keys] # GroupNorm layer_keys = [k.replace("conv1.gn.s", "bn1.weight") for k in layer_keys] layer_keys = [k.replace("conv1.gn.bias", "bn1.bias") for k in layer_keys] layer_keys = [k.replace("conv2.gn.s", "bn2.weight") for k in layer_keys] layer_keys = [k.replace("conv2.gn.bias", "bn2.bias") for k in layer_keys] layer_keys = [k.replace("conv3.gn.s", "bn3.weight") for k in layer_keys] layer_keys = [k.replace("conv3.gn.bias", "bn3.bias") for k in layer_keys] layer_keys = [k.replace("downsample.0.gn.s", "downsample.1.weight") \ for k in layer_keys] layer_keys = [k.replace("downsample.0.gn.bias", "downsample.1.bias") \ for k in layer_keys] return layer_keys def _rename_fpn_weights(layer_keys, stage_names): for mapped_idx, stage_name in enumerate(stage_names, 1): suffix = "" if mapped_idx < 4: suffix = ".lateral" layer_keys = [ k.replace("fpn.inner.layer{}.sum{}".format(stage_name, suffix), "fpn_inner{}".format(mapped_idx)) for k in layer_keys ] layer_keys = [k.replace("fpn.layer{}.sum".format(stage_name), "fpn_layer{}".format(mapped_idx)) for k in layer_keys] layer_keys = [k.replace("rpn.conv.fpn2", "rpn.conv") for k in layer_keys] layer_keys = [k.replace("rpn.bbox_pred.fpn2", "rpn.bbox_pred") for k in layer_keys] layer_keys = [ k.replace("rpn.cls_logits.fpn2", "rpn.cls_logits") for k in layer_keys ] return layer_keys def _rename_weights_for_resnet(weights, stage_names): original_keys = sorted(weights.keys()) layer_keys = sorted(weights.keys()) # for X-101, rename output to fc1000 to avoid conflicts afterwards layer_keys = [k if k != "pred_b" else "fc1000_b" for k in layer_keys] layer_keys = [k if k != "pred_w" else "fc1000_w" for k in layer_keys] # performs basic renaming: _ -> . , etc layer_keys = _rename_basic_resnet_weights(layer_keys) # FPN layer_keys = _rename_fpn_weights(layer_keys, stage_names) # Mask R-CNN layer_keys = [k.replace("mask.fcn.logits", "mask_fcn_logits") for k in layer_keys] layer_keys = [k.replace(".[mask].fcn", "mask_fcn") for k in layer_keys] layer_keys = [k.replace("conv5.mask", "conv5_mask") for k in layer_keys] # Keypoint R-CNN layer_keys = [k.replace("kps.score.lowres", "kps_score_lowres") for k in layer_keys] layer_keys = [k.replace("kps.score", "kps_score") for k in layer_keys] layer_keys = [k.replace("conv.fcn", "conv_fcn") for k in layer_keys] # Rename for our RPN structure layer_keys = [k.replace("rpn.", "rpn.head.") for k in layer_keys] key_map = {k: v for k, v in zip(original_keys, layer_keys)} logger = logging.getLogger(__name__) logger.info("Remapping C2 weights") max_c2_key_size = max([len(k) for k in original_keys if "_momentum" not in k]) new_weights = OrderedDict() for k in original_keys: v = weights[k] if "_momentum" in k: continue # if 'fc1000' in k: # continue w = torch.from_numpy(v) # if "bn" in k: # w = w.view(1, -1, 1, 1) logger.info("C2 name: {: <{}} mapped name: {}".format(k, max_c2_key_size, key_map[k])) new_weights[key_map[k]] = w return new_weights def _load_c2_pickled_weights(file_path): with open(file_path, "rb") as f: if torch._six.PY3: data = pickle.load(f, encoding="latin1") else: data = pickle.load(f) if "blobs" in data: weights = data["blobs"] else: weights = data return weights def _rename_conv_weights_for_deformable_conv_layers(state_dict, cfg): import re logger = logging.getLogger(__name__) logger.info("Remapping conv weights for deformable conv weights") layer_keys = sorted(state_dict.keys()) for ix, stage_with_dcn in enumerate(cfg.MODEL.RESNETS.STAGE_WITH_DCN, 1): if not stage_with_dcn: continue for old_key in layer_keys: pattern = ".*layer{}.*conv2.*".format(ix) r = re.match(pattern, old_key) if r is None: continue for param in ["weight", "bias"]: if old_key.find(param) is -1: continue new_key = old_key.replace( "conv2.{}".format(param), "conv2.conv.{}".format(param) ) logger.info("pattern: {}, old_key: {}, new_key: {}".format( pattern, old_key, new_key )) state_dict[new_key] = state_dict[old_key] del state_dict[old_key] return state_dict _C2_STAGE_NAMES = { "R-50": ["1.2", "2.3", "3.5", "4.2"], "R-101": ["1.2", "2.3", "3.22", "4.2"], "R-152": ["1.2", "2.7", "3.35", "4.2"], } C2_FORMAT_LOADER = Registry() @C2_FORMAT_LOADER.register("R-50-C4") @C2_FORMAT_LOADER.register("R-50-C5") @C2_FORMAT_LOADER.register("R-101-C4") @C2_FORMAT_LOADER.register("R-101-C5") @C2_FORMAT_LOADER.register("R-50-FPN") @C2_FORMAT_LOADER.register("R-50-FPN-RETINANET") @C2_FORMAT_LOADER.register("R-101-FPN") @C2_FORMAT_LOADER.register("R-101-FPN-RETINANET") @C2_FORMAT_LOADER.register("R-152-FPN") def load_resnet_c2_format(cfg, f): state_dict = _load_c2_pickled_weights(f) conv_body = cfg.MODEL.BACKBONE.CONV_BODY arch = conv_body.replace("-C4", "").replace("-C5", "").replace("-FPN", "") arch = arch.replace("-RETINANET", "") stages = _C2_STAGE_NAMES[arch] state_dict = _rename_weights_for_resnet(state_dict, stages) # *********************************** # for deformable convolutional layer state_dict = _rename_conv_weights_for_deformable_conv_layers(state_dict, cfg) # *********************************** return dict(model=state_dict) def load_c2_format(cfg, f): return C2_FORMAT_LOADER[cfg.MODEL.BACKBONE.CONV_BODY](cfg, f) ================================================ FILE: maskrcnn_benchmark/utils/checkpoint.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import logging import os import torch from maskrcnn_benchmark.utils.model_serialization import load_state_dict from maskrcnn_benchmark.utils.c2_model_loading import load_c2_format from maskrcnn_benchmark.utils.imports import import_file from maskrcnn_benchmark.utils.model_zoo import cache_url class Checkpointer(object): def __init__( self, model, optimizer=None, scheduler=None, save_dir="", save_to_disk=None, logger=None, ): self.model = model self.optimizer = optimizer self.scheduler = scheduler self.save_dir = save_dir self.save_to_disk = save_to_disk if logger is None: logger = logging.getLogger(__name__) self.logger = logger def save(self, name, **kwargs): if not self.save_dir: return if not self.save_to_disk: return data = {} data["model"] = self.model.state_dict() if self.optimizer is not None: data["optimizer"] = self.optimizer.state_dict() if self.scheduler is not None: data["scheduler"] = self.scheduler.state_dict() data.update(kwargs) save_file = os.path.join(self.save_dir, "{}.pth".format(name)) self.logger.info("Saving checkpoint to {}".format(save_file)) torch.save(data, save_file) self.tag_last_checkpoint(save_file) def load(self, f=None, use_latest=True): if self.has_checkpoint() and use_latest: # override argument with existing checkpoint f = self.get_checkpoint_file() if not f: # no checkpoint could be found self.logger.info("No checkpoint found. Initializing model from scratch") return {} self.logger.info("Loading checkpoint from {}".format(f)) checkpoint = self._load_file(f) self._load_model(checkpoint) if "optimizer" in checkpoint and self.optimizer: self.logger.info("Loading optimizer from {}".format(f)) self.optimizer.load_state_dict(checkpoint.pop("optimizer")) if "scheduler" in checkpoint and self.scheduler: self.logger.info("Loading scheduler from {}".format(f)) self.scheduler.load_state_dict(checkpoint.pop("scheduler")) # return any further checkpoint data return checkpoint def has_checkpoint(self): save_file = os.path.join(self.save_dir, "last_checkpoint") return os.path.exists(save_file) def get_checkpoint_file(self): save_file = os.path.join(self.save_dir, "last_checkpoint") try: with open(save_file, "r") as f: last_saved = f.read() last_saved = last_saved.strip() except IOError: # if file doesn't exist, maybe because it has just been # deleted by a separate process last_saved = "" return last_saved def tag_last_checkpoint(self, last_filename): save_file = os.path.join(self.save_dir, "last_checkpoint") with open(save_file, "w") as f: f.write(last_filename) def _load_file(self, f): return torch.load(f, map_location=torch.device("cpu")) def _load_model(self, checkpoint): load_state_dict(self.model, checkpoint.pop("model")) class DetectronCheckpointer(Checkpointer): def __init__( self, cfg, model, optimizer=None, scheduler=None, save_dir="", save_to_disk=None, logger=None, ): super(DetectronCheckpointer, self).__init__( model, optimizer, scheduler, save_dir, save_to_disk, logger ) self.cfg = cfg.clone() def _load_file(self, f): # catalog lookup if f.startswith("catalog://"): paths_catalog = import_file( "maskrcnn_benchmark.config.paths_catalog", self.cfg.PATHS_CATALOG, True ) catalog_f = paths_catalog.ModelCatalog.get(f[len("catalog://") :]) self.logger.info("{} points to {}".format(f, catalog_f)) f = catalog_f # download url files if f.startswith("http"): # if the file is a url path, download it and cache it cached_f = cache_url(f) self.logger.info("url {} cached in {}".format(f, cached_f)) f = cached_f # convert Caffe2 checkpoint from pkl if f.endswith(".pkl"): return load_c2_format(self.cfg, f) # load native detectron.pytorch checkpoint loaded = super(DetectronCheckpointer, self)._load_file(f) if "model" not in loaded: loaded = dict(model=loaded) return loaded ================================================ FILE: maskrcnn_benchmark/utils/collect_env.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import PIL from torch.utils.collect_env import get_pretty_env_info def get_pil_version(): return "\n Pillow ({})".format(PIL.__version__) def collect_env_info(): env_str = get_pretty_env_info() env_str += get_pil_version() return env_str ================================================ FILE: maskrcnn_benchmark/utils/comm.py ================================================ """ This file contains primitives for multi-gpu communication. This is useful when doing distributed training. """ import pickle import time import torch import torch.distributed as dist def get_world_size(): if not dist.is_available(): return 1 if not dist.is_initialized(): return 1 return dist.get_world_size() def get_rank(): if not dist.is_available(): return 0 if not dist.is_initialized(): return 0 return dist.get_rank() def is_main_process(): return get_rank() == 0 def synchronize(): """ Helper function to synchronize (barrier) among all processes when using distributed training """ if not dist.is_available(): return if not dist.is_initialized(): return world_size = dist.get_world_size() if world_size == 1: return dist.barrier() def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.LongTensor([tensor.numel()]).to("cuda") size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) if local_size != max_size: padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list def reduce_dict(input_dict, average=True): """ Args: input_dict (dict): all the values will be reduced average (bool): whether to do average or sum Reduce the values in the dictionary from all processes so that process with rank 0 has the averaged results. Returns a dict with the same fields as input_dict, after reduction. """ world_size = get_world_size() if world_size < 2: return input_dict with torch.no_grad(): names = [] values = [] # sort the keys so that they are consistent across processes for k in sorted(input_dict.keys()): names.append(k) values.append(input_dict[k]) values = torch.stack(values, dim=0) dist.reduce(values, dst=0) if dist.get_rank() == 0 and average: # only main process gets accumulated, so only divide by # world_size in this case values /= world_size reduced_dict = {k: v for k, v in zip(names, values)} return reduced_dict ================================================ FILE: maskrcnn_benchmark/utils/cv2_util.py ================================================ """ Module for cv2 utility functions and maintaining version compatibility between 3.x and 4.x """ import cv2 def findContours(*args, **kwargs): """ Wraps cv2.findContours to maintain compatiblity between versions 3 and 4 Returns: contours, hierarchy """ if cv2.__version__.startswith('4'): contours, hierarchy = cv2.findContours(*args, **kwargs) elif cv2.__version__.startswith('3'): _, contours, hierarchy = cv2.findContours(*args, **kwargs) else: raise AssertionError( 'cv2 must be either version 3 or 4 to call this method') return contours, hierarchy ================================================ FILE: maskrcnn_benchmark/utils/env.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import os from maskrcnn_benchmark.utils.imports import import_file def setup_environment(): """Perform environment setup work. The default setup is a no-op, but this function allows the user to specify a Python source file that performs custom setup work that may be necessary to their computing environment. """ custom_module_path = os.environ.get("TORCH_DETECTRON_ENV_MODULE") if custom_module_path: setup_custom_environment(custom_module_path) else: # The default setup is a no-op pass def setup_custom_environment(custom_module_path): """Load custom environment setup from a Python source file and run the setup function. """ module = import_file("maskrcnn_benchmark.utils.env.custom_module", custom_module_path) assert hasattr(module, "setup_environment") and callable( module.setup_environment ), ( "Custom environment module defined in {} does not have the " "required callable attribute 'setup_environment'." ).format( custom_module_path ) module.setup_environment() # Force environment setup when this module is imported setup_environment() ================================================ FILE: maskrcnn_benchmark/utils/imports.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch if torch._six.PY3: import importlib import importlib.util import sys # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa def import_file(module_name, file_path, make_importable=False): spec = importlib.util.spec_from_file_location(module_name, file_path) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) if make_importable: sys.modules[module_name] = module return module else: import imp def import_file(module_name, file_path, make_importable=None): module = imp.load_source(module_name, file_path) return module ================================================ FILE: maskrcnn_benchmark/utils/logger.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import logging import os import sys def setup_logger(name, save_dir, distributed_rank, filename="log.txt"): logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) # don't log results for the non-master process if distributed_rank > 0: return logger ch = logging.StreamHandler(stream=sys.stdout) ch.setLevel(logging.DEBUG) formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") ch.setFormatter(formatter) logger.addHandler(ch) if save_dir: fh = logging.FileHandler(os.path.join(save_dir, filename)) fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) logger.addHandler(fh) return logger ================================================ FILE: maskrcnn_benchmark/utils/metric_logger.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from collections import defaultdict from collections import deque import torch class SmoothedValue(object): """Track a series of values and provide access to smoothed values over a window or the global series average. """ def __init__(self, window_size=20): self.deque = deque(maxlen=window_size) self.series = [] self.total = 0.0 self.count = 0 def update(self, value): self.deque.append(value) self.series.append(value) self.count += 1 self.total += value @property def median(self): d = torch.tensor(list(self.deque)) return d.median().item() @property def avg(self): d = torch.tensor(list(self.deque)) return d.mean().item() @property def global_avg(self): return self.total / self.count class MetricLogger(object): def __init__(self, delimiter="\t"): self.meters = defaultdict(SmoothedValue) self.delimiter = delimiter def update(self, **kwargs): for k, v in kwargs.items(): if isinstance(v, torch.Tensor): v = v.item() assert isinstance(v, (float, int)) self.meters[k].update(v) def __getattr__(self, attr): if attr in self.meters: return self.meters[attr] if attr in self.__dict__: return self.__dict__[attr] raise AttributeError("'{}' object has no attribute '{}'".format( type(self).__name__, attr)) def __str__(self): loss_str = [] for name, meter in self.meters.items(): loss_str.append( "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) ) return self.delimiter.join(loss_str) ================================================ FILE: maskrcnn_benchmark/utils/miscellaneous.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import errno import json import logging import os from .comm import is_main_process def mkdir(path): try: os.makedirs(path) except OSError as e: if e.errno != errno.EEXIST: raise def save_labels(dataset_list, output_dir): if is_main_process(): logger = logging.getLogger(__name__) ids_to_labels = {} for dataset in dataset_list: if hasattr(dataset, 'categories'): ids_to_labels.update(dataset.categories) else: logger.warning("Dataset [{}] has no categories attribute, labels.json file won't be created".format( dataset.__class__.__name__)) if ids_to_labels: labels_file = os.path.join(output_dir, 'labels.json') logger.info("Saving labels mapping into {}".format(labels_file)) with open(labels_file, 'w') as f: json.dump(ids_to_labels, f, indent=2) def save_config(cfg, path): if is_main_process(): with open(path, 'w') as f: f.write(cfg.dump()) ================================================ FILE: maskrcnn_benchmark/utils/model_serialization.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from collections import OrderedDict import logging import torch from maskrcnn_benchmark.utils.imports import import_file def align_and_update_state_dicts(model_state_dict, loaded_state_dict): """ Strategy: suppose that the models that we will create will have prefixes appended to each of its keys, for example due to an extra level of nesting that the original pre-trained weights from ImageNet won't contain. For example, model.state_dict() might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains res2.conv1.weight. We thus want to match both parameters together. For that, we look for each model weight, look among all loaded keys if there is one that is a suffix of the current weight name, and use it if that's the case. If multiple matches exist, take the one with longest size of the corresponding name. For example, for the same model as before, the pretrained weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, we want to match backbone[0].body.conv1.weight to conv1.weight, and backbone[0].body.res2.conv1.weight to res2.conv1.weight. """ current_keys = sorted(list(model_state_dict.keys())) loaded_keys = sorted(list(loaded_state_dict.keys())) # get a matrix of string matches, where each (i, j) entry correspond to the size of the # loaded_key string, if it matches match_matrix = [ len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys ] match_matrix = torch.as_tensor(match_matrix).view( len(current_keys), len(loaded_keys) ) max_match_size, idxs = match_matrix.max(1) # remove indices that correspond to no-match idxs[max_match_size == 0] = -1 # used for logging max_size = max([len(key) for key in current_keys]) if current_keys else 1 max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1 log_str_template = "{: <{}} loaded from {: <{}} of shape {}" logger = logging.getLogger(__name__) for idx_new, idx_old in enumerate(idxs.tolist()): if idx_old == -1: continue key = current_keys[idx_new] key_old = loaded_keys[idx_old] model_state_dict[key] = loaded_state_dict[key_old] logger.info( log_str_template.format( key, max_size, key_old, max_size_loaded, tuple(loaded_state_dict[key_old].shape), ) ) def strip_prefix_if_present(state_dict, prefix): keys = sorted(state_dict.keys()) if not all(key.startswith(prefix) for key in keys): return state_dict stripped_state_dict = OrderedDict() for key, value in state_dict.items(): stripped_state_dict[key.replace(prefix, "")] = value return stripped_state_dict def load_state_dict(model, loaded_state_dict): model_state_dict = model.state_dict() # if the state_dict comes from a model that was wrapped in a # DataParallel or DistributedDataParallel during serialization, # remove the "module" prefix before performing the matching loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.") align_and_update_state_dicts(model_state_dict, loaded_state_dict) # use strict loading model.load_state_dict(model_state_dict) ================================================ FILE: maskrcnn_benchmark/utils/model_zoo.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import os import sys try: from torch.hub import _download_url_to_file from torch.hub import urlparse from torch.hub import HASH_REGEX except ImportError: from torch.utils.model_zoo import _download_url_to_file from torch.utils.model_zoo import urlparse from torch.utils.model_zoo import HASH_REGEX from maskrcnn_benchmark.utils.comm import is_main_process from maskrcnn_benchmark.utils.comm import synchronize # very similar to https://github.com/pytorch/pytorch/blob/master/torch/utils/model_zoo.py # but with a few improvements and modifications def cache_url(url, model_dir=None, progress=True): r"""Loads the Torch serialized object at the given URL. If the object is already present in `model_dir`, it's deserialized and returned. The filename part of the URL should follow the naming convention ``filename-.ext`` where ```` is the first eight or more digits of the SHA256 hash of the contents of the file. The hash is used to ensure unique names and to verify the contents of the file. The default value of `model_dir` is ``$TORCH_HOME/models`` where ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be overridden with the ``$TORCH_MODEL_ZOO`` environment variable. Args: url (string): URL of the object to download model_dir (string, optional): directory in which to save the object progress (bool, optional): whether or not to display a progress bar to stderr Example: >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') """ if model_dir is None: torch_home = os.path.expanduser(os.getenv("TORCH_HOME", "~/.torch")) model_dir = os.getenv("TORCH_MODEL_ZOO", os.path.join(torch_home, "models")) if not os.path.exists(model_dir): os.makedirs(model_dir) parts = urlparse(url) filename = os.path.basename(parts.path) if filename == "model_final.pkl": # workaround as pre-trained Caffe2 models from Detectron have all the same filename # so make the full path the filename by replacing / with _ filename = parts.path.replace("/", "_") cached_file = os.path.join(model_dir, filename) if not os.path.exists(cached_file) and is_main_process(): sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) hash_prefix = HASH_REGEX.search(filename) if hash_prefix is not None: hash_prefix = hash_prefix.group(1) # workaround: Caffe2 models don't have a hash, but follow the R-50 convention, # which matches the hash PyTorch uses. So we skip the hash matching # if the hash_prefix is less than 6 characters if len(hash_prefix) < 6: hash_prefix = None _download_url_to_file(url, cached_file, hash_prefix, progress=progress) synchronize() return cached_file ================================================ FILE: maskrcnn_benchmark/utils/registry.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. def _register_generic(module_dict, module_name, module): assert module_name not in module_dict module_dict[module_name] = module class Registry(dict): ''' A helper class for managing registering modules, it extends a dictionary and provides a register functions. Eg. creeting a registry: some_registry = Registry({"default": default_module}) There're two ways of registering new modules: 1): normal way is just calling register function: def foo(): ... some_registry.register("foo_module", foo) 2): used as decorator when declaring the module: @some_registry.register("foo_module") @some_registry.register("foo_modeul_nickname") def foo(): ... Access of module is just like using a dictionary, eg: f = some_registry["foo_modeul"] ''' def __init__(self, *args, **kwargs): super(Registry, self).__init__(*args, **kwargs) def register(self, module_name, module=None): # used as function call if module is not None: _register_generic(self, module_name, module) return # used as decorator def register_fn(fn): _register_generic(self, module_name, fn) return fn return register_fn ================================================ FILE: maskrcnn_benchmark/utils/timer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import time import datetime class Timer(object): def __init__(self): self.reset() @property def average_time(self): return self.total_time / self.calls if self.calls > 0 else 0.0 def tic(self): # using time.time instead of time.clock because time time.clock # does not normalize for multithreading self.start_time = time.time() def toc(self, average=True): self.add(time.time() - self.start_time) if average: return self.average_time else: return self.diff def add(self, time_diff): self.diff = time_diff self.total_time += self.diff self.calls += 1 def reset(self): self.total_time = 0.0 self.calls = 0 self.start_time = 0.0 self.diff = 0.0 def avg_time_str(self): time_str = str(datetime.timedelta(seconds=self.average_time)) return time_str def get_time_str(time_diff): time_str = str(datetime.timedelta(seconds=time_diff)) return time_str ================================================ FILE: requirements.txt ================================================ ninja yacs cython matplotlib tqdm ================================================ FILE: setup.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #!/usr/bin/env python import glob import os import torch from setuptools import find_packages from setuptools import setup from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CppExtension from torch.utils.cpp_extension import CUDAExtension requirements = ["torch", "torchvision"] def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "maskrcnn_benchmark", "csrc") main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) sources = main_file + source_cpu extension = CppExtension extra_compile_args = {"cxx": []} define_macros = [] if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": extension = CUDAExtension sources += source_cuda define_macros += [("WITH_CUDA", None)] extra_compile_args["nvcc"] = [ "-DCUDA_HAS_FP16=1", "-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__", "-D__CUDA_NO_HALF2_OPERATORS__", ] sources = [os.path.join(extensions_dir, s) for s in sources] include_dirs = [extensions_dir] ext_modules = [ extension( "maskrcnn_benchmark._C", sources, include_dirs=include_dirs, define_macros=define_macros, extra_compile_args=extra_compile_args, ) ] return ext_modules setup( name="maskrcnn_benchmark", version="0.1", author="fmassa", url="https://github.com/facebookresearch/maskrcnn-benchmark", description="object detection in pytorch", packages=find_packages(exclude=("configs", "tests",)), # install_requires=requirements, ext_modules=get_extensions(), cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, ) ================================================ FILE: tests/checkpoint.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from collections import OrderedDict import os from tempfile import TemporaryDirectory import unittest import torch from torch import nn from maskrcnn_benchmark.utils.model_serialization import load_state_dict from maskrcnn_benchmark.utils.checkpoint import Checkpointer class TestCheckpointer(unittest.TestCase): def create_model(self): return nn.Sequential(nn.Linear(2, 3), nn.Linear(3, 1)) def create_complex_model(self): m = nn.Module() m.block1 = nn.Module() m.block1.layer1 = nn.Linear(2, 3) m.layer2 = nn.Linear(3, 2) m.res = nn.Module() m.res.layer2 = nn.Linear(3, 2) state_dict = OrderedDict() state_dict["layer1.weight"] = torch.rand(3, 2) state_dict["layer1.bias"] = torch.rand(3) state_dict["layer2.weight"] = torch.rand(2, 3) state_dict["layer2.bias"] = torch.rand(2) state_dict["res.layer2.weight"] = torch.rand(2, 3) state_dict["res.layer2.bias"] = torch.rand(2) return m, state_dict def test_from_last_checkpoint_model(self): # test that loading works even if they differ by a prefix for trained_model, fresh_model in [ (self.create_model(), self.create_model()), (nn.DataParallel(self.create_model()), self.create_model()), (self.create_model(), nn.DataParallel(self.create_model())), ( nn.DataParallel(self.create_model()), nn.DataParallel(self.create_model()), ), ]: with TemporaryDirectory() as f: checkpointer = Checkpointer( trained_model, save_dir=f, save_to_disk=True ) checkpointer.save("checkpoint_file") # in the same folder fresh_checkpointer = Checkpointer(fresh_model, save_dir=f) self.assertTrue(fresh_checkpointer.has_checkpoint()) self.assertEqual( fresh_checkpointer.get_checkpoint_file(), os.path.join(f, "checkpoint_file.pth"), ) _ = fresh_checkpointer.load() for trained_p, loaded_p in zip( trained_model.parameters(), fresh_model.parameters() ): # different tensor references self.assertFalse(id(trained_p) == id(loaded_p)) # same content self.assertTrue(trained_p.equal(loaded_p)) def test_from_name_file_model(self): # test that loading works even if they differ by a prefix for trained_model, fresh_model in [ (self.create_model(), self.create_model()), (nn.DataParallel(self.create_model()), self.create_model()), (self.create_model(), nn.DataParallel(self.create_model())), ( nn.DataParallel(self.create_model()), nn.DataParallel(self.create_model()), ), ]: with TemporaryDirectory() as f: checkpointer = Checkpointer( trained_model, save_dir=f, save_to_disk=True ) checkpointer.save("checkpoint_file") # on different folders with TemporaryDirectory() as g: fresh_checkpointer = Checkpointer(fresh_model, save_dir=g) self.assertFalse(fresh_checkpointer.has_checkpoint()) self.assertEqual(fresh_checkpointer.get_checkpoint_file(), "") _ = fresh_checkpointer.load(os.path.join(f, "checkpoint_file.pth")) for trained_p, loaded_p in zip( trained_model.parameters(), fresh_model.parameters() ): # different tensor references self.assertFalse(id(trained_p) == id(loaded_p)) # same content self.assertTrue(trained_p.equal(loaded_p)) def test_complex_model_loaded(self): for add_data_parallel in [False, True]: model, state_dict = self.create_complex_model() if add_data_parallel: model = nn.DataParallel(model) load_state_dict(model, state_dict) for loaded, stored in zip(model.state_dict().values(), state_dict.values()): # different tensor references self.assertFalse(id(loaded) == id(stored)) # same content self.assertTrue(loaded.equal(stored)) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/env_tests/env.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import os def get_config_root_path(): ''' Path to configs for unit tests ''' # cur_file_dir is root/tests/env_tests cur_file_dir = os.path.dirname(os.path.abspath(os.path.realpath(__file__))) ret = os.path.dirname(os.path.dirname(cur_file_dir)) ret = os.path.join(ret, "configs") return ret ================================================ FILE: tests/test_backbones.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import copy import torch # import modules to to register backbones from maskrcnn_benchmark.modeling.backbone import build_backbone # NoQA from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.config import cfg as g_cfg from utils import load_config # overwrite configs if specified, otherwise default config is used BACKBONE_CFGS = { "R-50-FPN": "e2e_faster_rcnn_R_50_FPN_1x.yaml", "R-101-FPN": "e2e_faster_rcnn_R_101_FPN_1x.yaml", "R-152-FPN": "e2e_faster_rcnn_R_101_FPN_1x.yaml", "R-50-FPN-RETINANET": "retinanet/retinanet_R-50-FPN_1x.yaml", "R-101-FPN-RETINANET": "retinanet/retinanet_R-101-FPN_1x.yaml", } class TestBackbones(unittest.TestCase): def test_build_backbones(self): ''' Make sure backbones run ''' self.assertGreater(len(registry.BACKBONES), 0) for name, backbone_builder in registry.BACKBONES.items(): print('Testing {}...'.format(name)) if name in BACKBONE_CFGS: cfg = load_config(BACKBONE_CFGS[name]) else: # Use default config if config file is not specified cfg = copy.deepcopy(g_cfg) backbone = backbone_builder(cfg) # make sures the backbone has `out_channels` self.assertIsNotNone( getattr(backbone, 'out_channels', None), 'Need to provide out_channels for backbone {}'.format(name) ) N, C_in, H, W = 2, 3, 224, 256 input = torch.rand([N, C_in, H, W], dtype=torch.float32) out = backbone(input) for cur_out in out: self.assertEqual( cur_out.shape[:2], torch.Size([N, backbone.out_channels]) ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_box_coder.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import numpy as np import torch from maskrcnn_benchmark.modeling.box_coder import BoxCoder class TestBoxCoder(unittest.TestCase): def test_box_decoder(self): """ Match unit test UtilsBoxesTest.TestBboxTransformRandom in caffe2/operators/generate_proposals_op_util_boxes_test.cc """ box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) bbox = torch.from_numpy( np.array( [ 175.62031555, 20.91103172, 253.352005, 155.0145874, 169.24636841, 4.85241556, 228.8605957, 105.02092743, 181.77426147, 199.82876587, 192.88427734, 214.0255127, 174.36262512, 186.75761414, 296.19091797, 231.27906799, 22.73153877, 92.02596283, 135.5695343, 208.80291748, ] ) .astype(np.float32) .reshape(-1, 4) ) deltas = torch.from_numpy( np.array( [ 0.47861834, 0.13992102, 0.14961673, 0.71495209, 0.29915856, -0.35664671, 0.89018666, 0.70815367, -0.03852064, 0.44466892, 0.49492538, 0.71409376, 0.28052918, 0.02184832, 0.65289006, 1.05060139, -0.38172557, -0.08533806, -0.60335309, 0.79052375, ] ) .astype(np.float32) .reshape(-1, 4) ) gt_bbox = ( np.array( [ 206.949539, -30.715202, 297.387665, 244.448486, 143.871216, -83.342888, 290.502289, 121.053398, 177.430283, 198.666245, 196.295273, 228.703079, 152.251892, 145.431564, 387.215454, 274.594238, 5.062420, 11.040955, 66.328903, 269.686218, ] ) .astype(np.float32) .reshape(-1, 4) ) results = box_coder.decode(deltas, bbox) np.testing.assert_allclose(results.detach().numpy(), gt_bbox, atol=1e-4) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_configs.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import glob import os import utils class TestConfigs(unittest.TestCase): def test_configs_load(self): ''' Make sure configs are loadable ''' cfg_root_path = utils.get_config_root_path() files = glob.glob( os.path.join(cfg_root_path, "./**/*.yaml"), recursive=True) self.assertGreater(len(files), 0) for fn in files: print('Loading {}...'.format(fn)) utils.load_config_from_file(fn) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_data_samplers.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import itertools import random import unittest from torch.utils.data.sampler import BatchSampler from torch.utils.data.sampler import Sampler from torch.utils.data.sampler import SequentialSampler from torch.utils.data.sampler import RandomSampler from maskrcnn_benchmark.data.samplers import GroupedBatchSampler from maskrcnn_benchmark.data.samplers import IterationBasedBatchSampler class SubsetSampler(Sampler): def __init__(self, indices): self.indices = indices def __iter__(self): return iter(self.indices) def __len__(self): return len(self.indices) class TestGroupedBatchSampler(unittest.TestCase): def test_respect_order_simple(self): drop_uneven = False dataset = [i for i in range(40)] group_ids = [i // 10 for i in dataset] sampler = SequentialSampler(dataset) for batch_size in [1, 3, 5, 6]: batch_sampler = GroupedBatchSampler( sampler, group_ids, batch_size, drop_uneven ) result = list(batch_sampler) merged_result = list(itertools.chain.from_iterable(result)) self.assertEqual(merged_result, dataset) def test_respect_order(self): drop_uneven = False dataset = [i for i in range(10)] group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] sampler = SequentialSampler(dataset) expected = [ [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]], [[0, 1, 3], [2, 4, 5], [6, 9], [7, 8]], [[0, 1, 3, 6], [2, 4, 5, 7], [8], [9]], ] for idx, batch_size in enumerate([1, 3, 4]): batch_sampler = GroupedBatchSampler( sampler, group_ids, batch_size, drop_uneven ) result = list(batch_sampler) self.assertEqual(result, expected[idx]) def test_respect_order_drop_uneven(self): batch_size = 3 drop_uneven = True dataset = [i for i in range(10)] group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] sampler = SequentialSampler(dataset) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) result = list(batch_sampler) expected = [[0, 1, 3], [2, 4, 5]] self.assertEqual(result, expected) def test_subset_sampler(self): batch_size = 3 drop_uneven = False dataset = [i for i in range(10)] group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] sampler = SubsetSampler([0, 3, 5, 6, 7, 8]) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) result = list(batch_sampler) expected = [[0, 3, 6], [5, 7, 8]] self.assertEqual(result, expected) def test_permute_subset_sampler(self): batch_size = 3 drop_uneven = False dataset = [i for i in range(10)] group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] sampler = SubsetSampler([5, 0, 6, 1, 3, 8]) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) result = list(batch_sampler) expected = [[5, 8], [0, 6, 1], [3]] self.assertEqual(result, expected) def test_permute_subset_sampler_drop_uneven(self): batch_size = 3 drop_uneven = True dataset = [i for i in range(10)] group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] sampler = SubsetSampler([5, 0, 6, 1, 3, 8]) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) result = list(batch_sampler) expected = [[0, 6, 1]] self.assertEqual(result, expected) def test_len(self): batch_size = 3 drop_uneven = True dataset = [i for i in range(10)] group_ids = [random.randint(0, 1) for _ in dataset] sampler = RandomSampler(dataset) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) result = list(batch_sampler) self.assertEqual(len(result), len(batch_sampler)) self.assertEqual(len(result), len(batch_sampler)) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) batch_sampler_len = len(batch_sampler) result = list(batch_sampler) self.assertEqual(len(result), batch_sampler_len) self.assertEqual(len(result), len(batch_sampler)) class TestIterationBasedBatchSampler(unittest.TestCase): def test_number_of_iters_and_elements(self): for batch_size in [2, 3, 4]: for num_iterations in [4, 10, 20]: for drop_last in [False, True]: dataset = [i for i in range(10)] sampler = SequentialSampler(dataset) batch_sampler = BatchSampler( sampler, batch_size, drop_last=drop_last ) iter_sampler = IterationBasedBatchSampler( batch_sampler, num_iterations ) assert len(iter_sampler) == num_iterations for i, batch in enumerate(iter_sampler): start = (i % len(batch_sampler)) * batch_size end = min(start + batch_size, len(dataset)) expected = [x for x in range(start, end)] self.assertEqual(batch, expected) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_detectors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import glob import os import copy import torch from maskrcnn_benchmark.modeling.detector import build_detection_model from maskrcnn_benchmark.structures.image_list import to_image_list import utils CONFIG_FILES = [ # bbox "e2e_faster_rcnn_R_50_C4_1x.yaml", "e2e_faster_rcnn_R_50_FPN_1x.yaml", "e2e_faster_rcnn_fbnet.yaml", # mask "e2e_mask_rcnn_R_50_C4_1x.yaml", "e2e_mask_rcnn_R_50_FPN_1x.yaml", "e2e_mask_rcnn_fbnet.yaml", # keypoints # TODO: fail to run for random model due to empty head input # "e2e_keypoint_rcnn_R_50_FPN_1x.yaml", # gn "gn_baselines/e2e_faster_rcnn_R_50_FPN_1x_gn.yaml", # TODO: fail to run for random model due to empty head input # "gn_baselines/e2e_mask_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml", # retinanet "retinanet/retinanet_R-50-FPN_1x.yaml", # rpn only "rpn_R_50_C4_1x.yaml", "rpn_R_50_FPN_1x.yaml", ] EXCLUDED_FOLDERS = [ "caffe2", "quick_schedules", "pascal_voc", "cityscapes", ] TEST_CUDA = torch.cuda.is_available() def get_config_files(file_list, exclude_folders): cfg_root_path = utils.get_config_root_path() if file_list is not None: files = [os.path.join(cfg_root_path, x) for x in file_list] else: files = glob.glob( os.path.join(cfg_root_path, "./**/*.yaml"), recursive=True) def _contains(path, exclude_dirs): return any(x in path for x in exclude_dirs) if exclude_folders is not None: files = [x for x in files if not _contains(x, exclude_folders)] return files def create_model(cfg, device): cfg = copy.deepcopy(cfg) cfg.freeze() model = build_detection_model(cfg) model = model.to(device) return model def create_random_input(cfg, device): ret = [] for x in cfg.INPUT.MIN_SIZE_TRAIN: ret.append(torch.rand(3, x, int(x * 1.2))) ret = to_image_list(ret, cfg.DATALOADER.SIZE_DIVISIBILITY) ret = ret.to(device) return ret def _test_build_detectors(self, device): ''' Make sure models build ''' cfg_files = get_config_files(None, EXCLUDED_FOLDERS) self.assertGreater(len(cfg_files), 0) for cfg_file in cfg_files: with self.subTest(cfg_file=cfg_file): print('Testing {}...'.format(cfg_file)) cfg = utils.load_config_from_file(cfg_file) create_model(cfg, device) def _test_run_selected_detectors(self, cfg_files, device): ''' Make sure models build and run ''' self.assertGreater(len(cfg_files), 0) for cfg_file in cfg_files: with self.subTest(cfg_file=cfg_file): print('Testing {}...'.format(cfg_file)) cfg = utils.load_config_from_file(cfg_file) cfg.MODEL.RPN.POST_NMS_TOP_N_TEST = 10 cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 10 model = create_model(cfg, device) inputs = create_random_input(cfg, device) model.eval() output = model(inputs) self.assertEqual(len(output), len(inputs.image_sizes)) class TestDetectors(unittest.TestCase): def test_build_detectors(self): ''' Make sure models build ''' _test_build_detectors(self, "cpu") @unittest.skipIf(not TEST_CUDA, "no CUDA detected") def test_build_detectors_cuda(self): ''' Make sure models build on gpu''' _test_build_detectors(self, "cuda") def test_run_selected_detectors(self): ''' Make sure models build and run ''' # run on selected models cfg_files = get_config_files(CONFIG_FILES, None) # cfg_files = get_config_files(None, EXCLUDED_FOLDERS) _test_run_selected_detectors(self, cfg_files, "cpu") @unittest.skipIf(not TEST_CUDA, "no CUDA detected") def test_run_selected_detectors_cuda(self): ''' Make sure models build and run on cuda ''' # run on selected models cfg_files = get_config_files(CONFIG_FILES, None) # cfg_files = get_config_files(None, EXCLUDED_FOLDERS) _test_run_selected_detectors(self, cfg_files, "cuda") if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_fbnet.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import numpy as np import torch import maskrcnn_benchmark.modeling.backbone.fbnet_builder as fbnet_builder TEST_CUDA = torch.cuda.is_available() def _test_primitive(self, device, op_name, op_func, N, C_in, C_out, expand, stride): op = op_func(C_in, C_out, expand, stride).to(device) input = torch.rand([N, C_in, 7, 7], dtype=torch.float32).to(device) output = op(input) self.assertEqual( output.shape[:2], torch.Size([N, C_out]), 'Primitive {} failed for shape {}.'.format(op_name, input.shape) ) class TestFBNetBuilder(unittest.TestCase): def test_identity(self): id_op = fbnet_builder.Identity(20, 20, 1) input = torch.rand([10, 20, 7, 7], dtype=torch.float32) output = id_op(input) np.testing.assert_array_equal(np.array(input), np.array(output)) id_op = fbnet_builder.Identity(20, 40, 2) input = torch.rand([10, 20, 7, 7], dtype=torch.float32) output = id_op(input) np.testing.assert_array_equal(output.shape, [10, 40, 4, 4]) def test_primitives(self): ''' Make sures the primitives runs ''' for op_name, op_func in fbnet_builder.PRIMITIVES.items(): print('Testing {}'.format(op_name)) _test_primitive( self, "cpu", op_name, op_func, N=20, C_in=16, C_out=32, expand=4, stride=1 ) @unittest.skipIf(not TEST_CUDA, "no CUDA detected") def test_primitives_cuda(self): ''' Make sures the primitives runs on cuda ''' for op_name, op_func in fbnet_builder.PRIMITIVES.items(): print('Testing {}'.format(op_name)) _test_primitive( self, "cuda", op_name, op_func, N=20, C_in=16, C_out=32, expand=4, stride=1 ) def test_primitives_empty_batch(self): ''' Make sures the primitives runs ''' for op_name, op_func in fbnet_builder.PRIMITIVES.items(): print('Testing {}'.format(op_name)) # test empty batch size _test_primitive( self, "cpu", op_name, op_func, N=0, C_in=16, C_out=32, expand=4, stride=1 ) @unittest.skipIf(not TEST_CUDA, "no CUDA detected") def test_primitives_cuda_empty_batch(self): ''' Make sures the primitives runs ''' for op_name, op_func in fbnet_builder.PRIMITIVES.items(): print('Testing {}'.format(op_name)) # test empty batch size _test_primitive( self, "cuda", op_name, op_func, N=0, C_in=16, C_out=32, expand=4, stride=1 ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_feature_extractors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import copy import torch # import modules to to register feature extractors from maskrcnn_benchmark.modeling.backbone import build_backbone # NoQA from maskrcnn_benchmark.modeling.roi_heads.roi_heads import build_roi_heads # NoQA from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.config import cfg as g_cfg from utils import load_config # overwrite configs if specified, otherwise default config is used FEATURE_EXTRACTORS_CFGS = { } # overwrite configs if specified, otherwise default config is used FEATURE_EXTRACTORS_INPUT_CHANNELS = { # in_channels was not used, load through config "ResNet50Conv5ROIFeatureExtractor": 1024, } def _test_feature_extractors( self, extractors, overwrite_cfgs, overwrite_in_channels ): ''' Make sure roi box feature extractors run ''' self.assertGreater(len(extractors), 0) in_channels_default = 64 for name, builder in extractors.items(): print('Testing {}...'.format(name)) if name in overwrite_cfgs: cfg = load_config(overwrite_cfgs[name]) else: # Use default config if config file is not specified cfg = copy.deepcopy(g_cfg) in_channels = overwrite_in_channels.get( name, in_channels_default) fe = builder(cfg, in_channels) self.assertIsNotNone( getattr(fe, 'out_channels', None), 'Need to provide out_channels for feature extractor {}'.format(name) ) N, C_in, H, W = 2, in_channels, 24, 32 input = torch.rand([N, C_in, H, W], dtype=torch.float32) bboxes = [[1, 1, 10, 10], [5, 5, 8, 8], [2, 2, 3, 4]] img_size = [384, 512] box_list = BoxList(bboxes, img_size, "xyxy") out = fe([input], [box_list] * N) self.assertEqual( out.shape[:2], torch.Size([N * len(bboxes), fe.out_channels]) ) class TestFeatureExtractors(unittest.TestCase): def test_roi_box_feature_extractors(self): ''' Make sure roi box feature extractors run ''' _test_feature_extractors( self, registry.ROI_BOX_FEATURE_EXTRACTORS, FEATURE_EXTRACTORS_CFGS, FEATURE_EXTRACTORS_INPUT_CHANNELS, ) def test_roi_keypoints_feature_extractors(self): ''' Make sure roi keypoints feature extractors run ''' _test_feature_extractors( self, registry.ROI_KEYPOINT_FEATURE_EXTRACTORS, FEATURE_EXTRACTORS_CFGS, FEATURE_EXTRACTORS_INPUT_CHANNELS, ) def test_roi_mask_feature_extractors(self): ''' Make sure roi mask feature extractors run ''' _test_feature_extractors( self, registry.ROI_MASK_FEATURE_EXTRACTORS, FEATURE_EXTRACTORS_CFGS, FEATURE_EXTRACTORS_INPUT_CHANNELS, ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_metric_logger.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest from maskrcnn_benchmark.utils.metric_logger import MetricLogger class TestMetricLogger(unittest.TestCase): def test_update(self): meter = MetricLogger() for i in range(10): meter.update(metric=float(i)) m = meter.meters["metric"] self.assertEqual(m.count, 10) self.assertEqual(m.total, 45) self.assertEqual(m.median, 4) self.assertEqual(m.avg, 4.5) def test_no_attr(self): meter = MetricLogger() _ = meter.meters _ = meter.delimiter def broken(): _ = meter.not_existent self.assertRaises(AttributeError, broken) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_nms.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import numpy as np import torch from maskrcnn_benchmark.layers import nms as box_nms class TestNMS(unittest.TestCase): def test_nms_cpu(self): """ Match unit test UtilsNMSTest.TestNMS in caffe2/operators/generate_proposals_op_util_nms_test.cc """ inputs = ( np.array( [ 10, 10, 50, 60, 0.5, 11, 12, 48, 60, 0.7, 8, 9, 40, 50, 0.6, 100, 100, 150, 140, 0.9, 99, 110, 155, 139, 0.8, ] ) .astype(np.float32) .reshape(-1, 5) ) boxes = torch.from_numpy(inputs[:, :4]) scores = torch.from_numpy(inputs[:, 4]) test_thresh = [0.1, 0.3, 0.5, 0.8, 0.9] gt_indices = [[1, 3], [1, 3], [1, 3], [1, 2, 3, 4], [0, 1, 2, 3, 4]] for thresh, gt_index in zip(test_thresh, gt_indices): keep_indices = box_nms(boxes, scores, thresh) keep_indices = np.sort(keep_indices) np.testing.assert_array_equal(keep_indices, np.array(gt_index)) def test_nms1_cpu(self): """ Match unit test UtilsNMSTest.TestNMS1 in caffe2/operators/generate_proposals_op_util_nms_test.cc """ boxes = torch.from_numpy( np.array( [ [350.9821, 161.8200, 369.9685, 205.2372], [250.5236, 154.2844, 274.1773, 204.9810], [471.4920, 160.4118, 496.0094, 213.4244], [352.0421, 164.5933, 366.4458, 205.9624], [166.0765, 169.7707, 183.0102, 232.6606], [252.3000, 183.1449, 269.6541, 210.6747], [469.7862, 162.0192, 482.1673, 187.0053], [168.4862, 174.2567, 181.7437, 232.9379], [470.3290, 162.3442, 496.4272, 214.6296], [251.0450, 155.5911, 272.2693, 203.3675], [252.0326, 154.7950, 273.7404, 195.3671], [351.7479, 161.9567, 370.6432, 204.3047], [496.3306, 161.7157, 515.0573, 210.7200], [471.0749, 162.6143, 485.3374, 207.3448], [250.9745, 160.7633, 264.1924, 206.8350], [470.4792, 169.0351, 487.1934, 220.2984], [474.4227, 161.9546, 513.1018, 215.5193], [251.9428, 184.1950, 262.6937, 207.6416], [252.6623, 175.0252, 269.8806, 213.7584], [260.9884, 157.0351, 288.3554, 206.6027], [251.3629, 164.5101, 263.2179, 202.4203], [471.8361, 190.8142, 485.6812, 220.8586], [248.6243, 156.9628, 264.3355, 199.2767], [495.1643, 158.0483, 512.6261, 184.4192], [376.8718, 168.0144, 387.3584, 201.3210], [122.9191, 160.7433, 172.5612, 231.3837], [350.3857, 175.8806, 366.2500, 205.4329], [115.2958, 162.7822, 161.9776, 229.6147], [168.4375, 177.4041, 180.8028, 232.4551], [169.7939, 184.4330, 181.4767, 232.1220], [347.7536, 175.9356, 355.8637, 197.5586], [495.5434, 164.6059, 516.4031, 207.7053], [172.1216, 194.6033, 183.1217, 235.2653], [264.2654, 181.5540, 288.4626, 214.0170], [111.7971, 183.7748, 137.3745, 225.9724], [253.4919, 186.3945, 280.8694, 210.0731], [165.5334, 169.7344, 185.9159, 232.8514], [348.3662, 184.5187, 354.9081, 201.4038], [164.6562, 162.5724, 186.3108, 233.5010], [113.2999, 186.8410, 135.8841, 219.7642], [117.0282, 179.8009, 142.5375, 221.0736], [462.1312, 161.1004, 495.3576, 217.2208], [462.5800, 159.9310, 501.2937, 224.1655], [503.5242, 170.0733, 518.3792, 209.0113], [250.3658, 195.5925, 260.6523, 212.4679], [108.8287, 163.6994, 146.3642, 229.7261], [256.7617, 187.3123, 288.8407, 211.2013], [161.2781, 167.4801, 186.3751, 232.7133], [115.3760, 177.5859, 163.3512, 236.9660], [248.9077, 188.0919, 264.8579, 207.9718], [108.1349, 160.7851, 143.6370, 229.6243], [465.0900, 156.7555, 490.3561, 213.5704], [107.5338, 173.4323, 141.0704, 235.2910], ] ).astype(np.float32) ) scores = torch.from_numpy( np.array( [ 0.1919, 0.3293, 0.0860, 0.1600, 0.1885, 0.4297, 0.0974, 0.2711, 0.1483, 0.1173, 0.1034, 0.2915, 0.1993, 0.0677, 0.3217, 0.0966, 0.0526, 0.5675, 0.3130, 0.1592, 0.1353, 0.0634, 0.1557, 0.1512, 0.0699, 0.0545, 0.2692, 0.1143, 0.0572, 0.1990, 0.0558, 0.1500, 0.2214, 0.1878, 0.2501, 0.1343, 0.0809, 0.1266, 0.0743, 0.0896, 0.0781, 0.0983, 0.0557, 0.0623, 0.5808, 0.3090, 0.1050, 0.0524, 0.0513, 0.4501, 0.4167, 0.0623, 0.1749, ] ).astype(np.float32) ) gt_indices = np.array( [ 1, 6, 7, 8, 11, 12, 13, 14, 17, 18, 19, 21, 23, 24, 25, 26, 30, 32, 33, 34, 35, 37, 43, 44, 47, 50, ] ) keep_indices = box_nms(boxes, scores, 0.5) keep_indices = np.sort(keep_indices) np.testing.assert_array_equal(keep_indices, gt_indices) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_predictors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import copy import torch # import modules to to register predictors from maskrcnn_benchmark.modeling.backbone import build_backbone # NoQA from maskrcnn_benchmark.modeling.roi_heads.roi_heads import build_roi_heads # NoQA from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.config import cfg as g_cfg from utils import load_config # overwrite configs if specified, otherwise default config is used PREDICTOR_CFGS = { } # overwrite configs if specified, otherwise default config is used PREDICTOR_INPUT_CHANNELS = { } def _test_predictors( self, predictors, overwrite_cfgs, overwrite_in_channels, hwsize, ): ''' Make sure predictors run ''' self.assertGreater(len(predictors), 0) in_channels_default = 64 for name, builder in predictors.items(): print('Testing {}...'.format(name)) if name in overwrite_cfgs: cfg = load_config(overwrite_cfgs[name]) else: # Use default config if config file is not specified cfg = copy.deepcopy(g_cfg) in_channels = overwrite_in_channels.get( name, in_channels_default) fe = builder(cfg, in_channels) N, C_in, H, W = 2, in_channels, hwsize, hwsize input = torch.rand([N, C_in, H, W], dtype=torch.float32) out = fe(input) yield input, out, cfg class TestPredictors(unittest.TestCase): def test_roi_box_predictors(self): ''' Make sure roi box predictors run ''' for cur_in, cur_out, cur_cfg in _test_predictors( self, registry.ROI_BOX_PREDICTOR, PREDICTOR_CFGS, PREDICTOR_INPUT_CHANNELS, hwsize=1, ): self.assertEqual(len(cur_out), 2) scores, bbox_deltas = cur_out[0], cur_out[1] self.assertEqual( scores.shape[1], cur_cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES) self.assertEqual(scores.shape[0], cur_in.shape[0]) self.assertEqual(scores.shape[0], bbox_deltas.shape[0]) self.assertEqual(scores.shape[1] * 4, bbox_deltas.shape[1]) def test_roi_keypoints_predictors(self): ''' Make sure roi keypoint predictors run ''' for cur_in, cur_out, cur_cfg in _test_predictors( self, registry.ROI_KEYPOINT_PREDICTOR, PREDICTOR_CFGS, PREDICTOR_INPUT_CHANNELS, hwsize=14, ): self.assertEqual(cur_out.shape[0], cur_in.shape[0]) self.assertEqual( cur_out.shape[1], cur_cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES) def test_roi_mask_predictors(self): ''' Make sure roi mask predictors run ''' for cur_in, cur_out, cur_cfg in _test_predictors( self, registry.ROI_MASK_PREDICTOR, PREDICTOR_CFGS, PREDICTOR_INPUT_CHANNELS, hwsize=14, ): self.assertEqual(cur_out.shape[0], cur_in.shape[0]) self.assertEqual( cur_out.shape[1], cur_cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_rpn_heads.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import copy import torch # import modules to to register rpn heads from maskrcnn_benchmark.modeling.backbone import build_backbone # NoQA from maskrcnn_benchmark.modeling.rpn.rpn import build_rpn # NoQA from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.config import cfg as g_cfg from utils import load_config # overwrite configs if specified, otherwise default config is used RPN_CFGS = { } class TestRPNHeads(unittest.TestCase): def test_build_rpn_heads(self): ''' Make sure rpn heads run ''' self.assertGreater(len(registry.RPN_HEADS), 0) in_channels = 64 num_anchors = 10 for name, builder in registry.RPN_HEADS.items(): print('Testing {}...'.format(name)) if name in RPN_CFGS: cfg = load_config(RPN_CFGS[name]) else: # Use default config if config file is not specified cfg = copy.deepcopy(g_cfg) rpn = builder(cfg, in_channels, num_anchors) N, C_in, H, W = 2, in_channels, 24, 32 input = torch.rand([N, C_in, H, W], dtype=torch.float32) LAYERS = 3 out = rpn([input] * LAYERS) self.assertEqual(len(out), 2) logits, bbox_reg = out for idx in range(LAYERS): self.assertEqual( logits[idx].shape, torch.Size([ input.shape[0], num_anchors, input.shape[2], input.shape[3], ]) ) self.assertEqual( bbox_reg[idx].shape, torch.Size([ logits[idx].shape[0], num_anchors * 4, logits[idx].shape[2], logits[idx].shape[3], ]), ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_segmentation_mask.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import torch from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask class TestSegmentationMask(unittest.TestCase): def __init__(self, method_name='runTest'): super(TestSegmentationMask, self).__init__(method_name) poly = [[[423.0, 306.5, 406.5, 277.0, 400.0, 271.5, 389.5, 277.0, 387.5, 292.0, 384.5, 295.0, 374.5, 220.0, 378.5, 210.0, 391.0, 200.5, 404.0, 199.5, 414.0, 203.5, 425.5, 221.0, 438.5, 297.0, 423.0, 306.5], [100, 100, 200, 100, 200, 200, 100, 200], ]] width = 640 height = 480 size = width, height self.P = SegmentationMask(poly, size, 'poly') self.M = SegmentationMask(poly, size, 'poly').convert('mask') def L1(self, A, B): diff = A.get_mask_tensor() - B.get_mask_tensor() diff = torch.sum(torch.abs(diff.float())).item() return diff def test_convert(self): M_hat = self.M.convert('poly').convert('mask') P_hat = self.P.convert('mask').convert('poly') diff_mask = self.L1(self.M, M_hat) diff_poly = self.L1(self.P, P_hat) self.assertTrue(diff_mask == diff_poly) self.assertTrue(diff_mask <= 8169.) self.assertTrue(diff_poly <= 8169.) def test_crop(self): box = [400, 250, 500, 300] # xyxy diff = self.L1(self.M.crop(box), self.P.crop(box)) self.assertTrue(diff <= 1.) def test_resize(self): new_size = 50, 25 M_hat = self.M.resize(new_size) P_hat = self.P.resize(new_size) diff = self.L1(M_hat, P_hat) self.assertTrue(self.M.size == self.P.size) self.assertTrue(M_hat.size == P_hat.size) self.assertTrue(self.M.size != M_hat.size) self.assertTrue(diff <= 255.) def test_transpose(self): FLIP_LEFT_RIGHT = 0 FLIP_TOP_BOTTOM = 1 diff_hor = self.L1(self.M.transpose(FLIP_LEFT_RIGHT), self.P.transpose(FLIP_LEFT_RIGHT)) diff_ver = self.L1(self.M.transpose(FLIP_TOP_BOTTOM), self.P.transpose(FLIP_TOP_BOTTOM)) self.assertTrue(diff_hor <= 53250.) self.assertTrue(diff_ver <= 42494.) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/utils.py ================================================ from __future__ import absolute_import, division, print_function, unicode_literals # Set up custom environment before nearly anything else is imported # NOTE: this should be the first import (no not reorder) from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip import env_tests.env as env_tests import os import copy from maskrcnn_benchmark.config import cfg as g_cfg def get_config_root_path(): return env_tests.get_config_root_path() def load_config(rel_path): ''' Load config from file path specified as path relative to config_root ''' cfg_path = os.path.join(env_tests.get_config_root_path(), rel_path) return load_config_from_file(cfg_path) def load_config_from_file(file_path): ''' Load config from file path specified as absolute path ''' ret = copy.deepcopy(g_cfg) ret.merge_from_file(file_path) return ret ================================================ FILE: tools/cityscapes/convert_cityscapes_to_coco.py ================================================ #!/usr/bin/env python # Copyright (c) 2017-present, Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ############################################################################## # This file is copy from https://github.com/facebookresearch/Detectron/tree/master/tools from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import argparse import h5py import json import os import scipy.misc import sys import cityscapesscripts.evaluation.instances2dict_with_polygons as cs def parse_args(): parser = argparse.ArgumentParser(description='Convert dataset') parser.add_argument( '--dataset', help="cocostuff, cityscapes", default=None, type=str) parser.add_argument( '--outdir', help="output dir for json files", default=None, type=str) parser.add_argument( '--datadir', help="data dir for annotations to be converted", default=None, type=str) if len(sys.argv) == 1: parser.print_help() sys.exit(1) return parser.parse_args() def poly_to_box(poly): """Convert a polygon into a tight bounding box.""" x0 = min(min(p[::2]) for p in poly) x1 = max(max(p[::2]) for p in poly) y0 = min(min(p[1::2]) for p in poly) y1 = max(max(p[1::2]) for p in poly) box_from_poly = [x0, y0, x1, y1] return box_from_poly def xyxy_to_xywh(xyxy_box): xmin, ymin, xmax, ymax = xyxy_box TO_REMOVE = 1 xywh_box = (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE) return xywh_box def convert_coco_stuff_mat(data_dir, out_dir): """Convert to png and save json with path. This currently only contains the segmentation labels for objects+stuff in cocostuff - if we need to combine with other labels from original COCO that will be a TODO.""" sets = ['train', 'val'] categories = [] json_name = 'coco_stuff_%s.json' ann_dict = {} for data_set in sets: file_list = os.path.join(data_dir, '%s.txt') images = [] with open(file_list % data_set) as f: for img_id, img_name in enumerate(f): img_name = img_name.replace('coco', 'COCO').strip('\n') image = {} mat_file = os.path.join( data_dir, 'annotations/%s.mat' % img_name) data = h5py.File(mat_file, 'r') labelMap = data.get('S') if len(categories) == 0: labelNames = data.get('names') for idx, n in enumerate(labelNames): categories.append( {"id": idx, "name": ''.join(chr(i) for i in data[ n[0]])}) ann_dict['categories'] = categories scipy.misc.imsave( os.path.join(data_dir, img_name + '.png'), labelMap) image['width'] = labelMap.shape[0] image['height'] = labelMap.shape[1] image['file_name'] = img_name image['seg_file_name'] = img_name image['id'] = img_id images.append(image) ann_dict['images'] = images print("Num images: %s" % len(images)) with open(os.path.join(out_dir, json_name % data_set), 'wb') as outfile: outfile.write(json.dumps(ann_dict)) # for Cityscapes def getLabelID(self, instID): if (instID < 1000): return instID else: return int(instID / 1000) def convert_cityscapes_instance_only( data_dir, out_dir): """Convert from cityscapes format to COCO instance seg format - polygons""" sets = [ 'gtFine_val', 'gtFine_train', 'gtFine_test', # 'gtCoarse_train', # 'gtCoarse_val', # 'gtCoarse_train_extra' ] ann_dirs = [ 'gtFine_trainvaltest/gtFine/val', 'gtFine_trainvaltest/gtFine/train', 'gtFine_trainvaltest/gtFine/test', # 'gtCoarse/train', # 'gtCoarse/train_extra', # 'gtCoarse/val' ] json_name = 'instancesonly_filtered_%s.json' ends_in = '%s_polygons.json' img_id = 0 ann_id = 0 cat_id = 1 category_dict = {} category_instancesonly = [ 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle', ] for data_set, ann_dir in zip(sets, ann_dirs): print('Starting %s' % data_set) ann_dict = {} images = [] annotations = [] ann_dir = os.path.join(data_dir, ann_dir) for root, _, files in os.walk(ann_dir): for filename in files: if filename.endswith(ends_in % data_set.split('_')[0]): if len(images) % 50 == 0: print("Processed %s images, %s annotations" % ( len(images), len(annotations))) json_ann = json.load(open(os.path.join(root, filename))) image = {} image['id'] = img_id img_id += 1 image['width'] = json_ann['imgWidth'] image['height'] = json_ann['imgHeight'] image['file_name'] = filename[:-len( ends_in % data_set.split('_')[0])] + 'leftImg8bit.png' image['seg_file_name'] = filename[:-len( ends_in % data_set.split('_')[0])] + \ '%s_instanceIds.png' % data_set.split('_')[0] images.append(image) fullname = os.path.join(root, image['seg_file_name']) objects = cs.instances2dict_with_polygons( [fullname], verbose=False)[fullname] for object_cls in objects: if object_cls not in category_instancesonly: continue # skip non-instance categories for obj in objects[object_cls]: if obj['contours'] == []: print('Warning: empty contours.') continue # skip non-instance categories len_p = [len(p) for p in obj['contours']] if min(len_p) <= 4: print('Warning: invalid contours.') continue # skip non-instance categories ann = {} ann['id'] = ann_id ann_id += 1 ann['image_id'] = image['id'] ann['segmentation'] = obj['contours'] if object_cls not in category_dict: category_dict[object_cls] = cat_id cat_id += 1 ann['category_id'] = category_dict[object_cls] ann['iscrowd'] = 0 ann['area'] = obj['pixelCount'] xyxy_box = poly_to_box(ann['segmentation']) xywh_box = xyxy_to_xywh(xyxy_box) ann['bbox'] = xywh_box annotations.append(ann) ann_dict['images'] = images categories = [{"id": category_dict[name], "name": name} for name in category_dict] ann_dict['categories'] = categories ann_dict['annotations'] = annotations print("Num categories: %s" % len(categories)) print("Num images: %s" % len(images)) print("Num annotations: %s" % len(annotations)) with open(os.path.join(out_dir, json_name % data_set), 'w') as outfile: outfile.write(json.dumps(ann_dict)) if __name__ == '__main__': args = parse_args() if args.dataset == "cityscapes_instance_only": convert_cityscapes_instance_only(args.datadir, args.outdir) elif args.dataset == "cocostuff": convert_coco_stuff_mat(args.datadir, args.outdir) else: print("Dataset not supported: %s" % args.dataset) ================================================ FILE: tools/cityscapes/instances2dict_with_polygons.py ================================================ #!/usr/bin/python # # Convert instances from png files to a dictionary # This files is created according to https://github.com/facebookresearch/Detectron/issues/111 from __future__ import print_function, absolute_import, division import os, sys sys.path.append( os.path.normpath( os.path.join( os.path.dirname( __file__ ) , '..' , 'helpers' ) ) ) from csHelpers import * # Cityscapes imports from cityscapesscripts.evaluation.instance import * from cityscapesscripts.helpers.csHelpers import * import cv2 from maskrcnn_benchmark.utils import cv2_util def instances2dict_with_polygons(imageFileList, verbose=False): imgCount = 0 instanceDict = {} if not isinstance(imageFileList, list): imageFileList = [imageFileList] if verbose: print("Processing {} images...".format(len(imageFileList))) for imageFileName in imageFileList: # Load image img = Image.open(imageFileName) # Image as numpy array imgNp = np.array(img) # Initialize label categories instances = {} for label in labels: instances[label.name] = [] # Loop through all instance ids in instance image for instanceId in np.unique(imgNp): if instanceId < 1000: continue instanceObj = Instance(imgNp, instanceId) instanceObj_dict = instanceObj.toDict() #instances[id2label[instanceObj.labelID].name].append(instanceObj.toDict()) if id2label[instanceObj.labelID].hasInstances: mask = (imgNp == instanceId).astype(np.uint8) contour, hier = cv2_util.findContours( mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) polygons = [c.reshape(-1).tolist() for c in contour] instanceObj_dict['contours'] = polygons instances[id2label[instanceObj.labelID].name].append(instanceObj_dict) imgKey = os.path.abspath(imageFileName) instanceDict[imgKey] = instances imgCount += 1 if verbose: print("\rImages Processed: {}".format(imgCount), end=' ') sys.stdout.flush() if verbose: print("") return instanceDict def main(argv): fileList = [] if (len(argv) > 2): for arg in argv: if ("png" in arg): fileList.append(arg) instances2dict_with_polygons(fileList, True) if __name__ == "__main__": main(sys.argv[1:]) ================================================ FILE: tools/test_net.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # Set up custom environment before nearly anything else is imported # NOTE: this should be the first import (no not reorder) from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip import argparse import os import torch from maskrcnn_benchmark.config import cfg from maskrcnn_benchmark.data import make_data_loader from maskrcnn_benchmark.engine.inference import inference from maskrcnn_benchmark.modeling.detector import build_detection_model from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer from maskrcnn_benchmark.utils.collect_env import collect_env_info from maskrcnn_benchmark.utils.comm import synchronize, get_rank from maskrcnn_benchmark.utils.logger import setup_logger from maskrcnn_benchmark.utils.miscellaneous import mkdir # Check if we can enable mixed-precision via apex.amp try: from apex import amp except ImportError: raise ImportError('Use APEX for mixed precision via apex.amp') def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--ckpt", help="The path to the checkpoint for test, default is the latest checkpoint.", default=None, ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group( backend="nccl", init_method="env://" ) synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) # Initialize mixed-precision if necessary use_mixed_precision = cfg.DTYPE == 'float16' amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE) output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt _ = checkpointer.load(ckpt, use_latest=args.ckpt is None) iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints",) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, bbox_aug=cfg.TEST.BBOX_AUG.ENABLED, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize() if __name__ == "__main__": main() ================================================ FILE: tools/train_net.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. r""" Basic training script for PyTorch """ # Set up custom environment before nearly anything else is imported # NOTE: this should be the first import (no not reorder) from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip import argparse import os import torch from maskrcnn_benchmark.config import cfg from maskrcnn_benchmark.data import make_data_loader from maskrcnn_benchmark.solver import make_lr_scheduler from maskrcnn_benchmark.solver import make_optimizer from maskrcnn_benchmark.engine.inference import inference from maskrcnn_benchmark.engine.trainer import do_train from maskrcnn_benchmark.modeling.detector import build_detection_model from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer from maskrcnn_benchmark.utils.collect_env import collect_env_info from maskrcnn_benchmark.utils.comm import synchronize, get_rank from maskrcnn_benchmark.utils.imports import import_file from maskrcnn_benchmark.utils.logger import setup_logger from maskrcnn_benchmark.utils.miscellaneous import mkdir, save_config # See if we can use apex.DistributedDataParallel instead of the torch default, # and enable mixed-precision via apex.amp try: from apex import amp except ImportError: raise ImportError('Use APEX for multi-precision via apex.amp') def train(cfg, local_rank, distributed): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) # Initialize mixed-precision training use_mixed_precision = cfg.DTYPE == "float16" amp_opt_level = 'O1' if use_mixed_precision else 'O0' model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer( cfg, model, optimizer, scheduler, output_dir, save_to_disk ) extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) arguments.update(extra_checkpoint_data) data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) test_period = cfg.SOLVER.TEST_PERIOD if test_period > 0: data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True) else: data_loader_val = None checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD do_train( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, ) return model def run_test(cfg, model, distributed): if distributed: model = model.module torch.cuda.empty_cache() # TODO check if it helps iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints",) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, bbox_aug=cfg.TEST.BBOX_AUG.ENABLED, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize() def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Training") parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group( backend="nccl", init_method="env://" ) synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() output_dir = cfg.OUTPUT_DIR if output_dir: mkdir(output_dir) logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml') logger.info("Saving config into: {}".format(output_config_path)) # save overloaded model config in the output directory save_config(cfg, output_config_path) model = train(cfg, args.local_rank, args.distributed) if not args.skip_test: run_test(cfg, model, args.distributed) if __name__ == "__main__": main()