Repository: yqyao/FCOS_PLUS Branch: master Commit: 0d20ba34ccc3 Files: 240 Total size: 632.0 KB Directory structure: gitextract_1mo5xw5_/ ├── .flake8 ├── .gitignore ├── ABSTRACTIONS.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── INSTALL.md ├── LICENSE ├── MASKRCNN_README.md ├── MODEL_ZOO.md ├── README.md ├── TROUBLESHOOTING.md ├── configs/ │ ├── caffe2/ │ │ ├── e2e_faster_rcnn_R_101_FPN_1x_caffe2.yaml │ │ ├── e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml │ │ ├── e2e_faster_rcnn_R_50_FPN_1x_caffe2.yaml │ │ ├── e2e_faster_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml │ │ ├── e2e_keypoint_rcnn_R_50_FPN_1x_caffe2.yaml │ │ ├── e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml │ │ ├── e2e_mask_rcnn_R_50_C4_1x_caffe2.yaml │ │ ├── e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml │ │ ├── e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x_caffe2.yaml │ │ └── e2e_mask_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml │ ├── cityscapes/ │ │ ├── e2e_faster_rcnn_R_50_FPN_1x_cocostyle.yaml │ │ └── e2e_mask_rcnn_R_50_FPN_1x_cocostyle.yaml │ ├── e2e_faster_rcnn_R_101_FPN_1x.yaml │ ├── e2e_faster_rcnn_R_50_C4_1x.yaml │ ├── e2e_faster_rcnn_R_50_FPN_1x.yaml │ ├── e2e_faster_rcnn_X_101_32x8d_FPN_1x.yaml │ ├── e2e_faster_rcnn_fbnet.yaml │ ├── e2e_faster_rcnn_fbnet_600.yaml │ ├── e2e_faster_rcnn_fbnet_chamv1a_600.yaml │ ├── e2e_keypoint_rcnn_R_50_FPN_1x.yaml │ ├── e2e_mask_rcnn_R_101_FPN_1x.yaml │ ├── e2e_mask_rcnn_R_50_C4_1x.yaml │ ├── e2e_mask_rcnn_R_50_FPN_1x.yaml │ ├── e2e_mask_rcnn_X_101_32x8d_FPN_1x.yaml │ ├── e2e_mask_rcnn_fbnet.yaml │ ├── e2e_mask_rcnn_fbnet_600.yaml │ ├── e2e_mask_rcnn_fbnet_xirb16d_dsmask.yaml │ ├── e2e_mask_rcnn_fbnet_xirb16d_dsmask_600.yaml │ ├── fcos/ │ │ ├── fcos_R_101_FPN_2x.yaml │ │ ├── fcos_R_50_FPN_1x.yaml │ │ ├── fcos_R_50_FPN_1x_center.yaml │ │ ├── fcos_R_50_FPN_1x_center_giou.yaml │ │ ├── fcos_X_101_32x8d_FPN_2x.yaml │ │ ├── fcos_X_101_64x4d_FPN_2x.yaml │ │ ├── fcos_bn_bs16_MNV2_FPN_1x.yaml │ │ ├── fcos_syncbn_bs32_MNV2_FPN_1x.yaml │ │ ├── fcos_syncbn_bs32_c128_MNV2_FPN_1x.yaml │ │ ├── fcos_syncbn_bs32_c128_ms_MNV2_FPN_1x.yaml │ │ └── fcos_syncbn_bs64_c128_ms_MNV2_FPN_1x.yaml │ ├── gn_baselines/ │ │ ├── README.md │ │ ├── e2e_faster_rcnn_R_50_FPN_1x_gn.yaml │ │ ├── e2e_faster_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml │ │ ├── e2e_mask_rcnn_R_50_FPN_1x_gn.yaml │ │ ├── e2e_mask_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml │ │ ├── scratch_e2e_faster_rcnn_R_50_FPN_3x_gn.yaml │ │ ├── scratch_e2e_faster_rcnn_R_50_FPN_Xconv1fc_3x_gn.yaml │ │ ├── scratch_e2e_mask_rcnn_R_50_FPN_3x_gn.yaml │ │ └── scratch_e2e_mask_rcnn_R_50_FPN_Xconv1fc_3x_gn.yaml │ ├── pascal_voc/ │ │ ├── e2e_faster_rcnn_R_50_C4_1x_1_gpu_voc.yaml │ │ ├── e2e_faster_rcnn_R_50_C4_1x_4_gpu_voc.yaml │ │ └── e2e_mask_rcnn_R_50_FPN_1x_cocostyle.yaml │ ├── quick_schedules/ │ │ ├── e2e_faster_rcnn_R_50_C4_quick.yaml │ │ ├── e2e_faster_rcnn_R_50_FPN_quick.yaml │ │ ├── e2e_faster_rcnn_X_101_32x8d_FPN_quick.yaml │ │ ├── e2e_keypoint_rcnn_R_50_FPN_quick.yaml │ │ ├── e2e_mask_rcnn_R_50_C4_quick.yaml │ │ ├── e2e_mask_rcnn_R_50_FPN_quick.yaml │ │ ├── e2e_mask_rcnn_X_101_32x8d_FPN_quick.yaml │ │ ├── rpn_R_50_C4_quick.yaml │ │ └── rpn_R_50_FPN_quick.yaml │ ├── retinanet/ │ │ ├── retinanet_R-101-FPN_1x.yaml │ │ ├── retinanet_R-101-FPN_P5_1x.yaml │ │ ├── retinanet_R-50-FPN_1x.yaml │ │ ├── retinanet_R-50-FPN_1x_quick.yaml │ │ ├── retinanet_R-50-FPN_P5_1x.yaml │ │ └── retinanet_X_101_32x8d_FPN_1x.yaml │ ├── rpn_R_101_FPN_1x.yaml │ ├── rpn_R_50_C4_1x.yaml │ ├── rpn_R_50_FPN_1x.yaml │ └── rpn_X_101_32x8d_FPN_1x.yaml ├── demo/ │ ├── README.md │ ├── fcos_demo.py │ ├── predictor.py │ └── webcam.py ├── docker/ │ ├── 
Dockerfile │ └── docker-jupyter/ │ ├── Dockerfile │ └── jupyter_notebook_config.py ├── maskrcnn_benchmark/ │ ├── __init__.py │ ├── config/ │ │ ├── __init__.py │ │ ├── defaults.py │ │ └── paths_catalog.py │ ├── csrc/ │ │ ├── ROIAlign.h │ │ ├── ROIPool.h │ │ ├── SigmoidFocalLoss.h │ │ ├── cpu/ │ │ │ ├── ROIAlign_cpu.cpp │ │ │ ├── nms_cpu.cpp │ │ │ └── vision.h │ │ ├── cuda/ │ │ │ ├── ROIAlign_cuda.cu │ │ │ ├── ROIPool_cuda.cu │ │ │ ├── SigmoidFocalLoss_cuda.cu │ │ │ ├── nms.cu │ │ │ └── vision.h │ │ ├── nms.h │ │ └── vision.cpp │ ├── data/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── build.py │ │ ├── collate_batch.py │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── coco.py │ │ │ ├── concat_dataset.py │ │ │ ├── evaluation/ │ │ │ │ ├── __init__.py │ │ │ │ ├── coco/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── coco_eval.py │ │ │ │ └── voc/ │ │ │ │ ├── __init__.py │ │ │ │ └── voc_eval.py │ │ │ ├── list_dataset.py │ │ │ └── voc.py │ │ ├── samplers/ │ │ │ ├── __init__.py │ │ │ ├── distributed.py │ │ │ ├── grouped_batch_sampler.py │ │ │ └── iteration_based_batch_sampler.py │ │ └── transforms/ │ │ ├── __init__.py │ │ ├── build.py │ │ └── transforms.py │ ├── engine/ │ │ ├── __init__.py │ │ ├── inference.py │ │ └── trainer.py │ ├── layers/ │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── batch_norm.py │ │ ├── iou_loss.py │ │ ├── misc.py │ │ ├── nms.py │ │ ├── roi_align.py │ │ ├── roi_pool.py │ │ ├── scale.py │ │ ├── sigmoid_focal_loss.py │ │ └── smooth_l1_loss.py │ ├── modeling/ │ │ ├── __init__.py │ │ ├── backbone/ │ │ │ ├── __init__.py │ │ │ ├── backbone.py │ │ │ ├── fbnet.py │ │ │ ├── fbnet_builder.py │ │ │ ├── fbnet_modeldef.py │ │ │ ├── fpn.py │ │ │ ├── mobilenet.py │ │ │ └── resnet.py │ │ ├── balanced_positive_negative_sampler.py │ │ ├── box_coder.py │ │ ├── detector/ │ │ │ ├── __init__.py │ │ │ ├── detectors.py │ │ │ └── generalized_rcnn.py │ │ ├── make_layers.py │ │ ├── matcher.py │ │ ├── poolers.py │ │ ├── registry.py │ │ ├── roi_heads/ │ │ │ ├── __init__.py │ │ │ ├── box_head/ │ │ │ │ ├── __init__.py │ │ │ │ ├── box_head.py │ │ │ │ ├── inference.py │ │ │ │ ├── loss.py │ │ │ │ ├── roi_box_feature_extractors.py │ │ │ │ └── roi_box_predictors.py │ │ │ ├── keypoint_head/ │ │ │ │ ├── __init__.py │ │ │ │ ├── inference.py │ │ │ │ ├── keypoint_head.py │ │ │ │ ├── loss.py │ │ │ │ ├── roi_keypoint_feature_extractors.py │ │ │ │ └── roi_keypoint_predictors.py │ │ │ ├── mask_head/ │ │ │ │ ├── __init__.py │ │ │ │ ├── inference.py │ │ │ │ ├── loss.py │ │ │ │ ├── mask_head.py │ │ │ │ ├── roi_mask_feature_extractors.py │ │ │ │ └── roi_mask_predictors.py │ │ │ └── roi_heads.py │ │ ├── rpn/ │ │ │ ├── __init__.py │ │ │ ├── anchor_generator.py │ │ │ ├── fcos/ │ │ │ │ ├── __init__.py │ │ │ │ ├── fcos.py │ │ │ │ ├── inference.py │ │ │ │ └── loss.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ ├── retinanet/ │ │ │ │ ├── __init__.py │ │ │ │ ├── inference.py │ │ │ │ ├── loss.py │ │ │ │ └── retinanet.py │ │ │ ├── rpn.py │ │ │ └── utils.py │ │ └── utils.py │ ├── solver/ │ │ ├── __init__.py │ │ ├── build.py │ │ └── lr_scheduler.py │ ├── structures/ │ │ ├── __init__.py │ │ ├── bounding_box.py │ │ ├── boxlist_ops.py │ │ ├── image_list.py │ │ ├── keypoint.py │ │ └── segmentation_mask.py │ └── utils/ │ ├── README.md │ ├── __init__.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── collect_env.py │ ├── comm.py │ ├── cv2_util.py │ ├── env.py │ ├── imports.py │ ├── logger.py │ ├── metric_logger.py │ ├── miscellaneous.py │ ├── model_serialization.py │ ├── model_zoo.py │ ├── registry.py │ └── timer.py ├── requirements.txt ├── 
setup.py ├── tests/ │ ├── checkpoint.py │ ├── env_tests/ │ │ └── env.py │ ├── test_backbones.py │ ├── test_box_coder.py │ ├── test_configs.py │ ├── test_data_samplers.py │ ├── test_detectors.py │ ├── test_fbnet.py │ ├── test_feature_extractors.py │ ├── test_metric_logger.py │ ├── test_nms.py │ ├── test_predictors.py │ ├── test_rpn_heads.py │ ├── test_segmentation_mask.py │ └── utils.py └── tools/ ├── cityscapes/ │ ├── convert_cityscapes_to_coco.py │ └── instances2dict_with_polygons.py ├── remove_solver_states.py ├── test_net.py └── train_net.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .flake8
================================================
# This is an example .flake8 config, used when developing *Black* itself.
# Keep in sync with setup.cfg which is used for source packages.

[flake8]
ignore = E203, E266, E501, W503
max-line-length = 80
max-complexity = 18
select = B,C,E,F,W,T4,B9

================================================
FILE: .gitignore
================================================
# compilation and distribution
__pycache__
_ext
*.pyc
*.so
maskrcnn_benchmark.egg-info/
build/
dist/

# pytorch/python/numpy formats
*.pth
*.pkl
*.npy

# ipython/jupyter notebooks
*.ipynb
**/.ipynb_checkpoints/

# Editor temporaries
*.swn
*.swo
*.swp
*~

# Pycharm editor settings
.idea

# project dirs
/datasets
/models

================================================
FILE: ABSTRACTIONS.md
================================================
## Abstractions

The main abstractions introduced by `maskrcnn_benchmark` that are useful to have in mind are the following:

### ImageList

In PyTorch, the first dimension of the input to the network generally represents the batch dimension, and thus all elements of the same batch have the same height / width.
In order to support images with different sizes and aspect ratios in the same batch, we created the `ImageList` class, which internally holds a batch of images (of possibly different sizes). The images are padded with zeros such that they have the same final size and are batched over the first dimension. The original sizes of the images before padding are stored in the `image_sizes` attribute, and the batched tensor in `tensors`.

We provide a convenience function `to_image_list` that accepts a few different input types, including a list of tensors, and returns an `ImageList` object.

```python
import torch

from maskrcnn_benchmark.structures.image_list import to_image_list

images = [torch.rand(3, 100, 200), torch.rand(3, 150, 170)]
batched_images = to_image_list(images)

# it is also possible to make the final batched image be a multiple of a number
batched_images_32 = to_image_list(images, size_divisible=32)
```
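The two attributes mentioned above can be read back directly. A short continuation of the example; the shapes in the comments are what the padding rule described above implies (largest height and width, 150 and 200, rounded up to multiples of 32), not output copied from the repository:

```python
# the zero-padded batch lives in .tensors, the pre-padding sizes in .image_sizes
print(batched_images_32.tensors.shape)  # torch.Size([2, 3, 160, 224])
print(batched_images_32.image_sizes)    # [torch.Size([100, 200]), torch.Size([150, 170])]
```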
### BoxList

The `BoxList` class holds a set of bounding boxes (represented as an `Nx4` tensor) for a specific image, as well as the size of the image as a `(width, height)` tuple.
It also contains a set of methods that allow performing geometric transformations on the bounding boxes (such as cropping, scaling and flipping).
The class accepts bounding boxes from two different input formats:
- `xyxy`, where each box is encoded as the `x1`, `y1`, `x2` and `y2` coordinates, and
- `xywh`, where each box is encoded as `x1`, `y1`, `w` and `h`.

Additionally, each `BoxList` instance can also hold arbitrary additional information for each bounding box, such as labels, visibility, probability scores etc.

Here is an example of how to create a `BoxList` from a list of coordinates:

```python
import torch

from maskrcnn_benchmark.structures.bounding_box import BoxList, FLIP_LEFT_RIGHT

width = 100
height = 200
boxes = [
  [0, 10, 50, 50],
  [50, 20, 90, 60],
  [10, 10, 50, 50]
]
# create a BoxList with 3 boxes
bbox = BoxList(boxes, image_size=(width, height), mode='xyxy')

# perform some box transformations; the API is similar to PIL.Image
bbox_scaled = bbox.resize((width * 2, height * 3))
bbox_flipped = bbox.transpose(FLIP_LEFT_RIGHT)

# add labels for each bbox
labels = torch.tensor([0, 10, 1])
bbox.add_field('labels', labels)

# bbox also supports a few operations, like indexing
# here, selects boxes 0 and 2
bbox_subset = bbox[[0, 2]]
```
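A `BoxList` can also be converted between the two supported encodings; a one-line continuation of the example (`BoxList.convert` is defined in `maskrcnn_benchmark/structures/bounding_box.py`):

```python
# returns a new BoxList with the same boxes re-encoded as x1, y1, w, h
bbox_xywh = bbox.convert('xywh')
```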
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Code of Conduct

Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please read the [full text](https://code.fb.com/codeofconduct/) so that you can understand what actions will and will not be tolerated.

================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to Mask-RCNN Benchmark

We want to make contributing to this project as easy and transparent as possible.

## Our Development Process

Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis.

## Pull Requests

We actively welcome your pull requests.

1. Fork the repo and create your branch from `master`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").

## Contributor License Agreement ("CLA")

In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Facebook's open source projects.

Complete your CLA here: <https://code.facebook.com/cla>

## Issues

We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue.

Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure of security bugs. In those cases, please go through the process outlined on that page and do not file a public issue.

## Coding Style

* 4 spaces for indentation rather than tabs
* 80 character line length
* PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/)

## License

By contributing to Mask-RCNN Benchmark, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree.

================================================
FILE: INSTALL.md
================================================
## Installation

### Requirements:
- PyTorch >= 1.0. Installation instructions can be found in https://pytorch.org/get-started/locally/.
- torchvision==0.2.1
- cocoapi
- yacs
- matplotlib
- GCC >= 4.9
- (optional) OpenCV for the webcam demo

### Option 1: Step-by-step installation

```bash
# first, make sure that your conda is setup properly with the right environment
# for that, check that `which conda`, `which pip` and `which python` point to the
# right path. From a clean conda env, this is what you need to do

conda create --name FCOS
conda activate FCOS

# this installs the right pip and dependencies for the fresh python
conda install ipython

# FCOS and coco api dependencies
pip install ninja yacs cython matplotlib tqdm

# follow PyTorch installation in https://pytorch.org/get-started/locally/
# we give the instructions for CUDA 9.0
conda install -c pytorch pytorch torchvision=0.2.1 cudatoolkit=9.0

export INSTALL_DIR=$PWD

# install pycocotools. Please make sure you have installed cython.
cd $INSTALL_DIR
git clone https://github.com/cocodataset/cocoapi.git
cd cocoapi/PythonAPI
python setup.py build_ext install

# install PyTorch Detection
cd $INSTALL_DIR
git clone https://github.com/yqyao/FCOS_PLUS.git
cd FCOS_PLUS

# the following will install the lib with
# symbolic links, so that you can modify
# the files if you want and won't need to
# re-build it
python setup.py build develop

unset INSTALL_DIR

# or if you are on macOS
# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build develop
```

### Option 2: Docker Image (Requires CUDA, Linux only)

*The following steps are for the original maskrcnn-benchmark. Please change the repository name if needed.*

Build image with defaults (`CUDA=9.0`, `CUDNN=7`, `FORCE_CUDA=1`):

```bash
nvidia-docker build -t maskrcnn-benchmark docker/
```

Build image with other CUDA and CUDNN versions:

```bash
nvidia-docker build -t maskrcnn-benchmark --build-arg CUDA=9.2 --build-arg CUDNN=7 docker/
```

Build image with FORCE_CUDA disabled:

```bash
nvidia-docker build -t maskrcnn-benchmark --build-arg FORCE_CUDA=0 docker/
```

Build and run image with built-in jupyter notebook (note that the password is used to log in to the jupyter notebook):

```bash
nvidia-docker build -t maskrcnn-benchmark-jupyter docker/docker-jupyter/
nvidia-docker run -td -p 8888:8888 -e PASSWORD=<password> -v <host-dir>:<container-dir> maskrcnn-benchmark-jupyter
```

================================================
FILE: LICENSE
================================================
FCOS for non-commercial purposes

Copyright (c) 2019 the authors
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: MASKRCNN_README.md
================================================
# Faster R-CNN and Mask R-CNN in PyTorch 1.0

This project aims at providing the necessary building blocks for easily creating detection and segmentation models using PyTorch 1.0.

![alt text](demo/demo_e2e_mask_rcnn_X_101_32x8d_FPN_1x.png "from http://cocodataset.org/#explore?id=345434")

## Highlights

- **PyTorch 1.0:** RPN, Faster R-CNN and Mask R-CNN implementations that match or exceed Detectron accuracies
- **Very fast**: up to **2x** faster than [Detectron](https://github.com/facebookresearch/Detectron) and **30%** faster than [mmdetection](https://github.com/open-mmlab/mmdetection) during training. See [MODEL_ZOO.md](MODEL_ZOO.md) for more details.
- **Memory efficient:** uses roughly 500MB less GPU memory than mmdetection during training
- **Multi-GPU training and inference**
- **Batched inference:** can perform inference using multiple images per batch per GPU
- **CPU support for inference:** runs on the CPU at inference time. See our [webcam demo](demo) for an example
- Provides pre-trained models for almost all reference Mask R-CNN and Faster R-CNN configurations with 1x schedule.

## Webcam and Jupyter notebook demo

We provide a simple webcam demo that illustrates how you can use `maskrcnn_benchmark` for inference:

```bash
cd demo
# by default, it runs on the GPU
# for best results, use min-image-size 800
python webcam.py --min-image-size 800
# can also run it on the CPU
python webcam.py --min-image-size 300 MODEL.DEVICE cpu
# or change the model that you want to use
python webcam.py --config-file ../configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml --min-image-size 300 MODEL.DEVICE cpu
# in order to see the probability heatmaps, pass --show-mask-heatmaps
python webcam.py --min-image-size 300 --show-mask-heatmaps MODEL.DEVICE cpu
# for the keypoint demo
python webcam.py --config-file ../configs/caffe2/e2e_keypoint_rcnn_R_50_FPN_1x_caffe2.yaml --min-image-size 300 MODEL.DEVICE cpu
```

A notebook with the demo can be found in [demo/Mask_R-CNN_demo.ipynb](demo/Mask_R-CNN_demo.ipynb).

## Installation

Check [INSTALL.md](INSTALL.md) for installation instructions.

## Model Zoo and Baselines

Pre-trained models, baselines and comparison with Detectron and mmdetection can be found in [MODEL_ZOO.md](MODEL_ZOO.md)

## Inference in a few lines

We provide a helper class to simplify writing inference pipelines using pre-trained models. Here is how we would do it. Run this from the `demo` folder:

```python
from maskrcnn_benchmark.config import cfg
from predictor import COCODemo

config_file = "../configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml"

# update the config options with the config file
cfg.merge_from_file(config_file)
# manually override some options
cfg.merge_from_list(["MODEL.DEVICE", "cpu"])

coco_demo = COCODemo(
    cfg,
    min_image_size=800,
    confidence_threshold=0.7,
)
# load image and then run prediction
image = ...
predictions = coco_demo.run_on_opencv_image(image)
```
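In the snippet above, `image = ...` is left open. A minimal sketch of filling it in with OpenCV (the file name here is a placeholder; `run_on_opencv_image` expects an image in OpenCV's BGR channel order):

```python
import cv2

# load a test image as an HxWx3 uint8 BGR array, as run_on_opencv_image expects
image = cv2.imread("demo.jpg")
predictions = coco_demo.run_on_opencv_image(image)
```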
## Perform training on COCO dataset

For the following examples to work, you need to first install `maskrcnn_benchmark`.

You will also need to download the COCO dataset. We recommend symlinking the path to the coco dataset to `datasets/` as follows. We use `minival` and `valminusminival` sets from [Detectron](https://github.com/facebookresearch/Detectron/blob/master/detectron/datasets/data/README.md#coco-minival-annotations)

```bash
# symlink the coco dataset
cd ~/github/maskrcnn-benchmark
mkdir -p datasets/coco
ln -s /path_to_coco_dataset/annotations datasets/coco/annotations
ln -s /path_to_coco_dataset/train2014 datasets/coco/train2014
ln -s /path_to_coco_dataset/test2014 datasets/coco/test2014
ln -s /path_to_coco_dataset/val2014 datasets/coco/val2014

# or use COCO 2017 version
ln -s /path_to_coco_dataset/annotations datasets/coco/annotations
ln -s /path_to_coco_dataset/train2017 datasets/coco/train2017
ln -s /path_to_coco_dataset/test2017 datasets/coco/test2017
ln -s /path_to_coco_dataset/val2017 datasets/coco/val2017

# for pascal voc dataset:
ln -s /path_to_VOCdevkit_dir datasets/voc
```

P.S. `COCO_2017_train` = `COCO_2014_train` + `valminusminival`, `COCO_2017_val` = `minival`

You can also configure your own paths to the datasets. For that, all you need to do is modify `maskrcnn_benchmark/config/paths_catalog.py` to point to the location where your dataset is stored. You can also create a new `paths_catalog.py` file which implements the same two classes, and pass it as a config argument `PATHS_CATALOG` during training.

### Single GPU training

Most of the configuration files that we provide assume that we are running on 8 GPUs. In order to be able to run on fewer GPUs, there are a few possibilities:

**1. Run the following without modifications**

```bash
python /path_to_maskrcnn_benchmark/tools/train_net.py --config-file "/path/to/config/file.yaml"
```

This should work out of the box and is very similar to what we should do for multi-GPU training. But the drawback is that it will use much more GPU memory. The reason is that we set in the configuration files a global batch size that is divided over the number of GPUs. So if we only have a single GPU, this means that the batch size for that GPU will be 8x larger, which might lead to out-of-memory errors. If you have a lot of memory available, this is the easiest solution.

**2. Modify the cfg parameters**

If you experience out-of-memory errors, you can reduce the global batch size. But this means that you'll also need to change the learning rate, the number of iterations and the learning rate schedule.

Here is an example for Mask R-CNN R-50 FPN with the 1x schedule:

```bash
python tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025 SOLVER.MAX_ITER 720000 SOLVER.STEPS "(480000, 640000)" TEST.IMS_PER_BATCH 1
```

This follows the [scheduling rules from Detectron.](https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14-L30) Note that we have multiplied the number of iterations by 8x (as well as the learning rate schedules), and we have divided the learning rate by 8x.

We also changed the batch size during testing, but that is generally not necessary because testing requires much less memory than training.

### Multi-GPU training

Internally, we use `torch.distributed.launch` in order to launch multi-gpu training. This utility function from PyTorch spawns as many Python processes as the number of GPUs we want to use, and each Python process will only use a single GPU.
```bash
export NGPUS=8
python -m torch.distributed.launch --nproc_per_node=$NGPUS /path_to_maskrcnn_benchmark/tools/train_net.py --config-file "path/to/config/file.yaml"
```

## Abstractions

For more information on some of the main abstractions in our implementation, see [ABSTRACTIONS.md](ABSTRACTIONS.md).

## Adding your own dataset

This implementation adds support for COCO-style datasets. But adding support for training on a new dataset can be done as follows:

```python
import torch

from maskrcnn_benchmark.structures.bounding_box import BoxList

class MyDataset(object):
    def __init__(self, ...):
        # as you would do normally

    def __getitem__(self, idx):
        # load the image as a PIL Image
        image = ...

        # load the bounding boxes as a list of list of boxes
        # in this case, for illustrative purposes, we use
        # x1, y1, x2, y2 order.
        boxes = [[0, 0, 10, 10], [10, 20, 50, 50]]
        # and labels
        labels = torch.tensor([10, 20])

        # create a BoxList from the boxes
        boxlist = BoxList(boxes, image.size, mode="xyxy")
        # add the labels to the boxlist
        boxlist.add_field("labels", labels)

        if self.transforms:
            image, boxlist = self.transforms(image, boxlist)

        # return the image, the boxlist and the idx in your dataset
        return image, boxlist, idx

    def get_img_info(self, idx):
        # get img_height and img_width. This is used if
        # we want to split the batches according to the aspect ratio
        # of the image, as it can be more efficient than loading the
        # image from disk
        return {"height": img_height, "width": img_width}
```

That's it. You can also add extra fields to the boxlist, such as segmentation masks (using `structures.segmentation_mask.SegmentationMask`), or even your own instance type.

For a full example of how the `COCODataset` is implemented, check [`maskrcnn_benchmark/data/datasets/coco.py`](maskrcnn_benchmark/data/datasets/coco.py).

Once you have created your dataset, it needs to be added in a couple of places:
- [`maskrcnn_benchmark/data/datasets/__init__.py`](maskrcnn_benchmark/data/datasets/__init__.py): add it to `__all__`
- [`maskrcnn_benchmark/config/paths_catalog.py`](maskrcnn_benchmark/config/paths_catalog.py): `DatasetCatalog.DATASETS` and corresponding `if` clause in `DatasetCatalog.get()` (see the sketch below)
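For illustration only, here is a hedged sketch of what that second registration step might look like for the hypothetical `MyDataset` above; every name and path in it is a placeholder, and the existing entries in `paths_catalog.py` are the reference:

```python
# sketch of maskrcnn_benchmark/config/paths_catalog.py additions -- all
# dataset names and paths below are placeholders, not part of the repository
import os

class DatasetCatalog(object):
    DATA_DIR = "datasets"
    DATASETS = {
        # ... existing COCO / VOC entries ...
        "my_dataset_train": {
            "img_dir": "my_dataset/images",
            "ann_file": "my_dataset/annotations.json",
        },
    }

    @staticmethod
    def get(name):
        if "my_dataset" in name:
            data_dir = DatasetCatalog.DATA_DIR
            attrs = DatasetCatalog.DATASETS[name]
            args = dict(
                root=os.path.join(data_dir, attrs["img_dir"]),
                ann_file=os.path.join(data_dir, attrs["ann_file"]),
            )
            # "MyDataset" must be exported from maskrcnn_benchmark/data/datasets
            return dict(factory="MyDataset", args=args)
        # ... existing clauses for the bundled datasets ...
        raise RuntimeError("Dataset not available: {}".format(name))
```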
### Testing

While the aforementioned example should work for training, we leverage the cocoApi for computing the accuracies during testing. Thus, test datasets should currently follow the cocoApi.

To enable your dataset for testing, add a corresponding if statement in [`maskrcnn_benchmark/data/datasets/evaluation/__init__.py`](maskrcnn_benchmark/data/datasets/evaluation/__init__.py):

```python
if isinstance(dataset, datasets.MyDataset):
    return coco_evaluation(**args)
```

## Finetuning from Detectron weights on custom datasets

Create a script `tools/trim_detectron_model.py` like [here](https://gist.github.com/wangg12/aea194aa6ab6a4de088f14ee193fd968). You can decide which keys to remove and which to keep by modifying the script. Then you can simply point to the converted model in the config file by changing `MODEL.WEIGHT`.

For further information, please refer to [#15](https://github.com/facebookresearch/maskrcnn-benchmark/issues/15).

## Troubleshooting

If you have issues running or compiling this code, we have compiled a list of common issues in [TROUBLESHOOTING.md](TROUBLESHOOTING.md). If your issue is not present there, please feel free to open a new issue.

## Citations

Please consider citing this project in your publications if it helps your research. The following is a BibTeX reference. The BibTeX entry requires the `url` LaTeX package.

```
@misc{massa2018mrcnn,
  author = {Massa, Francisco and Girshick, Ross},
  title = {{maskrcnn-benchmark: Fast, modular reference implementation of Instance Segmentation and Object Detection algorithms in PyTorch}},
  year = {2018},
  howpublished = {\url{https://github.com/facebookresearch/maskrcnn-benchmark}},
  note = {Accessed: [Insert date here]}
}
```

## Projects using maskrcnn-benchmark

- [RetinaMask: Learning to predict masks improves state-of-the-art single-shot detection for free](https://arxiv.org/abs/1901.03353). Cheng-Yang Fu, Mykhailo Shvets, and Alexander C. Berg. Tech report, arXiv:1901.03353.

## License

maskrcnn-benchmark is released under the MIT license. See [LICENSE](LICENSE) for additional details.

================================================
FILE: MODEL_ZOO.md
================================================
## Model Zoo and Baselines

### Hardware
- 8 NVIDIA V100 GPUs

### Software
- PyTorch version: 1.0.0a0+dd2c487
- CUDA 9.2
- CUDNN 7.1
- NCCL 2.2.13-1

### End-to-end Faster and Mask R-CNN baselines

All the baselines were trained using the exact same experimental setup as in Detectron. We initialize the detection models with ImageNet weights from Caffe2, the same as used by Detectron.

The pre-trained models are available in the link in the model id.

backbone | type | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time(hr) | inference time(s/im) | box AP | mask AP | model id
-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | --
R-50-C4 | Fast | 1x | 1 | 5.8 | 0.4036 | 20.2 | 0.17130 | 34.8 | - | [6358800](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_C4_1x.pth)
R-50-FPN | Fast | 1x | 2 | 4.4 | 0.3530 | 8.8 | 0.12580 | 36.8 | - | [6358793](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_FPN_1x.pth)
R-101-FPN | Fast | 1x | 2 | 7.1 | 0.4591 | 11.5 | 0.143149 | 39.1 | - | [6358804](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_101_FPN_1x.pth)
X-101-32x8d-FPN | Fast | 1x | 1 | 7.6 | 0.7007 | 35.0 | 0.209965 | 41.2 | - | [6358717](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_X_101_32x8d_FPN_1x.pth)
R-50-C4 | Mask | 1x | 1 | 5.8 | 0.4520 | 22.6 | 0.17796 + 0.028 | 35.6 | 31.5 | [6358801](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_C4_1x.pth)
R-50-FPN | Mask | 1x | 2 | 5.2 | 0.4536 | 11.3 | 0.12966 + 0.034 | 37.8 | 34.2 | [6358792](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_FPN_1x.pth)
R-101-FPN | Mask | 1x | 2 | 7.9 | 0.5665 | 14.2 | 0.15384 + 0.034 | 40.1 | 36.1 | [6358805](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_101_FPN_1x.pth)
X-101-32x8d-FPN | Mask | 1x | 1 | 7.8 | 0.7562 | 37.8 | 0.21739 + 0.034 | 42.2 | 37.8 | [6358718](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_X_101_32x8d_FPN_1x.pth)

For person keypoint detection:

backbone | type | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time(hr) | inference time(s/im) | box AP | keypoint AP | model id
-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | --
R-50-FPN | Keypoint | 1x | 2 | 5.7 | 0.3771 | 9.4 | 0.10941 | 53.7 | 64.3 | 9981060

### Light-weight Model baselines

We provide pre-trained models for selected FBNet models.
* All the models are trained from scratch with BN using the training schedule specified below.
* Evaluation is performed on a single NVIDIA V100 GPU with `MODEL.RPN.POST_NMS_TOP_N_TEST` set to `200`.
The following inference time is reported:
* inference total batch=8: Total inference time including data loading, model inference and pre/post processing using 8 images per batch.
* inference model batch=8: Model inference time only, using 8 images per batch.
* inference model batch=1: Model inference time only, using 1 image per batch.
* inference caffe2 batch=1: Model inference time for the model in Caffe2 format using 1 image per batch. The Caffe2 models fuse the BN into the Conv and run purely on C++/CUDA, using Caffe2 ops for rpn/detection post processing.

The pre-trained models are available in the link in the model id.

backbone | type | resolution | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time (hr) | inference total batch=8 (s/im) | inference model batch=8 (s/im) | inference model batch=1 (s/im) | inference caffe2 batch=1 (s/im) | box AP | mask AP | model id
-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | --
[R-50-C4](configs/e2e_faster_rcnn_R_50_C4_1x.yaml) (reference) | Fast | 800 | 1x | 1 | 5.8 | 0.4036 | 20.2 | 0.0875 | **0.0793** | 0.0831 | **0.0625** | 34.4 | - | f35857197
[fbnet_chamv1a](configs/e2e_faster_rcnn_fbnet_chamv1a_600.yaml) | Fast | 600 | 0.75x | 12 | 13.6 | 0.5444 | 20.5 | 0.0315 | **0.0260** | 0.0376 | **0.0188** | 33.5 | - | [f100940543](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_fbnet_chamv1a_600.pth)
[fbnet_default](configs/e2e_faster_rcnn_fbnet_600.yaml) | Fast | 600 | 0.5x | 16 | 11.1 | 0.4872 | 12.5 | 0.0316 | **0.0250** | 0.0297 | **0.0130** | 28.2 | - | [f101086388](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_fbnet_600.pth)
[R-50-C4](configs/e2e_mask_rcnn_R_50_C4_1x.yaml) (reference) | Mask | 800 | 1x | 1 | 5.8 | 0.452 | 22.6 | 0.0918 | **0.0848** | 0.0844 | - | 35.2 | 31.0 | f35858791
[fbnet_xirb16d](configs/e2e_mask_rcnn_fbnet_xirb16d_dsmask_600.yaml) | Mask | 600 | 0.5x | 16 | 13.4 | 1.1732 | 29 | 0.0386 | **0.0319** | 0.0356 | - | 30.7 | 26.9 | [f101086394](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_fbnet_xirb16d_dsmask.pth)
[fbnet_default](configs/e2e_mask_rcnn_fbnet_600.yaml) | Mask | 600 | 0.5x | 16 | 13.0 | 0.9036 | 23.0 | 0.0327 | **0.0269** | 0.0385 | - | 29.0 | 26.1 | [f101086385](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_fbnet_600.pth)

## Comparison with Detectron and mmdetection

In the following section, we compare our implementation with [Detectron](https://github.com/facebookresearch/Detectron) and [mmdetection](https://github.com/open-mmlab/mmdetection). The same remarks from [mmdetection](https://github.com/open-mmlab/mmdetection/blob/master/MODEL_ZOO.md#training-speed) about different hardware apply here.

### Training speed

The numbers here are in seconds / iteration. The lower, the better.
type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100)
-- | -- | -- | --
Faster R-CNN R-50 C4 | 0.566 | - | 0.4036
Faster R-CNN R-50 FPN | 0.544 | 0.554 | 0.3530
Faster R-CNN R-101 FPN | 0.647 | - | 0.4591
Faster R-CNN X-101-32x8d FPN | 0.799 | - | 0.7007
Mask R-CNN R-50 C4 | 0.620 | - | 0.4520
Mask R-CNN R-50 FPN | 0.889 | 0.690 | 0.4536
Mask R-CNN R-101 FPN | 1.008 | - | 0.5665
Mask R-CNN X-101-32x8d FPN | 0.961 | - | 0.7562

### Training memory

The lower, the better

type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100)
-- | -- | -- | --
Faster R-CNN R-50 C4 | 6.3 | - | 5.8
Faster R-CNN R-50 FPN | 7.2 | 4.9 | 4.4
Faster R-CNN R-101 FPN | 8.9 | - | 7.1
Faster R-CNN X-101-32x8d FPN | 7.0 | - | 7.6
Mask R-CNN R-50 C4 | 6.6 | - | 5.8
Mask R-CNN R-50 FPN | 8.6 | 5.9 | 5.2
Mask R-CNN R-101 FPN | 10.2 | - | 7.9
Mask R-CNN X-101-32x8d FPN | 7.7 | - | 7.8

### Accuracy

The higher, the better

type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100)
-- | -- | -- | --
Faster R-CNN R-50 C4 | 34.8 | - | 34.8
Faster R-CNN R-50 FPN | 36.7 | 36.7 | 36.8
Faster R-CNN R-101 FPN | 39.4 | - | 39.1
Faster R-CNN X-101-32x8d FPN | 41.3 | - | 41.2
Mask R-CNN R-50 C4 | 35.8 & 31.4 | - | 35.6 & 31.5
Mask R-CNN R-50 FPN | 37.7 & 33.9 | 37.5 & 34.4 | 37.8 & 34.2
Mask R-CNN R-101 FPN | 40.0 & 35.9 | - | 40.1 & 36.1
Mask R-CNN X-101-32x8d FPN | 42.1 & 37.3 | - | 42.2 & 37.8

================================================
FILE: README.md
================================================
# FCOS_PLUS

This project contains some improvements to FCOS (Fully Convolutional One-Stage Object Detection).

## Installation

Please check [INSTALL.md](INSTALL.md) (same as original FCOS) for installation instructions.

**Results**

Model | Total training mem (GB) | Multi-scale training | Testing time / im | AP (minival) | link
--- |:---:|:---:|:---:|:---:|:---:|
FCOS_R_50_FPN_1x | 29.3 | No | 71ms | 37.0 | [model](https://pan.baidu.com/s/1Xcbx7EfOGvwnexXAuovM0A) |
FCOS_R_50_FPN_1x_center | 30.61 | No | 71ms | 37.8 | [model](https://pan.baidu.com/s/1Gs7AzmJRmeYhXUPDQZuSLA) |
FCOS_R_50_FPN_1x_center_liou | 30.61 | No | 71ms | 38.1 | [model](https://pan.baidu.com/s/1HpYrkAsVXNvXRFTd06SGgA) |
FCOS_R_50_FPN_1x_center_giou | 30.61 | No | 71ms | 38.2 | [model](https://pan.baidu.com/s/13_o6343Ikg4td01kVXxGSw) |
FCOS_R_101_FPN_2x | 44.1 | Yes | 74ms | 41.4 | [model](https://pan.baidu.com/s/1u_5OD5NURYe1EYFWnohgEA) |
FCOS_R_101_FPN_2x_center_giou | 44.1 | Yes | 74ms | 42.5 | [model](https://pan.baidu.com/s/1qhHM067ywwlEXfamaFq23g) |

[1] *1x and 2x mean the model is trained for 90K and 180K iterations, respectively.* \
[2] center means [center sample](fcos.pdf) is used in our training. \
[3] liou means the model uses the linear IoU loss function (1 - IoU). \
[4] giou means the model uses the GIoU loss function (1 - GIoU). Both losses are sketched below.
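Notes [3] and [4] are just sign-flipped similarity scores. The sketch below spells both definitions out on plain `(N, 4)` `xyxy` box tensors; it is an illustration of `1 - IoU` and `1 - GIoU`, not a copy of the repository's implementation, which lives in `maskrcnn_benchmark/layers/iou_loss.py`.

```python
import torch

def iou_and_giou(pred, target, eps=1e-7):
    # overlap rectangle: max of the left-top corners, min of the right-bottoms
    lt = torch.max(pred[:, :2], target[:, :2])
    rb = torch.min(pred[:, 2:], target[:, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, 0] * wh[:, 1]

    area_p = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
    area_t = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
    union = area_p + area_t - inter
    iou = inter / union.clamp(min=eps)

    # smallest enclosing box, used by the GIoU penalty term
    lt_c = torch.min(pred[:, :2], target[:, :2])
    rb_c = torch.max(pred[:, 2:], target[:, 2:])
    wh_c = (rb_c - lt_c).clamp(min=0)
    enclose = (wh_c[:, 0] * wh_c[:, 1]).clamp(min=eps)

    giou = iou - (enclose - union) / enclose
    return iou, giou

pred = torch.tensor([[0., 0., 10., 10.]])
target = torch.tensor([[5., 5., 15., 15.]])
iou, giou = iou_and_giou(pred, target)
linear_iou_loss = 1 - iou   # "liou" in the table above
giou_loss = 1 - giou        # "giou" in the table above
```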
## Training

The following command line will train FCOS_R_50_FPN_1x_center_giou on 8 GPUs with Synchronous Stochastic Gradient Descent (SGD):

    python -m torch.distributed.launch \
        --nproc_per_node=8 \
        --master_port=$((RANDOM + 10000)) \
        tools/train_net.py \
        --skip-test \
        --config-file configs/fcos/fcos_R_50_FPN_1x_center_giou.yaml \
        DATALOADER.NUM_WORKERS 2 \
        OUTPUT_DIR training_dir/fcos_R_50_FPN_1x_center_giou

Note that:
1) If you want to use fewer GPUs, please change `--nproc_per_node` to the number of GPUs. No other settings need to be changed. The total batch size does not depend on `nproc_per_node`. If you want to change the total batch size, please change `SOLVER.IMS_PER_BATCH` in [configs/fcos/fcos_R_50_FPN_1x_center_giou.yaml](configs/fcos/fcos_R_50_FPN_1x_center_giou.yaml).
2) The models will be saved into `OUTPUT_DIR`.
3) If you want to train FCOS with other backbones, please change `--config-file`.

## Citations

Please consider citing the original paper in your publications if the project helps your research.

```
@article{tian2019fcos,
  title   = {{FCOS}: Fully Convolutional One-Stage Object Detection},
  author  = {Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong},
  journal = {arXiv preprint arXiv:1904.01355},
  year    = {2019}
}
```

## License

For academic use, this project is licensed under the 2-clause BSD License - see the LICENSE file for details. For commercial use, please contact the authors.

================================================
FILE: TROUBLESHOOTING.md
================================================
# Troubleshooting

Here is a compilation of common issues that you might face while compiling / running this code:

## Compilation errors when compiling the library

If you encounter build errors like the following:
```
/usr/include/c++/6/type_traits:1558:8: note: provided for ‘template struct std::is_convertible’ struct is_convertible ^~~~~~~~~~~~~~
/usr/include/c++/6/tuple:502:1: error: body of constexpr function ‘static constexpr bool std::_TC<, _Elements>::_NonNestedTuple() [with _SrcTuple = std::tuple&&; bool = true; _Elements = {at::Tensor, at::Tensor, at::Tensor, at::Tensor}]’ not a return-statement }
^
error: command '/usr/local/cuda/bin/nvcc' failed with exit status 1
```
check your CUDA version and your `gcc` version.
```
nvcc --version
gcc --version
```
If you are using CUDA 9.0 and gcc 6.4.0, then refer to https://github.com/facebookresearch/maskrcnn-benchmark/issues/25, which has a summary of the solution. Basically, CUDA 9.0 is not compatible with gcc 6.4.0.

## ImportError: No module named maskrcnn_benchmark.config when running webcam.py

This means that `maskrcnn-benchmark` has not been properly installed. Refer to https://github.com/facebookresearch/maskrcnn-benchmark/issues/22 for a few possible issues. Note that we now support Python 2 as well.

## ImportError: Undefined symbol: __cudaPopCallConfiguration error when import _C

This probably means that the version of NVCC you compiled with and your conda CUDAToolkit package are inconsistent. This was first mentioned in https://github.com/facebookresearch/maskrcnn-benchmark/issues/45. All you need to do is:

```
# Check the NVCC compile version(e.g.)
/usr/cuda-9.2/bin/nvcc --version
# Check the CUDAToolKit version(e.g.)
~/anaconda3/bin/conda list | grep cuda
# If you need to update your CUDAToolKit
~/anaconda3/bin/conda install -c anaconda cudatoolkit==9.2
```

Both of them should have the **same** version. For example, NVCC==9.2 with CUDAToolkit==9.2 is fine, while NVCC==9.2 with CUDAToolkit==9 fails.

## Segmentation fault (core dumped) when running the library

This probably means that you have compiled the library using GCC < 4.9, which is ABI incompatible with PyTorch. Indeed, during installation, you probably saw a message like
```
Your compiler (g++ 4.8) may be ABI-incompatible with PyTorch!
Please use a compiler that is ABI-compatible with GCC 4.9 and above.
See https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html.

See https://gist.github.com/goldsborough/d466f43e8ffc948ff92de7486c5216d6
for instructions on how to install GCC 4.9 or higher.
```

Follow the instructions on https://gist.github.com/goldsborough/d466f43e8ffc948ff92de7486c5216d6 to install GCC 4.9 or higher, and try recompiling `maskrcnn-benchmark` again, after cleaning the `build` folder with

```
rm -rf build
```

================================================ FILE: configs/caffe2/e2e_faster_rcnn_R_101_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35857890/e2e_faster_rcnn_R-101-FPN_1x" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35857197/e2e_faster_rcnn_R-50-C4_1x" DATASETS: TEST: ("coco_2014_minival",) ================================================ FILE: configs/caffe2/e2e_faster_rcnn_R_50_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35857345/e2e_faster_rcnn_R-50-FPN_1x" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_faster_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_keypoint_rcnn_R_50_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/37697547/e2e_keypoint_rcnn_R-50-FPN_1x" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7
POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" NUM_CLASSES: 2 ROI_KEYPOINT_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "KeypointRCNNFeatureExtractor" PREDICTOR: "KeypointRCNNPredictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 56 SHARE_BOX_FEATURE_EXTRACTOR: False KEYPOINT_ON: True DATASETS: TRAIN: ("keypoints_coco_2014_train", "keypoints_coco_2014_valminusminival",) TEST: ("keypoints_coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35861795/e2e_mask_rcnn_R-101-FPN_1x" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_mask_rcnn_R_50_C4_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35858791/e2e_mask_rcnn_R-50-C4_1x" ROI_MASK_HEAD: PREDICTOR: "MaskRCNNC4Predictor" SHARE_BOX_FEATURE_EXTRACTOR: True MASK_ON: True DATASETS: TEST: ("coco_2014_minival",) ================================================ FILE: configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/35858933/e2e_mask_rcnn_R-50-FPN_1x" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/37129812/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x" BACKBONE: CONV_BODY: "R-152-FPN" RESNETS: 
BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/caffe2/e2e_mask_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://Caffe2Detectron/COCO/36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" ROI_MASK_HEAD: POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 ================================================ FILE: configs/cityscapes/e2e_faster_rcnn_R_50_FPN_1x_cocostyle.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" NUM_CLASSES: 9 DATASETS: TRAIN: ("cityscapes_fine_instanceonly_seg_train_cocostyle",) TEST: ("cityscapes_fine_instanceonly_seg_val_cocostyle",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (18000,) MAX_ITER: 24000 ================================================ FILE: configs/cityscapes/e2e_mask_rcnn_R_50_FPN_1x_cocostyle.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" NUM_CLASSES: 9 ROI_MASK_HEAD: POOLER_SCALES: 
(0.25, 0.125, 0.0625, 0.03125) FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" PREDICTOR: "MaskRCNNC4Predictor" POOLER_RESOLUTION: 14 POOLER_SAMPLING_RATIO: 2 RESOLUTION: 28 SHARE_BOX_FEATURE_EXTRACTOR: False MASK_ON: True DATASETS: TRAIN: ("cityscapes_fine_instanceonly_seg_train_cocostyle",) TEST: ("cityscapes_fine_instanceonly_seg_val_cocostyle",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (18000,) MAX_ITER: 24000 ================================================ FILE: configs/e2e_faster_rcnn_R_101_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/e2e_faster_rcnn_R_50_C4_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN: PRE_NMS_TOP_N_TEST: 6000 POST_NMS_TOP_N_TEST: 1000 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (120000, 160000) MAX_ITER: 180000 IMS_PER_BATCH: 8 ================================================ FILE: configs/e2e_faster_rcnn_R_50_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/e2e_faster_rcnn_X_101_32x8d_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" BACKBONE: CONV_BODY: "R-101-FPN" RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True ROI_BOX_HEAD: POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) 
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.01
  WEIGHT_DECAY: 0.0001
  STEPS: (120000, 160000)
  MAX_ITER: 180000
  IMS_PER_BATCH: 8

================================================
FILE: configs/e2e_faster_rcnn_fbnet.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    CONV_BODY: FBNet
  FBNET:
    ARCH: "default"
    BN_TYPE: "bn"
    WIDTH_DIVISOR: 8
    DW_CONV_SKIP_BN: True
    DW_CONV_SKIP_RELU: True
  RPN:
    ANCHOR_SIZES: (16, 32, 64, 128, 256)
    ANCHOR_STRIDE: (16, )
    BATCH_SIZE_PER_IMAGE: 256
    PRE_NMS_TOP_N_TRAIN: 6000
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TRAIN: 2000
    POST_NMS_TOP_N_TEST: 100
    RPN_HEAD: FBNet.rpn_head
  ROI_HEADS:
    BATCH_SIZE_PER_IMAGE: 512
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 6
    FEATURE_EXTRACTOR: FBNet.roi_head
    NUM_CLASSES: 81
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
SOLVER:
  BASE_LR: 0.06
  WARMUP_FACTOR: 0.1
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 128 # for 8GPUs
# TEST:
#   IMS_PER_BATCH: 8
INPUT:
  MIN_SIZE_TRAIN: (320, )
  MAX_SIZE_TRAIN: 640
  MIN_SIZE_TEST: 320
  MAX_SIZE_TEST: 640
  PIXEL_MEAN: [103.53, 116.28, 123.675]
  PIXEL_STD: [57.375, 57.12, 58.395]

================================================
FILE: configs/e2e_faster_rcnn_fbnet_600.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    CONV_BODY: FBNet
  FBNET:
    ARCH: "default"
    BN_TYPE: "bn"
    WIDTH_DIVISOR: 8
    DW_CONV_SKIP_BN: True
    DW_CONV_SKIP_RELU: True
  RPN:
    ANCHOR_SIZES: (32, 64, 128, 256, 512)
    ANCHOR_STRIDE: (16, )
    BATCH_SIZE_PER_IMAGE: 256
    PRE_NMS_TOP_N_TRAIN: 6000
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TRAIN: 2000
    POST_NMS_TOP_N_TEST: 200
    RPN_HEAD: FBNet.rpn_head
  ROI_HEADS:
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 6
    FEATURE_EXTRACTOR: FBNet.roi_head
    NUM_CLASSES: 81
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
SOLVER:
  BASE_LR: 0.06
  WARMUP_FACTOR: 0.1
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 128 # for 8GPUs
# TEST:
#   IMS_PER_BATCH: 8
INPUT:
  MIN_SIZE_TRAIN: (600, )
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 600
  MAX_SIZE_TEST: 1000
  PIXEL_MEAN: [103.53, 116.28, 123.675]
  PIXEL_STD: [57.375, 57.12, 58.395]

================================================
FILE: configs/e2e_faster_rcnn_fbnet_chamv1a_600.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    CONV_BODY: FBNet
  FBNET:
    ARCH: "cham_v1a"
    BN_TYPE: "bn"
    WIDTH_DIVISOR: 8
    DW_CONV_SKIP_BN: True
    DW_CONV_SKIP_RELU: True
  RPN:
    ANCHOR_SIZES: (32, 64, 128, 256, 512)
    ANCHOR_STRIDE: (16, )
    BATCH_SIZE_PER_IMAGE: 256
    PRE_NMS_TOP_N_TRAIN: 6000
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TRAIN: 2000
    POST_NMS_TOP_N_TEST: 200
    RPN_HEAD: FBNet.rpn_head
  ROI_HEADS:
    BATCH_SIZE_PER_IMAGE: 128
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 6
    FEATURE_EXTRACTOR: FBNet.roi_head
    NUM_CLASSES: 81
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
SOLVER:
  BASE_LR: 0.045
  WARMUP_FACTOR: 0.1
  WEIGHT_DECAY: 0.0001
  STEPS: (90000, 120000)
  MAX_ITER: 135000
  IMS_PER_BATCH: 96 # for 8GPUs
# TEST:
#   IMS_PER_BATCH: 8
INPUT:
  MIN_SIZE_TRAIN: (600, )
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 600
  MAX_SIZE_TEST: 1000
  PIXEL_MEAN: [103.53, 116.28, 123.675]
  PIXEL_STD: [57.375, 57.12, 58.395]

================================================
FILE: configs/e2e_keypoint_rcnn_R_50_FPN_1x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
    NUM_CLASSES: 2
  ROI_KEYPOINT_HEAD:
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    FEATURE_EXTRACTOR: "KeypointRCNNFeatureExtractor"
    PREDICTOR: "KeypointRCNNPredictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 56
    SHARE_BOX_FEATURE_EXTRACTOR: False
  KEYPOINT_ON: True
DATASETS:
  TRAIN: ("keypoints_coco_2014_train", "keypoints_coco_2014_valminusminival",)
  TEST: ("keypoints_coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000

================================================
FILE: configs/e2e_mask_rcnn_R_101_FPN_1x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101"
  BACKBONE:
    CONV_BODY: "R-101-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  ROI_MASK_HEAD:
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor"
    PREDICTOR: "MaskRCNNC4Predictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 28
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000

================================================
FILE: configs/e2e_mask_rcnn_R_50_C4_1x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  RPN:
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TEST: 1000
  ROI_MASK_HEAD:
    PREDICTOR: "MaskRCNNC4Predictor"
    SHARE_BOX_FEATURE_EXTRACTOR: True
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
SOLVER:
  BASE_LR: 0.01
  WEIGHT_DECAY: 0.0001
  STEPS: (120000, 160000)
  MAX_ITER: 180000
  IMS_PER_BATCH: 8

================================================
FILE: configs/e2e_mask_rcnn_R_50_FPN_1x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  ROI_MASK_HEAD:
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor"
    PREDICTOR: "MaskRCNNC4Predictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 28
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000

================================================
FILE: configs/e2e_mask_rcnn_X_101_32x8d_FPN_1x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d"
  BACKBONE:
    CONV_BODY: "R-101-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    NUM_GROUPS: 32
    WIDTH_PER_GROUP: 8
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  ROI_MASK_HEAD:
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor"
    PREDICTOR: "MaskRCNNC4Predictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 28
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.01
  WEIGHT_DECAY: 0.0001
  STEPS: (120000, 160000)
  MAX_ITER: 180000
  IMS_PER_BATCH: 8

================================================
FILE: configs/e2e_mask_rcnn_fbnet.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    CONV_BODY: FBNet
  FBNET:
    ARCH: "default"
    BN_TYPE: "bn"
    WIDTH_DIVISOR: 8
    DW_CONV_SKIP_BN: True
    DW_CONV_SKIP_RELU: True
    DET_HEAD_LAST_SCALE: 0.0
  RPN:
    ANCHOR_SIZES: (16, 32, 64, 128, 256)
    ANCHOR_STRIDE: (16, )
    BATCH_SIZE_PER_IMAGE: 256
    PRE_NMS_TOP_N_TRAIN: 6000
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TRAIN: 2000
    POST_NMS_TOP_N_TEST: 100
    RPN_HEAD: FBNet.rpn_head
  ROI_HEADS:
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 6
    FEATURE_EXTRACTOR: FBNet.roi_head
    NUM_CLASSES: 81
  ROI_MASK_HEAD:
    POOLER_RESOLUTION: 6
    FEATURE_EXTRACTOR: FBNet.roi_head_mask
    PREDICTOR: "MaskRCNNConv1x1Predictor"
    RESOLUTION: 12
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
SOLVER:
  BASE_LR: 0.06
  WARMUP_FACTOR: 0.1
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 128 # for 8GPUs
# TEST:
#   IMS_PER_BATCH: 8
INPUT:
  MIN_SIZE_TRAIN: (320, )
  MAX_SIZE_TRAIN: 640
  MIN_SIZE_TEST: 320
  MAX_SIZE_TEST: 640
  PIXEL_MEAN: [103.53, 116.28, 123.675]
  PIXEL_STD: [57.375, 57.12, 58.395]

================================================
FILE: configs/e2e_mask_rcnn_fbnet_600.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    CONV_BODY: FBNet
  FBNET:
    ARCH: "default"
    BN_TYPE: "bn"
    WIDTH_DIVISOR: 8
    DW_CONV_SKIP_BN: True
    DW_CONV_SKIP_RELU: True
    DET_HEAD_LAST_SCALE: 0.0
  RPN:
    ANCHOR_SIZES: (32, 64, 128, 256, 512)
    ANCHOR_STRIDE: (16, )
    BATCH_SIZE_PER_IMAGE: 256
    PRE_NMS_TOP_N_TRAIN: 6000
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TRAIN: 2000
    POST_NMS_TOP_N_TEST: 200
    RPN_HEAD: FBNet.rpn_head
  ROI_HEADS:
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 6
    FEATURE_EXTRACTOR: FBNet.roi_head
    NUM_CLASSES: 81
  ROI_MASK_HEAD:
    POOLER_RESOLUTION: 6
    FEATURE_EXTRACTOR: FBNet.roi_head_mask
    PREDICTOR: "MaskRCNNConv1x1Predictor"
    RESOLUTION: 12
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
SOLVER:
  BASE_LR: 0.06
  WARMUP_FACTOR: 0.1
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 128 # for 8GPUs
# TEST:
#   IMS_PER_BATCH: 8
INPUT:
  MIN_SIZE_TRAIN: (600, )
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 600
  MAX_SIZE_TEST: 1000
  PIXEL_MEAN: [103.53, 116.28, 123.675]
  PIXEL_STD: [57.375, 57.12, 58.395]

================================================
FILE: configs/e2e_mask_rcnn_fbnet_xirb16d_dsmask.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    CONV_BODY: FBNet
  FBNET:
    ARCH: "xirb16d_dsmask"
    BN_TYPE: "bn"
    WIDTH_DIVISOR: 8
    DW_CONV_SKIP_BN: True
    DW_CONV_SKIP_RELU: True
    DET_HEAD_LAST_SCALE: -1.0
  RPN:
    ANCHOR_SIZES: (16, 32, 64, 128, 256)
    ANCHOR_STRIDE: (16, )
    BATCH_SIZE_PER_IMAGE: 256
    PRE_NMS_TOP_N_TRAIN: 6000
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TRAIN: 2000
    POST_NMS_TOP_N_TEST: 100
    RPN_HEAD: FBNet.rpn_head
  ROI_HEADS:
    BATCH_SIZE_PER_IMAGE: 512
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 6
    FEATURE_EXTRACTOR: FBNet.roi_head
    NUM_CLASSES: 81
  ROI_MASK_HEAD:
    POOLER_RESOLUTION: 6
    FEATURE_EXTRACTOR: FBNet.roi_head_mask
    PREDICTOR: "MaskRCNNConv1x1Predictor"
    RESOLUTION: 12
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
SOLVER:
  BASE_LR: 0.06
  WARMUP_FACTOR: 0.1
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 128 # for 8GPUs
# TEST:
#   IMS_PER_BATCH: 8
INPUT:
  MIN_SIZE_TRAIN: (320, )
  MAX_SIZE_TRAIN: 640
  MIN_SIZE_TEST: 320
  MAX_SIZE_TEST: 640
  PIXEL_MEAN: [103.53, 116.28, 123.675]
  PIXEL_STD: [57.375, 57.12, 58.395]

================================================
FILE: configs/e2e_mask_rcnn_fbnet_xirb16d_dsmask_600.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    CONV_BODY: FBNet
  FBNET:
    ARCH: "xirb16d_dsmask"
    BN_TYPE: "bn"
    WIDTH_DIVISOR: 8
    DW_CONV_SKIP_BN: True
    DW_CONV_SKIP_RELU: True
    DET_HEAD_LAST_SCALE: 0.0
  RPN:
    ANCHOR_SIZES: (32, 64, 128, 256, 512)
    ANCHOR_STRIDE: (16, )
    BATCH_SIZE_PER_IMAGE: 256
    PRE_NMS_TOP_N_TRAIN: 6000
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TRAIN: 2000
    POST_NMS_TOP_N_TEST: 200
    RPN_HEAD: FBNet.rpn_head
  ROI_HEADS:
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 6
    FEATURE_EXTRACTOR: FBNet.roi_head
    NUM_CLASSES: 81
  ROI_MASK_HEAD:
    POOLER_RESOLUTION: 6
    FEATURE_EXTRACTOR: FBNet.roi_head_mask
    PREDICTOR: "MaskRCNNConv1x1Predictor"
    RESOLUTION: 12
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
SOLVER:
  BASE_LR: 0.06
  WARMUP_FACTOR: 0.1
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 128 # for 8GPUs
# TEST:
#   IMS_PER_BATCH: 8
INPUT:
  MIN_SIZE_TRAIN: (600, )
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 600
  MAX_SIZE_TEST: 1000
  PIXEL_MEAN: [103.53, 116.28, 123.675]
  PIXEL_STD: [57.375, 57.12, 58.395]

================================================
FILE: configs/fcos/fcos_R_101_FPN_2x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101"
  RPN_ONLY: True
  FCOS_ON: True
  BACKBONE:
    CONV_BODY: "R-101-FPN-RETINANET"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RETINANET:
    USE_C5: False # FCOS uses P5 instead of C5
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_RANGE_TRAIN: (640, 800)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.01
  WEIGHT_DECAY: 0.0001
  STEPS: (120000, 160000)
  MAX_ITER: 180000
  IMS_PER_BATCH: 16
  WARMUP_METHOD: "constant"

================================================
FILE: configs/fcos/fcos_R_50_FPN_1x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "pretrain_models/R-50.pkl"
  RPN_ONLY: True
  FCOS_ON: True
  BACKBONE:
    CONV_BODY: "R-50-FPN-RETINANET"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RETINANET:
    USE_C5: False # FCOS uses P5 instead of C5
  FCOS:
    CENTER_SAMPLE: False
DATASETS:
  TRAIN: ("coco_2017_train", )
  TEST: ("coco_2017_val", )
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.01
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 16
  WARMUP_METHOD: "constant"

================================================
FILE: configs/fcos/fcos_R_50_FPN_1x_center.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "pretrain_models/R-50.pkl"
  RPN_ONLY: True
  FCOS_ON: True
  BACKBONE:
    CONV_BODY: "R-50-FPN-RETINANET"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RETINANET:
    USE_C5: False # FCOS uses P5 instead of C5
  FCOS:
    CENTER_SAMPLE: True
    POS_RADIUS: 1.5
DATASETS:
  TRAIN: ("coco_2017_train", )
  TEST: ("coco_2017_val", )
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.01
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 16
  WARMUP_METHOD: "constant"

================================================
FILE: configs/fcos/fcos_R_50_FPN_1x_center_giou.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "pretrain_models/R-50.pkl"
  RPN_ONLY: True
  FCOS_ON: True
  BACKBONE:
    CONV_BODY: "R-50-FPN-RETINANET"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RETINANET:
    USE_C5: False # FCOS uses P5 instead of C5
  FCOS:
    CENTER_SAMPLE: True
    POS_RADIUS: 1.5
    LOC_LOSS_TYPE: "giou"
DATASETS:
  TRAIN: ("coco_2017_train", )
  TEST: ("coco_2017_val", )
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.01
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 16
  WARMUP_METHOD: "constant"
META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-64x4d" RPN_ONLY: True FCOS_ON: True BACKBONE: CONV_BODY: "R-101-FPN-RETINANET" RESNETS: STRIDE_IN_1X1: False BACKBONE_OUT_CHANNELS: 256 NUM_GROUPS: 64 WIDTH_PER_GROUP: 4 RETINANET: USE_C5: False # FCOS uses P5 instead of C5 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_RANGE_TRAIN: (640, 800) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (120000, 160000) MAX_ITER: 180000 IMS_PER_BATCH: 16 WARMUP_METHOD: "constant" ================================================ FILE: configs/fcos/fcos_bn_bs16_MNV2_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "https://cloudstor.aarnet.edu.au/plus/s/xtixKaxLWmbcyf7/download#mobilenet_v2-ecbe2b5.pth" RPN_ONLY: True FCOS_ON: True BACKBONE: CONV_BODY: "MNV2-FPN-RETINANET" FREEZE_CONV_BODY_AT: 0 RESNETS: BACKBONE_OUT_CHANNELS: 256 RETINANET: USE_C5: False # FCOS uses P5 instead of C5 USE_SYNCBN: False DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 16 WARMUP_METHOD: "constant" ================================================ FILE: configs/fcos/fcos_syncbn_bs32_MNV2_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "https://cloudstor.aarnet.edu.au/plus/s/xtixKaxLWmbcyf7/download#mobilenet_v2-ecbe2b5.pth" RPN_ONLY: True FCOS_ON: True BACKBONE: CONV_BODY: "MNV2-FPN-RETINANET" FREEZE_CONV_BODY_AT: 0 RESNETS: BACKBONE_OUT_CHANNELS: 256 RETINANET: USE_C5: False # FCOS uses P5 instead of C5 USE_SYNCBN: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 32 WARMUP_METHOD: "constant" ================================================ FILE: configs/fcos/fcos_syncbn_bs32_c128_MNV2_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "https://cloudstor.aarnet.edu.au/plus/s/xtixKaxLWmbcyf7/download#mobilenet_v2-ecbe2b5.pth" RPN_ONLY: True FCOS_ON: True BACKBONE: CONV_BODY: "MNV2-FPN-RETINANET" FREEZE_CONV_BODY_AT: 0 RESNETS: BACKBONE_OUT_CHANNELS: 128 RETINANET: USE_C5: False # FCOS uses P5 instead of C5 USE_SYNCBN: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 32 WARMUP_METHOD: "constant" ================================================ FILE: configs/fcos/fcos_syncbn_bs32_c128_ms_MNV2_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "https://cloudstor.aarnet.edu.au/plus/s/xtixKaxLWmbcyf7/download#mobilenet_v2-ecbe2b5.pth" RPN_ONLY: True FCOS_ON: True BACKBONE: CONV_BODY: 
"MNV2-FPN-RETINANET" FREEZE_CONV_BODY_AT: 0 RESNETS: BACKBONE_OUT_CHANNELS: 128 RETINANET: USE_C5: False # FCOS uses P5 instead of C5 USE_SYNCBN: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_RANGE_TRAIN: (640, 800) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 32 WARMUP_METHOD: "constant" ================================================ FILE: configs/fcos/fcos_syncbn_bs64_c128_ms_MNV2_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "https://cloudstor.aarnet.edu.au/plus/s/xtixKaxLWmbcyf7/download#mobilenet_v2-ecbe2b5.pth" RPN_ONLY: True FCOS_ON: True BACKBONE: CONV_BODY: "MNV2-FPN-RETINANET" FREEZE_CONV_BODY_AT: 0 RESNETS: BACKBONE_OUT_CHANNELS: 128 RETINANET: USE_C5: False # FCOS uses P5 instead of C5 USE_SYNCBN: True DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_RANGE_TRAIN: (640, 800) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.01 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 64 WARMUP_METHOD: "constant" ================================================ FILE: configs/gn_baselines/README.md ================================================ ### Group Normalization 1 [Group Normalization](https://arxiv.org/abs/1803.08494) 2 [Rethinking ImageNet Pre-training](https://arxiv.org/abs/1811.08883) 3 [official code](https://github.com/facebookresearch/Detectron/blob/master/projects/GN/README.md) ### Performance | case | Type | lr schd | im/gpu | bbox AP | mask AP | |----------------------------|:------------:|:---------:|:-------:|:-------:|:-------:| | R-50-FPN, GN (paper) | finetune | 2x | 2 | 40.3 | 35.7 | | R-50-FPN, GN (implement) | finetune | 2x | 2 | 40.2 | 36.0 | | R-50-FPN, GN (paper) | from scratch | 3x | 2 | 39.5 | 35.2 | | R-50-FPN, GN (implement) | from scratch | 3x | 2 | 38.9 | 35.1 | ================================================ FILE: configs/gn_baselines/e2e_faster_rcnn_R_50_FPN_1x_gn.yaml ================================================ INPUT: MIN_SIZE_TRAIN: (800,) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50-GN" BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: # use GN for backbone BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False TRANS_FUNC: "BottleneckWithGN" STEM_FUNC: "StemWithGN" FPN: USE_GN: True # use GN for FPN RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TRAIN: 2000 PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 1000 FPN_POST_NMS_TOP_N_TEST: 1000 ROI_HEADS: USE_FPN: True BATCH_SIZE_PER_IMAGE: 512 POSITIVE_FRACTION: 0.25 ROI_BOX_HEAD: USE_GN: True # use GN for bbox head POOLER_RESOLUTION: 7 POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) POOLER_SAMPLING_RATIO: 2 FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" PREDICTOR: "FPNPredictor" DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 8 gpus BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 IMS_PER_BATCH: 16 TEST: IMS_PER_BATCH: 8 ================================================ FILE: configs/gn_baselines/e2e_faster_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml 
================================================
FILE: configs/gn_baselines/e2e_faster_rcnn_R_50_FPN_1x_gn.yaml
================================================
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50-GN"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS: # use GN for backbone
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    TRANS_FUNC: "BottleneckWithGN"
    STEM_FUNC: "StemWithGN"
  FPN:
    USE_GN: True # use GN for FPN
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 512
    POSITIVE_FRACTION: 0.25
  ROI_BOX_HEAD:
    USE_GN: True # use GN for bbox head
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 8 gpus
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 16
TEST:
  IMS_PER_BATCH: 8

================================================
FILE: configs/gn_baselines/e2e_faster_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml
================================================
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50-GN"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS: # use GN for backbone
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    TRANS_FUNC: "BottleneckWithGN"
    STEM_FUNC: "StemWithGN"
  FPN:
    USE_GN: True # use GN for FPN
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 512
    POSITIVE_FRACTION: 0.25
  ROI_BOX_HEAD:
    USE_GN: True # use GN for bbox head
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    CONV_HEAD_DIM: 256
    NUM_STACKED_CONVS: 4
    FEATURE_EXTRACTOR: "FPNXconv1fcFeatureExtractor"
    PREDICTOR: "FPNPredictor"
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 8 gpus
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 16
TEST:
  IMS_PER_BATCH: 8

================================================
FILE: configs/gn_baselines/e2e_mask_rcnn_R_50_FPN_1x_gn.yaml
================================================
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50-GN"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS: # use GN for backbone
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    TRANS_FUNC: "BottleneckWithGN"
    STEM_FUNC: "StemWithGN"
  FPN:
    USE_GN: True # use GN for FPN
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 512
    POSITIVE_FRACTION: 0.25
  ROI_BOX_HEAD:
    USE_GN: True # use GN for bbox head
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  ROI_MASK_HEAD:
    USE_GN: True # use GN for mask head
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    CONV_LAYERS: (256, 256, 256, 256)
    FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor"
    PREDICTOR: "MaskRCNNC4Predictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 28
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 8 gpus
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 16
TEST:
  IMS_PER_BATCH: 8

================================================
FILE: configs/gn_baselines/e2e_mask_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml
================================================
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50-GN"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS: # use GN for backbone
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    TRANS_FUNC: "BottleneckWithGN"
    STEM_FUNC: "StemWithGN"
  FPN:
    USE_GN: True # use GN for FPN
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 512
    POSITIVE_FRACTION: 0.25
  ROI_BOX_HEAD:
    USE_GN: True # use GN for bbox head
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    CONV_HEAD_DIM: 256
    NUM_STACKED_CONVS: 4
    FEATURE_EXTRACTOR: "FPNXconv1fcFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  ROI_MASK_HEAD:
    USE_GN: True # use GN for mask head
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    CONV_LAYERS: (256, 256, 256, 256)
    FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor"
    PREDICTOR: "MaskRCNNC4Predictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 28
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 8 gpus
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  IMS_PER_BATCH: 16
TEST:
  IMS_PER_BATCH: 8

================================================
FILE: configs/gn_baselines/scratch_e2e_faster_rcnn_R_50_FPN_3x_gn.yaml
================================================
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "" # no pretrained model
  BACKBONE:
    CONV_BODY: "R-50-FPN"
    FREEZE_CONV_BODY_AT: 0 # finetune all layers
  RESNETS: # use GN for backbone
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    TRANS_FUNC: "BottleneckWithGN"
    STEM_FUNC: "StemWithGN"
  FPN:
    USE_GN: True # use GN for FPN
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 512
    POSITIVE_FRACTION: 0.25
  ROI_BOX_HEAD:
    USE_GN: True # use GN for bbox head
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 8 gpus
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (210000, 250000)
  MAX_ITER: 270000
  IMS_PER_BATCH: 16
TEST:
  IMS_PER_BATCH: 8

================================================
FILE: configs/gn_baselines/scratch_e2e_faster_rcnn_R_50_FPN_Xconv1fc_3x_gn.yaml
================================================
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "" # no pretrained model
  BACKBONE:
    CONV_BODY: "R-50-FPN"
    FREEZE_CONV_BODY_AT: 0 # finetune all layers
  RESNETS: # use GN for backbone
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    TRANS_FUNC: "BottleneckWithGN"
    STEM_FUNC: "StemWithGN"
  FPN:
    USE_GN: True # use GN for FPN
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 512
    POSITIVE_FRACTION: 0.25
  ROI_BOX_HEAD:
    USE_GN: True # use GN for bbox head
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    CONV_HEAD_DIM: 256
    NUM_STACKED_CONVS: 4
    FEATURE_EXTRACTOR: "FPNXconv1fcFeatureExtractor"
    PREDICTOR: "FPNPredictor"
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 8 gpus
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (210000, 250000)
  MAX_ITER: 270000
  IMS_PER_BATCH: 16
TEST:
  IMS_PER_BATCH: 8

================================================
FILE: configs/gn_baselines/scratch_e2e_mask_rcnn_R_50_FPN_3x_gn.yaml
================================================
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "" # no pretrained model
  BACKBONE:
    CONV_BODY: "R-50-FPN"
    FREEZE_CONV_BODY_AT: 0 # finetune all layers
  RESNETS: # use GN for backbone
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    TRANS_FUNC: "BottleneckWithGN"
    STEM_FUNC: "StemWithGN"
  FPN:
    USE_GN: True # use GN for FPN
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 512
    POSITIVE_FRACTION: 0.25
  ROI_BOX_HEAD:
    USE_GN: True # use GN for bbox head
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  ROI_MASK_HEAD:
    USE_GN: True # use GN for mask head
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    CONV_LAYERS: (256, 256, 256, 256)
    FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor"
    PREDICTOR: "MaskRCNNC4Predictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 28
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 8 gpus
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (210000, 250000)
  MAX_ITER: 270000
  IMS_PER_BATCH: 16
TEST:
  IMS_PER_BATCH: 8

================================================
FILE: configs/gn_baselines/scratch_e2e_mask_rcnn_R_50_FPN_Xconv1fc_3x_gn.yaml
================================================
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "" # no pretrained model
  BACKBONE:
    CONV_BODY: "R-50-FPN"
    FREEZE_CONV_BODY_AT: 0 # finetune all layers
  RESNETS: # use GN for backbone
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    TRANS_FUNC: "BottleneckWithGN"
    STEM_FUNC: "StemWithGN"
  FPN:
    USE_GN: True # use GN for FPN
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 512
    POSITIVE_FRACTION: 0.25
  ROI_BOX_HEAD:
    USE_GN: True # use GN for bbox head
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    CONV_HEAD_DIM: 256
    NUM_STACKED_CONVS: 4
    FEATURE_EXTRACTOR: "FPNXconv1fcFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  ROI_MASK_HEAD:
    USE_GN: True # use GN for mask head
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    CONV_LAYERS: (256, 256, 256, 256)
    FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor"
    PREDICTOR: "MaskRCNNC4Predictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 28
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 8 gpus
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (210000, 250000)
  MAX_ITER: 270000
  IMS_PER_BATCH: 16
TEST:
  IMS_PER_BATCH: 8

================================================
FILE: configs/pascal_voc/e2e_faster_rcnn_R_50_C4_1x_1_gpu_voc.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  RPN:
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TEST: 300
    ANCHOR_SIZES: (128, 256, 512)
  ROI_BOX_HEAD:
    NUM_CLASSES: 21
DATASETS:
  TRAIN: ("voc_2007_train", "voc_2007_val")
  TEST: ("voc_2007_test",)
SOLVER:
  BASE_LR: 0.001
  WEIGHT_DECAY: 0.0001
  STEPS: (50000, )
  MAX_ITER: 70000
  IMS_PER_BATCH: 1
TEST:
  IMS_PER_BATCH: 1

================================================
FILE: configs/pascal_voc/e2e_faster_rcnn_R_50_C4_1x_4_gpu_voc.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  RPN:
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TEST: 300
    ANCHOR_SIZES: (128, 256, 512)
  ROI_BOX_HEAD:
    NUM_CLASSES: 21
DATASETS:
  TRAIN: ("voc_2007_train", "voc_2007_val")
  TEST: ("voc_2007_test",)
SOLVER:
  BASE_LR: 0.004
  WEIGHT_DECAY: 0.0001
  STEPS: (12500, )
  MAX_ITER: 17500
  IMS_PER_BATCH: 4
TEST:
  IMS_PER_BATCH: 4
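The two VOC schedules above are the same recipe under the linear scaling rule: multiplying the batch size by 4 multiplies `BASE_LR` by 4 and divides the step and iteration counts by 4. A quick check of that arithmetic:

```python
import math

# linear scaling rule relating the 1-gpu and 4-gpu VOC configs above
base_lr, max_iter, step = 0.001, 70000, 50000   # 1-gpu values (IMS_PER_BATCH: 1)
scale = 4                                       # IMS_PER_BATCH goes from 1 to 4

assert math.isclose(base_lr * scale, 0.004)     # BASE_LR in the 4-gpu config
assert max_iter // scale == 17500               # MAX_ITER in the 4-gpu config
assert step // scale == 12500                   # STEPS in the 4-gpu config
```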
================================================
FILE: configs/pascal_voc/e2e_mask_rcnn_R_50_FPN_1x_cocostyle.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
    NUM_CLASSES: 21
  ROI_MASK_HEAD:
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor"
    PREDICTOR: "MaskRCNNC4Predictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 28
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("voc_2012_train_cocostyle",)
  TEST: ("voc_2012_val_cocostyle",)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.01
  WEIGHT_DECAY: 0.0001
  STEPS: (18000,)
  MAX_ITER: 24000

================================================
FILE: configs/quick_schedules/e2e_faster_rcnn_R_50_C4_quick.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  RPN:
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    BATCH_SIZE_PER_IMAGE: 256
DATASETS:
  TRAIN: ("coco_2014_minival",)
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (600,)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1000
SOLVER:
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (1500,)
  MAX_ITER: 2000
  IMS_PER_BATCH: 2
TEST:
  IMS_PER_BATCH: 2

================================================
FILE: configs/quick_schedules/e2e_faster_rcnn_R_50_FPN_quick.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
DATASETS:
  TRAIN: ("coco_2014_minival",)
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (600,)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1000
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (1500,)
  MAX_ITER: 2000
  IMS_PER_BATCH: 4
TEST:
  IMS_PER_BATCH: 2

================================================
FILE: configs/quick_schedules/e2e_faster_rcnn_X_101_32x8d_FPN_quick.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d"
  BACKBONE:
    CONV_BODY: "R-101-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    NUM_GROUPS: 32
    WIDTH_PER_GROUP: 8
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
DATASETS:
  TRAIN: ("coco_2014_minival",)
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (600,)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1000
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (1500,)
  MAX_ITER: 2000
  IMS_PER_BATCH: 2
TEST:
  IMS_PER_BATCH: 2

================================================
FILE: configs/quick_schedules/e2e_keypoint_rcnn_R_50_FPN_quick.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
    NUM_CLASSES: 2
  ROI_KEYPOINT_HEAD:
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    FEATURE_EXTRACTOR: "KeypointRCNNFeatureExtractor"
    PREDICTOR: "KeypointRCNNPredictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 56
    SHARE_BOX_FEATURE_EXTRACTOR: False
  KEYPOINT_ON: True
DATASETS:
  TRAIN: ("keypoints_coco_2014_minival",)
  TEST: ("keypoints_coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1000
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (1500,)
  MAX_ITER: 2000
  IMS_PER_BATCH: 4
TEST:
  IMS_PER_BATCH: 2

================================================
FILE: configs/quick_schedules/e2e_mask_rcnn_R_50_C4_quick.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  RPN:
    PRE_NMS_TOP_N_TEST: 6000
    POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    BATCH_SIZE_PER_IMAGE: 256
  ROI_MASK_HEAD:
    PREDICTOR: "MaskRCNNC4Predictor"
    SHARE_BOX_FEATURE_EXTRACTOR: True
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_minival",)
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (600,)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1000
SOLVER:
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (1500,)
  MAX_ITER: 2000
  IMS_PER_BATCH: 4
TEST:
  IMS_PER_BATCH: 2

================================================
FILE: configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_quick.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  ROI_MASK_HEAD:
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor"
    PREDICTOR: "MaskRCNNC4Predictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 28
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_minival",)
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (600,)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1000
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (1500,)
  MAX_ITER: 2000
  IMS_PER_BATCH: 4
TEST:
  IMS_PER_BATCH: 2

================================================
FILE: configs/quick_schedules/e2e_mask_rcnn_X_101_32x8d_FPN_quick.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d"
  BACKBONE:
    CONV_BODY: "R-101-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    NUM_GROUPS: 32
    WIDTH_PER_GROUP: 8
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  ROI_MASK_HEAD:
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor"
    PREDICTOR: "MaskRCNNC4Predictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 28
    SHARE_BOX_FEATURE_EXTRACTOR: False
  MASK_ON: True
DATASETS:
  TRAIN: ("coco_2014_minival",)
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (600,)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1000
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (1500,)
  MAX_ITER: 2000
  IMS_PER_BATCH: 2
TEST:
  IMS_PER_BATCH: 2

================================================
FILE: configs/quick_schedules/rpn_R_50_C4_quick.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  RPN_ONLY: True
  RPN:
    PRE_NMS_TOP_N_TEST: 12000
    POST_NMS_TOP_N_TEST: 2000
DATASETS:
  TRAIN: ("coco_2014_minival",)
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (600,)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1000
SOLVER:
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (1500,)
  MAX_ITER: 2000
  IMS_PER_BATCH: 4
TEST:
  IMS_PER_BATCH: 2

================================================
FILE: configs/quick_schedules/rpn_R_50_FPN_quick.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  RPN_ONLY: True
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 2000
    FPN_POST_NMS_TOP_N_TEST: 2000
DATASETS:
  TRAIN: ("coco_2014_minival",)
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (600,)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1000
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (1500,)
  MAX_ITER: 2000
  IMS_PER_BATCH: 4
TEST:
  IMS_PER_BATCH: 2

================================================
FILE: configs/retinanet/retinanet_R-101-FPN_1x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101"
  RPN_ONLY: True
  RETINANET_ON: True
  BACKBONE:
    CONV_BODY: "R-101-FPN-RETINANET"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  RETINANET:
    SCALES_PER_OCTAVE: 3
    STRADDLE_THRESH: -1
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (800, )
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 4 gpus
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (120000, 160000)
  MAX_ITER: 180000
  IMS_PER_BATCH: 8

================================================
FILE: configs/retinanet/retinanet_R-101-FPN_P5_1x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101"
  RPN_ONLY: True
  RETINANET_ON: True
  BACKBONE:
    CONV_BODY: "R-101-FPN-RETINANET"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  RETINANET:
    SCALES_PER_OCTAVE: 3
    STRADDLE_THRESH: -1
    USE_C5: False
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (800, )
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 4 gpus
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (120000, 160000)
  MAX_ITER: 180000
  IMS_PER_BATCH: 8

================================================
FILE: configs/retinanet/retinanet_R-50-FPN_1x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  RPN_ONLY: True
  RETINANET_ON: True
  BACKBONE:
    CONV_BODY: "R-50-FPN-RETINANET"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  RETINANET:
    SCALES_PER_OCTAVE: 3
    STRADDLE_THRESH: -1
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 4 gpus
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (120000, 160000)
  MAX_ITER: 180000
  IMS_PER_BATCH: 8

================================================
FILE: configs/retinanet/retinanet_R-50-FPN_1x_quick.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  RPN_ONLY: True
  RETINANET_ON: True
  BACKBONE:
    CONV_BODY: "R-50-FPN-RETINANET"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  RETINANET:
    SCALES_PER_OCTAVE: 3
    STRADDLE_THRESH: -1
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
DATASETS:
  TRAIN: ("coco_2014_minival",)
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (600,)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1000
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (3500,)
  MAX_ITER: 4000
  IMS_PER_BATCH: 4

================================================
FILE: configs/retinanet/retinanet_R-50-FPN_P5_1x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  RPN_ONLY: True
  RETINANET_ON: True
  BACKBONE:
    CONV_BODY: "R-50-FPN-RETINANET"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  RETINANET:
    SCALES_PER_OCTAVE: 3
    STRADDLE_THRESH: -1
    USE_C5: False
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (800,)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 4 gpus
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (120000, 160000)
  MAX_ITER: 180000
  IMS_PER_BATCH: 8

================================================
FILE: configs/retinanet/retinanet_X_101_32x8d_FPN_1x.yaml
================================================
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d"
  RPN_ONLY: True
  RETINANET_ON: True
  BACKBONE:
    CONV_BODY: "R-101-FPN-RETINANET"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
    STRIDE_IN_1X1: False
    NUM_GROUPS: 32
    WIDTH_PER_GROUP: 8
  RPN:
    USE_FPN: True
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
  RETINANET:
    SCALES_PER_OCTAVE: 3
    STRADDLE_THRESH: -1
    FG_IOU_THRESHOLD: 0.5
    BG_IOU_THRESHOLD: 0.4
DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (800, )
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER: # Assume 4 gpus
  BASE_LR: 0.0025
  WEIGHT_DECAY: 0.0001
  STEPS: (240000, 320000)
  MAX_ITER: 360000
  IMS_PER_BATCH: 4
PREDICTOR: "FPNPredictor" RETINANET: SCALES_PER_OCTAVE: 3 STRADDLE_THRESH: -1 FG_IOU_THRESHOLD: 0.5 BG_IOU_THRESHOLD: 0.4 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) INPUT: MIN_SIZE_TRAIN: (800, ) MAX_SIZE_TRAIN: 1333 MIN_SIZE_TEST: 800 MAX_SIZE_TEST: 1333 DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: # Assume 4 gpus BASE_LR: 0.0025 WEIGHT_DECAY: 0.0001 STEPS: (240000, 320000) MAX_ITER: 360000 IMS_PER_BATCH: 4 ================================================ FILE: configs/rpn_R_101_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" RPN_ONLY: True BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 2000 FPN_POST_NMS_TOP_N_TEST: 2000 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/rpn_R_50_C4_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN_ONLY: True RPN: PRE_NMS_TOP_N_TEST: 12000 POST_NMS_TOP_N_TEST: 2000 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/rpn_R_50_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" RPN_ONLY: True BACKBONE: CONV_BODY: "R-50-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 2000 FPN_POST_NMS_TOP_N_TEST: 2000 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: configs/rpn_X_101_32x8d_FPN_1x.yaml ================================================ MODEL: META_ARCHITECTURE: "GeneralizedRCNN" WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" RPN_ONLY: True BACKBONE: CONV_BODY: "R-101-FPN" RESNETS: BACKBONE_OUT_CHANNELS: 256 STRIDE_IN_1X1: False NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 RPN: USE_FPN: True ANCHOR_STRIDE: (4, 8, 16, 32, 64) PRE_NMS_TOP_N_TEST: 1000 POST_NMS_TOP_N_TEST: 2000 FPN_POST_NMS_TOP_N_TEST: 2000 DATASETS: TRAIN: ("coco_2014_train", "coco_2014_valminusminival") TEST: ("coco_2014_minival",) DATALOADER: SIZE_DIVISIBILITY: 32 SOLVER: BASE_LR: 0.02 WEIGHT_DECAY: 0.0001 STEPS: (60000, 80000) MAX_ITER: 90000 ================================================ FILE: demo/README.md ================================================ ## Webcam and Jupyter notebook demo This folder contains a simple webcam demo that illustrates how you can use `maskrcnn_benchmark` for inference. 
### With your preferred environment You can start it by running it from this folder, using one of the following commands: ```bash # by default, it runs on the GPU # for best results, use min-image-size 800 python webcam.py --min-image-size 800 # can also run it on the CPU python webcam.py --min-image-size 300 MODEL.DEVICE cpu # or change the model that you want to use python webcam.py --config-file ../configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml --min-image-size 300 MODEL.DEVICE cpu # in order to see the probability heatmaps, pass --show-mask-heatmaps python webcam.py --min-image-size 300 --show-mask-heatmaps MODEL.DEVICE cpu ``` ### With Docker Build the image with the tag `maskrcnn-benchmark` (check [INSTALL.md](../INSTALL.md) for instructions) Adjust permissions of the X server host (be careful with this step, refer to [here](http://wiki.ros.org/docker/Tutorials/GUI) for alternatives) ```bash xhost + ``` Then run a container with the demo: ``` docker run --rm -it \ -e DISPLAY=${DISPLAY} \ --privileged \ -v /tmp/.X11-unix:/tmp/.X11-unix \ --device=/dev/video0:/dev/video0 \ --ipc=host maskrcnn-benchmark \ python demo/webcam.py --min-image-size 300 \ --config-file configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml ``` **DISCLAIMER:** *This was tested for an Ubuntu 16.04 machine, the volume mapping may vary depending on your platform* ================================================ FILE: demo/fcos_demo.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import argparse import cv2, os from maskrcnn_benchmark.config import cfg from predictor import COCODemo import time def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Webcam Demo") parser.add_argument( "--config-file", default="configs/fcos/fcos_R_50_FPN_1x.yaml", metavar="FILE", help="path to config file", ) parser.add_argument( "--weights", default="FCOS_R_50_FPN_1x.pth", metavar="FILE", help="path to the trained model", ) parser.add_argument( "--images-dir", default="demo/images", metavar="DIR", help="path to demo images directory", ) parser.add_argument( "--min-image-size", type=int, default=800, help="Smallest size of the image to feed to the model. " "Model was trained with 800, which gives best results", ) parser.add_argument( "opts", help="Modify model config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() # load config from file and command-line arguments cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.MODEL.WEIGHT = args.weights cfg.freeze() # The following per-class thresholds are computed by maximizing # per-class f-measure in their precision-recall curve. # Please see compute_thresholds_for_classes() in coco_eval.py for details. 
thresholds_for_classes = [ 0.23860901594161987, 0.24108672142028809, 0.2470853328704834, 0.2316885143518448, 0.2708061933517456, 0.23173952102661133, 0.31990334391593933, 0.21302376687526703, 0.20151866972446442, 0.20928964018821716, 0.3793887197971344, 0.2715213894844055, 0.2836397588253021, 0.26449233293533325, 0.1728038638830185, 0.314998596906662, 0.28575003147125244, 0.28987520933151245, 0.2727000117301941, 0.23306897282600403, 0.265937477350235, 0.32663893699645996, 0.27102580666542053, 0.29177549481391907, 0.2043062448501587, 0.24331751465797424, 0.20752687752246857, 0.22951272130012512, 0.22753854095935822, 0.2159966081380844, 0.1993938684463501, 0.23676514625549316, 0.20982342958450317, 0.18315598368644714, 0.2489681988954544, 0.24793922901153564, 0.287187397480011, 0.23045086860656738, 0.2462811917066574, 0.21191294491291046, 0.22845126688480377, 0.24365000426769257, 0.22687821090221405, 0.18365581333637238, 0.2035856395959854, 0.23478077352046967, 0.18431290984153748, 0.18184082210063934, 0.2708037495613098, 0.2268175482749939, 0.19970566034317017, 0.21832780539989471, 0.21120598912239075, 0.270445853471756, 0.189377561211586, 0.2101106345653534, 0.2112293541431427, 0.23484709858894348, 0.22701986134052277, 0.20732736587524414, 0.1953316181898117, 0.3237660229206085, 0.3078872859477997, 0.2881140112876892, 0.38746657967567444, 0.20038367807865143, 0.28123822808265686, 0.2588447630405426, 0.2796839773654938, 0.266757994890213, 0.3266656696796417, 0.25759157538414, 0.2578003704547882, 0.17009201645851135, 0.29051828384399414, 0.24002137780189514, 0.22378061711788177, 0.26134759187698364, 0.1730124056339264, 0.1857597529888153 ] demo_im_names = os.listdir(args.images_dir) # prepare object that handles inference plus adds predictions on top of image coco_demo = COCODemo( cfg, confidence_thresholds_for_classes=thresholds_for_classes, min_image_size=args.min_image_size ) for im_name in demo_im_names: img = cv2.imread(os.path.join(args.images_dir, im_name)) if img is None: continue start_time = time.time() composite = coco_demo.run_on_opencv_image(img) print("{}\tinference time: {:.2f}s".format(im_name, time.time() - start_time)) cv2.imshow(im_name, composite) print("Press any keys to exit ...") cv2.waitKey() cv2.destroyAllWindows() if __name__ == "__main__": main() ================================================ FILE: demo/predictor.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
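The hard-coded list in `fcos_demo.py` above comes from `compute_thresholds_for_classes()` in `coco_eval.py`, which, for each class, picks the detection score whose operating point maximizes F-measure along the COCO precision-recall curve. A minimal sketch of that selection rule (the array names and the toy curve are illustrative, not the repository's exact code):

```python
import numpy as np

def threshold_for_class(precision, recall, scores, eps=1e-6):
    """Pick the score whose operating point maximizes F1 on a PR curve.

    precision, recall, scores: 1-D arrays of equal length describing the
    same operating points (illustrative stand-ins for COCOeval outputs).
    """
    f1 = 2 * precision * recall / (precision + recall + eps)
    return scores[np.argmax(f1)]

# toy PR curve: lowering the threshold raises recall and lowers precision
precision = np.array([0.9, 0.8, 0.6, 0.4])
recall = np.array([0.2, 0.5, 0.7, 0.8])
scores = np.array([0.9, 0.7, 0.5, 0.3])
print(threshold_for_class(precision, recall, scores))  # 0.5 for this toy curve
```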
import cv2 import torch from torchvision import transforms as T from maskrcnn_benchmark.modeling.detector import build_detection_model from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer from maskrcnn_benchmark.structures.image_list import to_image_list from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker from maskrcnn_benchmark import layers as L from maskrcnn_benchmark.utils import cv2_util class COCODemo(object): # COCO categories for pretty print CATEGORIES = [ "__background", "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", ] def __init__( self, cfg, confidence_thresholds_for_classes, show_mask_heatmaps=False, masks_per_dim=2, min_image_size=224, ): self.cfg = cfg.clone() self.model = build_detection_model(cfg) self.model.eval() self.device = torch.device(cfg.MODEL.DEVICE) self.model.to(self.device) self.min_image_size = min_image_size save_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, self.model, save_dir=save_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) self.transforms = self.build_transform() mask_threshold = -1 if show_mask_heatmaps else 0.5 self.masker = Masker(threshold=mask_threshold, padding=1) # used to make colors for each class self.palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1]) self.cpu_device = torch.device("cpu") self.confidence_thresholds_for_classes = torch.tensor(confidence_thresholds_for_classes) self.show_mask_heatmaps = show_mask_heatmaps self.masks_per_dim = masks_per_dim def build_transform(self): """ Creates a basic transformation that was used to train the models """ cfg = self.cfg # we are loading images with OpenCV, so we don't need to convert them # to BGR, they are already! So all we need to do is to normalize # by 255 if we want to convert to BGR255 format, or flip the channels # if we want it to be in RGB in [0-1] range. if cfg.INPUT.TO_BGR255: to_bgr_transform = T.Lambda(lambda x: x * 255) else: to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]]) normalize_transform = T.Normalize( mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD ) transform = T.Compose( [ T.ToPILImage(), T.Resize(self.min_image_size), T.ToTensor(), to_bgr_transform, normalize_transform, ] ) return transform def run_on_opencv_image(self, image): """ Arguments: image (np.ndarray): an image as returned by OpenCV Returns: prediction (BoxList): the detected objects. 
Additional information of the detection properties can be found in the fields of the BoxList via `prediction.fields()` """ predictions = self.compute_prediction(image) top_predictions = self.select_top_predictions(predictions) result = image.copy() if self.show_mask_heatmaps: return self.create_mask_montage(result, top_predictions) result = self.overlay_boxes(result, top_predictions) if self.cfg.MODEL.MASK_ON: result = self.overlay_mask(result, top_predictions) if self.cfg.MODEL.KEYPOINT_ON: result = self.overlay_keypoints(result, top_predictions) result = self.overlay_class_names(result, top_predictions) return result def compute_prediction(self, original_image): """ Arguments: original_image (np.ndarray): an image as returned by OpenCV Returns: prediction (BoxList): the detected objects. Additional information of the detection properties can be found in the fields of the BoxList via `prediction.fields()` """ # apply pre-processing to image image = self.transforms(original_image) # convert to an ImageList, padded so that it is divisible by # cfg.DATALOADER.SIZE_DIVISIBILITY image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY) image_list = image_list.to(self.device) # compute predictions with torch.no_grad(): predictions = self.model(image_list) predictions = [o.to(self.cpu_device) for o in predictions] # only a single image is passed at a time prediction = predictions[0] # reshape prediction (a BoxList) into the original image size height, width = original_image.shape[:-1] prediction = prediction.resize((width, height)) if prediction.has_field("mask"): # if we have masks, paste the masks in the right position # in the image, as defined by the bounding boxes masks = prediction.get_field("mask") # only a single image is passed at a time masks = self.masker([masks], [prediction])[0] prediction.add_field("mask", masks) return prediction def select_top_predictions(self, predictions): """ Select only predictions whose `score` exceeds the corresponding per-class threshold in `self.confidence_thresholds_for_classes`, and return the predictions in descending order of score Arguments: predictions (BoxList): the result of the computation by the model. It should contain the field `scores`. Returns: prediction (BoxList): the detected objects. Additional information of the detection properties can be found in the fields of the BoxList via `prediction.fields()` """ scores = predictions.get_field("scores") labels = predictions.get_field("labels") thresholds = self.confidence_thresholds_for_classes[(labels - 1).long()] keep = torch.nonzero(scores > thresholds).squeeze(1) predictions = predictions[keep] scores = predictions.get_field("scores") _, idx = scores.sort(0, descending=True) return predictions[idx] def compute_colors_for_labels(self, labels): """ Simple function that adds fixed colors depending on the class """ colors = labels[:, None] * self.palette colors = (colors % 255).numpy().astype("uint8") return colors def overlay_boxes(self, image, predictions): """ Adds the predicted boxes on top of the image Arguments: image (np.ndarray): an image as returned by OpenCV predictions (BoxList): the result of the computation by the model. It should contain the field `labels`.
""" labels = predictions.get_field("labels") boxes = predictions.bbox colors = self.compute_colors_for_labels(labels).tolist() for box, color in zip(boxes, colors): box = box.to(torch.int64) top_left, bottom_right = box[:2].tolist(), box[2:].tolist() image = cv2.rectangle( image, tuple(top_left), tuple(bottom_right), tuple(color), 2 ) return image def overlay_mask(self, image, predictions): """ Adds the instances contours for each predicted object. Each label has a different color. Arguments: image (np.ndarray): an image as returned by OpenCV predictions (BoxList): the result of the computation by the model. It should contain the field `mask` and `labels`. """ masks = predictions.get_field("mask").numpy() labels = predictions.get_field("labels") colors = self.compute_colors_for_labels(labels).tolist() for mask, color in zip(masks, colors): thresh = mask[0, :, :, None] contours, hierarchy = cv2_util.findContours( thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE ) image = cv2.drawContours(image, contours, -1, color, 3) composite = image return composite def overlay_keypoints(self, image, predictions): keypoints = predictions.get_field("keypoints") kps = keypoints.keypoints scores = keypoints.get_field("logits") kps = torch.cat((kps[:, :, 0:2], scores[:, :, None]), dim=2).numpy() for region in kps: image = vis_keypoints(image, region.transpose((1, 0))) return image def create_mask_montage(self, image, predictions): """ Create a montage showing the probability heatmaps for each one one of the detected objects Arguments: image (np.ndarray): an image as returned by OpenCV predictions (BoxList): the result of the computation by the model. It should contain the field `mask`. """ masks = predictions.get_field("mask") masks_per_dim = self.masks_per_dim masks = L.interpolate( masks.float(), scale_factor=1 / masks_per_dim ).byte() height, width = masks.shape[-2:] max_masks = masks_per_dim ** 2 masks = masks[:max_masks] # handle case where we have less detections than max_masks if len(masks) < max_masks: masks_padded = torch.zeros(max_masks, 1, height, width, dtype=torch.uint8) masks_padded[: len(masks)] = masks masks = masks_padded masks = masks.reshape(masks_per_dim, masks_per_dim, height, width) result = torch.zeros( (masks_per_dim * height, masks_per_dim * width), dtype=torch.uint8 ) for y in range(masks_per_dim): start_y = y * height end_y = (y + 1) * height for x in range(masks_per_dim): start_x = x * width end_x = (x + 1) * width result[start_y:end_y, start_x:end_x] = masks[y, x] return cv2.applyColorMap(result.numpy(), cv2.COLORMAP_JET) def overlay_class_names(self, image, predictions): """ Adds detected class names and scores in the positions defined by the top-left corner of the predicted bounding box Arguments: image (np.ndarray): an image as returned by OpenCV predictions (BoxList): the result of the computation by the model. It should contain the field `scores` and `labels`. """ scores = predictions.get_field("scores").tolist() labels = predictions.get_field("labels").tolist() labels = [self.CATEGORIES[i] for i in labels] boxes = predictions.bbox template = "{}: {:.2f}" for box, score, label in zip(boxes, scores, labels): x, y = box[:2] s = template.format(label, score) cv2.putText( image, s, (x, y), cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1 ) return image import numpy as np import matplotlib.pyplot as plt from maskrcnn_benchmark.structures.keypoint import PersonKeypoints def vis_keypoints(img, kps, kp_thresh=2, alpha=0.7): """Visualizes keypoints (adapted from vis_one_image). 
kps has shape (4, #keypoints) where 4 rows are (x, y, logit, prob). """ dataset_keypoints = PersonKeypoints.NAMES kp_lines = PersonKeypoints.CONNECTIONS # Convert from plt 0-1 RGBA colors to 0-255 BGR colors for opencv. cmap = plt.get_cmap('rainbow') colors = [cmap(i) for i in np.linspace(0, 1, len(kp_lines) + 2)] colors = [(c[2] * 255, c[1] * 255, c[0] * 255) for c in colors] # Perform the drawing on a copy of the image, to allow for blending. kp_mask = np.copy(img) # Draw mid shoulder / mid hip first for better visualization. mid_shoulder = ( kps[:2, dataset_keypoints.index('right_shoulder')] + kps[:2, dataset_keypoints.index('left_shoulder')]) / 2.0 sc_mid_shoulder = np.minimum( kps[2, dataset_keypoints.index('right_shoulder')], kps[2, dataset_keypoints.index('left_shoulder')]) mid_hip = ( kps[:2, dataset_keypoints.index('right_hip')] + kps[:2, dataset_keypoints.index('left_hip')]) / 2.0 sc_mid_hip = np.minimum( kps[2, dataset_keypoints.index('right_hip')], kps[2, dataset_keypoints.index('left_hip')]) nose_idx = dataset_keypoints.index('nose') if sc_mid_shoulder > kp_thresh and kps[2, nose_idx] > kp_thresh: cv2.line( kp_mask, tuple(mid_shoulder), tuple(kps[:2, nose_idx]), color=colors[len(kp_lines)], thickness=2, lineType=cv2.LINE_AA) if sc_mid_shoulder > kp_thresh and sc_mid_hip > kp_thresh: cv2.line( kp_mask, tuple(mid_shoulder), tuple(mid_hip), color=colors[len(kp_lines) + 1], thickness=2, lineType=cv2.LINE_AA) # Draw the keypoints. for l in range(len(kp_lines)): i1 = kp_lines[l][0] i2 = kp_lines[l][1] p1 = kps[0, i1], kps[1, i1] p2 = kps[0, i2], kps[1, i2] if kps[2, i1] > kp_thresh and kps[2, i2] > kp_thresh: cv2.line( kp_mask, p1, p2, color=colors[l], thickness=2, lineType=cv2.LINE_AA) if kps[2, i1] > kp_thresh: cv2.circle( kp_mask, p1, radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) if kps[2, i2] > kp_thresh: cv2.circle( kp_mask, p2, radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) # Blend the keypoints. return cv2.addWeighted(img, 1.0 - alpha, kp_mask, alpha, 0) ================================================ FILE: demo/webcam.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import argparse import cv2 from maskrcnn_benchmark.config import cfg from predictor import COCODemo import time def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Webcam Demo") parser.add_argument( "--config-file", default="../configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument( "--confidence-threshold", type=float, default=0.7, help="Minimum score for the prediction to be shown", ) parser.add_argument( "--min-image-size", type=int, default=224, help="Smallest size of the image to feed to the model. 
" "Model was trained with 800, which gives best results", ) parser.add_argument( "--show-mask-heatmaps", dest="show_mask_heatmaps", help="Show a heatmap probability for the top masks-per-dim masks", action="store_true", ) parser.add_argument( "--masks-per-dim", type=int, default=2, help="Number of heatmaps per dimension to show", ) parser.add_argument( "opts", help="Modify model config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() # load config from file and command-line arguments cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() # prepare object that handles inference plus adds predictions on top of image coco_demo = COCODemo( cfg, confidence_threshold=args.confidence_threshold, show_mask_heatmaps=args.show_mask_heatmaps, masks_per_dim=args.masks_per_dim, min_image_size=args.min_image_size, ) cam = cv2.VideoCapture(0) while True: start_time = time.time() ret_val, img = cam.read() composite = coco_demo.run_on_opencv_image(img) print("Time: {:.2f} s / img".format(time.time() - start_time)) cv2.imshow("COCO detections", composite) if cv2.waitKey(1) == 27: break # esc to quit cv2.destroyAllWindows() if __name__ == "__main__": main() ================================================ FILE: docker/Dockerfile ================================================ ARG CUDA="9.0" ARG CUDNN="7" FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu16.04 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections # install basics RUN apt-get update -y \ && apt-get install -y apt-utils git curl ca-certificates bzip2 cmake tree htop bmon iotop g++ \ && apt-get install -y libglib2.0-0 libsm6 libxext6 libxrender-dev # Install Miniconda RUN curl -so /miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ && chmod +x /miniconda.sh \ && /miniconda.sh -b -p /miniconda \ && rm /miniconda.sh ENV PATH=/miniconda/bin:$PATH # Create a Python 3.6 environment RUN /miniconda/bin/conda install -y conda-build \ && /miniconda/bin/conda create -y --name py36 python=3.6.7 \ && /miniconda/bin/conda clean -ya ENV CONDA_DEFAULT_ENV=py36 ENV CONDA_PREFIX=/miniconda/envs/$CONDA_DEFAULT_ENV ENV PATH=$CONDA_PREFIX/bin:$PATH ENV CONDA_AUTO_UPDATE_CONDA=false RUN conda install -y ipython RUN pip install ninja yacs cython matplotlib opencv-python tqdm # Install PyTorch 1.0 Nightly ARG CUDA RUN conda install pytorch-nightly cudatoolkit=${CUDA} -c pytorch \ && conda clean -ya # Install TorchVision master RUN git clone https://github.com/pytorch/vision.git \ && cd vision \ && python setup.py install # install pycocotools RUN git clone https://github.com/cocodataset/cocoapi.git \ && cd cocoapi/PythonAPI \ && python setup.py build_ext install # install PyTorch Detection ARG FORCE_CUDA="1" ENV FORCE_CUDA=${FORCE_CUDA} RUN git clone https://github.com/facebookresearch/maskrcnn-benchmark.git \ && cd maskrcnn-benchmark \ && python setup.py build develop WORKDIR /maskrcnn-benchmark ================================================ FILE: docker/docker-jupyter/Dockerfile ================================================ ARG CUDA="9.0" ARG CUDNN="7" FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu16.04 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections # install basics RUN apt-get update -y \ && apt-get install -y apt-utils git curl ca-certificates bzip2 cmake tree htop bmon iotop g++ # Install Miniconda RUN curl -so /miniconda.sh 
https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ && chmod +x /miniconda.sh \ && /miniconda.sh -b -p /miniconda \ && rm /miniconda.sh ENV PATH=/miniconda/bin:$PATH # Create a Python 3.6 environment RUN /miniconda/bin/conda install -y conda-build \ && /miniconda/bin/conda create -y --name py36 python=3.6.7 \ && /miniconda/bin/conda clean -ya ENV CONDA_DEFAULT_ENV=py36 ENV CONDA_PREFIX=/miniconda/envs/$CONDA_DEFAULT_ENV ENV PATH=$CONDA_PREFIX/bin:$PATH ENV CONDA_AUTO_UPDATE_CONDA=false RUN conda install -y ipython RUN pip install ninja yacs cython matplotlib jupyter # Install PyTorch 1.0 Nightly and OpenCV RUN conda install -y pytorch-nightly -c pytorch \ && conda install -y opencv -c menpo \ && conda clean -ya WORKDIR /root USER root RUN mkdir /notebooks WORKDIR /notebooks # Install TorchVision master RUN git clone https://github.com/pytorch/vision.git \ && cd vision \ && python setup.py install # install pycocotools RUN git clone https://github.com/cocodataset/cocoapi.git \ && cd cocoapi/PythonAPI \ && python setup.py build_ext install # install PyTorch Detection RUN git clone https://github.com/facebookresearch/maskrcnn-benchmark.git \ && cd maskrcnn-benchmark \ && python setup.py build develop RUN jupyter notebook --generate-config ENV CONFIG_PATH="/root/.jupyter/jupyter_notebook_config.py" COPY "jupyter_notebook_config.py" ${CONFIG_PATH} ENTRYPOINT ["sh", "-c", "jupyter notebook --allow-root -y --no-browser --ip=0.0.0.0 --config=${CONFIG_PATH}"] ================================================ FILE: docker/docker-jupyter/jupyter_notebook_config.py ================================================ import os from IPython.lib import passwd #c = c # pylint:disable=undefined-variable c = get_config() c.NotebookApp.ip = '0.0.0.0' c.NotebookApp.port = int(os.getenv('PORT', 8888)) c.NotebookApp.open_browser = False # sets a password if PASSWORD is set in the environment if 'PASSWORD' in os.environ: password = os.environ['PASSWORD'] if password: c.NotebookApp.password = passwd(password) else: c.NotebookApp.password = '' c.NotebookApp.token = '' del os.environ['PASSWORD'] ================================================ FILE: maskrcnn_benchmark/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. ================================================ FILE: maskrcnn_benchmark/config/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .defaults import _C as cfg ================================================ FILE: maskrcnn_benchmark/config/defaults.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import os from yacs.config import CfgNode as CN # ----------------------------------------------------------------------------- # Convention about Training / Test specific parameters # ----------------------------------------------------------------------------- # Whenever an argument can be either used for training or for testing, the # corresponding name will be post-fixed by a _TRAIN for a training parameter, # or _TEST for a test-specific parameter. 
# For example, the number of images during training will be # IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be # IMAGES_PER_BATCH_TEST # ----------------------------------------------------------------------------- # Config definition # ----------------------------------------------------------------------------- _C = CN() _C.MODEL = CN() _C.MODEL.RPN_ONLY = False _C.MODEL.MASK_ON = False _C.MODEL.FCOS_ON = True _C.MODEL.RETINANET_ON = False _C.MODEL.KEYPOINT_ON = False _C.MODEL.DEVICE = "cuda" _C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN" _C.MODEL.CLS_AGNOSTIC_BBOX_REG = False # If the WEIGHT starts with a catalog://, like :R-50, the code will look for # the path in paths_catalog. Else, it will use it as the specified absolute # path _C.MODEL.WEIGHT = "" _C.MODEL.USE_SYNCBN = False # ----------------------------------------------------------------------------- # INPUT # ----------------------------------------------------------------------------- _C.INPUT = CN() # Size of the smallest side of the image during training _C.INPUT.MIN_SIZE_TRAIN = (800,) # (800,) # The range of the smallest side for multi-scale training _C.INPUT.MIN_SIZE_RANGE_TRAIN = (-1, -1) # -1 means disabled and it will use MIN_SIZE_TRAIN # Maximum size of the side of the image during training _C.INPUT.MAX_SIZE_TRAIN = 1333 # Size of the smallest side of the image during testing _C.INPUT.MIN_SIZE_TEST = 800 # Maximum size of the side of the image during testing _C.INPUT.MAX_SIZE_TEST = 1333 # Values to be used for image normalization _C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717] # Values to be used for image normalization _C.INPUT.PIXEL_STD = [1., 1., 1.] # Convert image to BGR format (for Caffe2 models), in range 0-255 _C.INPUT.TO_BGR255 = True # ----------------------------------------------------------------------------- # Dataset # ----------------------------------------------------------------------------- _C.DATASETS = CN() # List of the dataset names for training, as present in paths_catalog.py _C.DATASETS.TRAIN = () # List of the dataset names for testing, as present in paths_catalog.py _C.DATASETS.TEST = () # ----------------------------------------------------------------------------- # DataLoader # ----------------------------------------------------------------------------- _C.DATALOADER = CN() # Number of data loading threads _C.DATALOADER.NUM_WORKERS = 4 # If > 0, this enforces that each collated batch should have a size divisible # by SIZE_DIVISIBILITY _C.DATALOADER.SIZE_DIVISIBILITY = 0 # If True, each batch should contain only images for which the aspect ratio # is compatible. This groups portrait images together, and landscape images # are not batched with portrait images. 
_C.DATALOADER.ASPECT_RATIO_GROUPING = True # ---------------------------------------------------------------------------- # # Backbone options # ---------------------------------------------------------------------------- # _C.MODEL.BACKBONE = CN() # The backbone conv body to use # The string must match a function that is imported in modeling.model_builder # (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN # backbone) _C.MODEL.BACKBONE.CONV_BODY = "R-50-C4" # Add StopGrad at a specified stage so the bottom layers are frozen _C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2 # GN for backbone _C.MODEL.BACKBONE.USE_GN = False # ---------------------------------------------------------------------------- # # FPN options # ---------------------------------------------------------------------------- # _C.MODEL.FPN = CN() _C.MODEL.FPN.USE_GN = False _C.MODEL.FPN.USE_RELU = False # ---------------------------------------------------------------------------- # # Group Norm options # ---------------------------------------------------------------------------- # _C.MODEL.GROUP_NORM = CN() # Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS) _C.MODEL.GROUP_NORM.DIM_PER_GP = -1 # Number of groups in GroupNorm (-1 if using DIM_PER_GP) _C.MODEL.GROUP_NORM.NUM_GROUPS = 32 # GroupNorm's small constant in the denominator _C.MODEL.GROUP_NORM.EPSILON = 1e-5
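DIM_PER_GP and NUM_GROUPS are alternatives: exactly one should be positive and the other left at -1. A sketch of how such a pair can resolve to a group count (illustrative only; the actual logic lives in maskrcnn_benchmark/modeling/make_layers.py):

```python
# Illustrative resolution of GroupNorm's group count from the two knobs above.
def group_count(channels, num_groups=32, dim_per_gp=-1):
    if dim_per_gp > 0:                 # fixed group size -> derive group count
        assert channels % dim_per_gp == 0
        return channels // dim_per_gp
    assert channels % num_groups == 0  # fixed group count
    return num_groups

print(group_count(256))  # 32 groups of 8 channels each
```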
# ---------------------------------------------------------------------------- # # RPN options # ---------------------------------------------------------------------------- # _C.MODEL.RPN = CN() _C.MODEL.RPN.USE_FPN = False # Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input _C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512) # Stride of the feature map that RPN is attached to. # For FPN, number of strides should match number of scales _C.MODEL.RPN.ANCHOR_STRIDE = (16,) # RPN anchor aspect ratios _C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0) # Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels # Set to -1 or a large value, e.g. 100000, to disable pruning anchors _C.MODEL.RPN.STRADDLE_THRESH = 0 # Minimum overlap required between an anchor and ground-truth box for the # (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD # ==> positive RPN example) _C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7 # Maximum overlap allowed between an anchor and ground-truth box for the # (anchor, gt box) pair to be a negative example (IoU < BG_IOU_THRESHOLD # ==> negative RPN example) _C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3 # Total number of RPN examples per image _C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256 # Target fraction of foreground (positive) examples per RPN minibatch _C.MODEL.RPN.POSITIVE_FRACTION = 0.5 # Number of top scoring RPN proposals to keep before applying NMS # When FPN is used, this is *per FPN level* (not total) _C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000 _C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000 # Number of top scoring RPN proposals to keep after applying NMS _C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000 _C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000 # NMS threshold used on RPN proposals _C.MODEL.RPN.NMS_THRESH = 0.7 # Proposal height and width both need to be greater than RPN_MIN_SIZE # (at the scale used during training or inference) _C.MODEL.RPN.MIN_SIZE = 0 # Number of top scoring RPN proposals to keep after combining proposals from # all FPN levels _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000 _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000 # Custom rpn head, empty to use default conv or separable conv _C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead"
# ---------------------------------------------------------------------------- # # ROI HEADS options # ---------------------------------------------------------------------------- # _C.MODEL.ROI_HEADS = CN() _C.MODEL.ROI_HEADS.USE_FPN = False # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD) _C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5 # Overlap threshold for an RoI to be considered background # (class = 0 if overlap in [0, BG_IOU_THRESHOLD)) _C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5 # Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets # These are empirically chosen to approximately lead to unit variance targets _C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10., 10., 5., 5.) # RoI minibatch size *per image* (number of regions of interest [ROIs]) # Total number of RoIs per training minibatch = # TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH # E.g., a common configuration is: 512 * 2 * 8 = 8192 _C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 # Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0) _C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25 # Only used in test mode # Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to # balance obtaining high recall with not having too many low precision # detections that will slow down inference post processing steps (like NMS) _C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05 # Overlap threshold used for non-maximum suppression (suppress boxes with # IoU >= this threshold) _C.MODEL.ROI_HEADS.NMS = 0.5 # Maximum number of detections to return per image (100 is based on the limit # established for the COCO dataset) _C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100 _C.MODEL.ROI_BOX_HEAD = CN() _C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" _C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor" _C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14 _C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0 _C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,) _C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81 # Hidden layer dimension when using an MLP for the RoI box head _C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024 # GN _C.MODEL.ROI_BOX_HEAD.USE_GN = False # Dilation _C.MODEL.ROI_BOX_HEAD.DILATION = 1 _C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256 _C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4
_C.MODEL.ROI_MASK_HEAD = CN() _C.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" _C.MODEL.ROI_MASK_HEAD.PREDICTOR = "MaskRCNNC4Predictor" _C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14 _C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0 _C.MODEL.ROI_MASK_HEAD.POOLER_SCALES = (1.0 / 16,) _C.MODEL.ROI_MASK_HEAD.MLP_HEAD_DIM = 1024 _C.MODEL.ROI_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256) _C.MODEL.ROI_MASK_HEAD.RESOLUTION = 14 _C.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True # Whether or not to resize and translate masks to the input image. _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5 # Dilation _C.MODEL.ROI_MASK_HEAD.DILATION = 1 # GN _C.MODEL.ROI_MASK_HEAD.USE_GN = False _C.MODEL.ROI_KEYPOINT_HEAD = CN() _C.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR = "KeypointRCNNFeatureExtractor" _C.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR = "KeypointRCNNPredictor" _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14 _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0 _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES = (1.0 / 16,) _C.MODEL.ROI_KEYPOINT_HEAD.MLP_HEAD_DIM = 1024 _C.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS = tuple(512 for _ in range(8)) _C.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION = 14 _C.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES = 17 _C.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True # ---------------------------------------------------------------------------- # # ResNe[X]t options (ResNets = {ResNet, ResNeXt}) # Note that parts of a resnet may be used for both the backbone and the head # These options apply to both # ---------------------------------------------------------------------------- # _C.MODEL.RESNETS = CN() # Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt _C.MODEL.RESNETS.NUM_GROUPS = 1 # Baseline width of each group _C.MODEL.RESNETS.WIDTH_PER_GROUP = 64 # Place the stride 2 conv on the 1x1 filter # Use True only for the original MSRA ResNet; use False for C2 and Torch models _C.MODEL.RESNETS.STRIDE_IN_1X1 = True # Residual transformation function _C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm" # ResNet's stem function (conv1 and pool1) _C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm" # Apply dilation in stage "res5" _C.MODEL.RESNETS.RES5_DILATION = 1 _C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4 _C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 _C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 # ---------------------------------------------------------------------------- # # FCOS Options # ---------------------------------------------------------------------------- # _C.MODEL.FCOS = CN() _C.MODEL.FCOS.NUM_CLASSES = 81 # the number of classes including background _C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128] _C.MODEL.FCOS.PRIOR_PROB = 0.01 _C.MODEL.FCOS.INFERENCE_TH = 0.05 _C.MODEL.FCOS.NMS_TH = 0.6 _C.MODEL.FCOS.PRE_NMS_TOP_N = 1000 # Focal loss parameter: alpha _C.MODEL.FCOS.LOSS_ALPHA = 0.25 # Focal loss parameter: gamma _C.MODEL.FCOS.LOSS_GAMMA = 2.0 _C.MODEL.FCOS.CENTER_SAMPLE = False _C.MODEL.FCOS.POS_RADIUS = 1.5 _C.MODEL.FCOS.LOC_LOSS_TYPE = 'iou' _C.MODEL.FCOS.DENSE_POINTS = 1 # the number of convolutions used in the cls and bbox tower _C.MODEL.FCOS.NUM_CONVS = 4
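INFERENCE_TH, PRE_NMS_TOP_N and NMS_TH drive FCOS post-processing: per FPN level, locations scoring above INFERENCE_TH become candidates, at most PRE_NMS_TOP_N of them survive per level, and NMS_TH is the IoU threshold for the final NMS. A hedged sketch of the candidate-selection step (simplified from modeling/rpn/fcos/inference.py, not a verbatim copy):

```python
import torch

def select_candidates(cls_scores, inference_th=0.05, pre_nms_top_n=1000):
    """cls_scores: (num_locations, num_fg_classes) sigmoid scores, one level."""
    candidate_mask = cls_scores > inference_th
    n = min(int(candidate_mask.sum()), pre_nms_top_n)
    scores = cls_scores[candidate_mask]          # flat tensor of candidates
    return scores.topk(n).values if n > 0 else scores
```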
# ---------------------------------------------------------------------------- # # RetinaNet Options (Follow the Detectron version) # ---------------------------------------------------------------------------- # _C.MODEL.RETINANET = CN() # This is the number of foreground classes plus the background. _C.MODEL.RETINANET.NUM_CLASSES = 81 # Anchor sizes and aspect ratios to use _C.MODEL.RETINANET.ANCHOR_SIZES = (32, 64, 128, 256, 512) _C.MODEL.RETINANET.ASPECT_RATIOS = (0.5, 1.0, 2.0) _C.MODEL.RETINANET.ANCHOR_STRIDES = (8, 16, 32, 64, 128) _C.MODEL.RETINANET.STRADDLE_THRESH = 0 # Anchor scales per octave _C.MODEL.RETINANET.OCTAVE = 2.0 _C.MODEL.RETINANET.SCALES_PER_OCTAVE = 3 # Use C5 or P5 to generate P6 _C.MODEL.RETINANET.USE_C5 = True # Convolutions to use in the cls and bbox tower # NOTE: this doesn't include the last conv for logits _C.MODEL.RETINANET.NUM_CONVS = 4 # Weight for bbox_regression loss _C.MODEL.RETINANET.BBOX_REG_WEIGHT = 4.0 # Smooth L1 loss beta for bbox regression _C.MODEL.RETINANET.BBOX_REG_BETA = 0.11 # During inference, #locs to select based on cls score before NMS is performed # per FPN level _C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000 # IoU overlap ratio for labeling an anchor as positive # Anchors with >= iou overlap are labeled positive _C.MODEL.RETINANET.FG_IOU_THRESHOLD = 0.5 # IoU overlap ratio for labeling an anchor as negative # Anchors with < iou overlap are labeled negative _C.MODEL.RETINANET.BG_IOU_THRESHOLD = 0.4 # Focal loss parameter: alpha _C.MODEL.RETINANET.LOSS_ALPHA = 0.25 # Focal loss parameter: gamma _C.MODEL.RETINANET.LOSS_GAMMA = 2.0 # Prior prob for the positives at the beginning of training. This is used to set # the bias init for the logits layer _C.MODEL.RETINANET.PRIOR_PROB = 0.01 # Inference cls score threshold, anchors with score > INFERENCE_TH are # considered for inference _C.MODEL.RETINANET.INFERENCE_TH = 0.05 # NMS threshold used in RetinaNet _C.MODEL.RETINANET.NMS_TH = 0.4 # ---------------------------------------------------------------------------- # # FBNet options # ---------------------------------------------------------------------------- # _C.MODEL.FBNET = CN() _C.MODEL.FBNET.ARCH = "default" # custom arch _C.MODEL.FBNET.ARCH_DEF = "" _C.MODEL.FBNET.BN_TYPE = "bn" _C.MODEL.FBNET.SCALE_FACTOR = 1.0 # the output channels will be divisible by WIDTH_DIVISOR _C.MODEL.FBNET.WIDTH_DIVISOR = 1 _C.MODEL.FBNET.DW_CONV_SKIP_BN = True _C.MODEL.FBNET.DW_CONV_SKIP_RELU = True # > 0 scale, == 0 skip, < 0 same dimension _C.MODEL.FBNET.DET_HEAD_LAST_SCALE = 1.0 _C.MODEL.FBNET.DET_HEAD_BLOCKS = [] # overwrite the stride for the head, 0 to use original value _C.MODEL.FBNET.DET_HEAD_STRIDE = 0 # > 0 scale, == 0 skip, < 0 same dimension _C.MODEL.FBNET.KPTS_HEAD_LAST_SCALE = 0.0 _C.MODEL.FBNET.KPTS_HEAD_BLOCKS = [] # overwrite the stride for the head, 0 to use original value _C.MODEL.FBNET.KPTS_HEAD_STRIDE = 0 # > 0 scale, == 0 skip, < 0 same dimension _C.MODEL.FBNET.MASK_HEAD_LAST_SCALE = 0.0 _C.MODEL.FBNET.MASK_HEAD_BLOCKS = [] # overwrite the stride for the head, 0 to use original value _C.MODEL.FBNET.MASK_HEAD_STRIDE = 0 # 0 to use all blocks defined in arch_def _C.MODEL.FBNET.RPN_HEAD_BLOCKS = 0 _C.MODEL.FBNET.RPN_BN_TYPE = "" # ---------------------------------------------------------------------------- # # Solver # ---------------------------------------------------------------------------- # _C.SOLVER = CN() _C.SOLVER.MAX_ITER = 40000 _C.SOLVER.BASE_LR = 0.001 _C.SOLVER.BIAS_LR_FACTOR = 2 _C.SOLVER.MOMENTUM = 0.9 _C.SOLVER.WEIGHT_DECAY = 0.0005 _C.SOLVER.WEIGHT_DECAY_BIAS = 0 _C.SOLVER.GAMMA = 0.1 _C.SOLVER.STEPS = (30000,) _C.SOLVER.WARMUP_FACTOR = 1.0 / 3 _C.SOLVER.WARMUP_ITERS = 500 _C.SOLVER.WARMUP_METHOD = "linear" _C.SOLVER.CHECKPOINT_PERIOD = 2500 # Number of images per batch # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will # see 2 images per batch _C.SOLVER.IMS_PER_BATCH = 16
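The warmup and step-decay knobs above combine into a single multiplicative factor on BASE_LR. A sketch of the schedule they describe (mirroring solver/lr_scheduler.py in spirit, not verbatim):

```python
def lr_at(it, base_lr=0.001, warmup_factor=1.0 / 3, warmup_iters=500,
          steps=(30000,), gamma=0.1):
    if it < warmup_iters:  # linear warmup from warmup_factor up to 1.0
        alpha = it / warmup_iters
        factor = warmup_factor * (1 - alpha) + alpha
    else:
        factor = 1.0
    decay = gamma ** sum(1 for s in steps if it >= s)  # step decay
    return base_lr * factor * decay

print(lr_at(0), lr_at(500), lr_at(35000))  # ~0.000333 0.001 0.0001
```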
# ---------------------------------------------------------------------------- # # Specific test options # ---------------------------------------------------------------------------- # _C.TEST = CN() _C.TEST.EXPECTED_RESULTS = [] _C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4 # Number of images per batch # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will # see 2 images per batch _C.TEST.IMS_PER_BATCH = 8 # Number of detections per image _C.TEST.DETECTIONS_PER_IMG = 100 # ---------------------------------------------------------------------------- # # Misc options # ---------------------------------------------------------------------------- # _C.OUTPUT_DIR = "." _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py") ================================================ FILE: maskrcnn_benchmark/config/paths_catalog.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """Centralized catalog of paths.""" import os class DatasetCatalog(object): DATA_DIR = "datasets" DATASETS = { "coco_2017_train": { "img_dir": "coco/train2017", "ann_file": "coco/annotations/instances_train2017.json" }, "coco_2017_val": { "img_dir": "coco/val2017", "ann_file": "coco/annotations/instances_val2017.json" }, "coco_2014_train": { "img_dir": "coco/train2014", "ann_file": "coco/annotations/instances_train2014.json" }, "coco_2014_val": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_val2014.json" }, "coco_2014_minival": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_minival2014.json" }, "coco_2014_valminusminival": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_valminusminival2014.json" }, "keypoints_coco_2014_train": { "img_dir": "coco/train2014", "ann_file": "coco/annotations/person_keypoints_train2014.json", }, "keypoints_coco_2014_val": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/person_keypoints_val2014.json" }, "keypoints_coco_2014_minival": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/person_keypoints_minival2014.json", }, "keypoints_coco_2014_valminusminival": { "img_dir": "coco/val2014", "ann_file": "coco/annotations/person_keypoints_valminusminival2014.json", }, "voc_2007_train": { "data_dir": "voc/VOC2007", "split": "train" }, "voc_2007_train_cocostyle": { "img_dir": "voc/VOC2007/JPEGImages", "ann_file": "voc/VOC2007/Annotations/pascal_train2007.json" }, "voc_2007_val": { "data_dir": "voc/VOC2007", "split": "val" }, "voc_2007_val_cocostyle": { "img_dir": "voc/VOC2007/JPEGImages", "ann_file": "voc/VOC2007/Annotations/pascal_val2007.json" }, "voc_2007_test": { "data_dir": "voc/VOC2007", "split": "test" }, "voc_2007_test_cocostyle": { "img_dir": "voc/VOC2007/JPEGImages", "ann_file": "voc/VOC2007/Annotations/pascal_test2007.json" }, "voc_2012_train": { "data_dir": "voc/VOC2012", "split": "train" }, "voc_2012_train_cocostyle": { "img_dir": "voc/VOC2012/JPEGImages", "ann_file": "voc/VOC2012/Annotations/pascal_train2012.json" }, "voc_2012_val": { "data_dir": "voc/VOC2012", "split": "val" }, "voc_2012_val_cocostyle": { "img_dir": "voc/VOC2012/JPEGImages", "ann_file": "voc/VOC2012/Annotations/pascal_val2012.json" }, "voc_2012_test": { "data_dir": "voc/VOC2012", "split": "test" # PASCAL VOC2012 didn't make the test annotations available, so there's no json annotation }, "cityscapes_fine_instanceonly_seg_train_cocostyle": { "img_dir": "cityscapes/images", "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_train.json" }, "cityscapes_fine_instanceonly_seg_val_cocostyle": { "img_dir": "cityscapes/images", "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_val.json" }, "cityscapes_fine_instanceonly_seg_test_cocostyle": { "img_dir": "cityscapes/images", "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_test.json" } } @staticmethod def get(name): if "coco" in name: data_dir = DatasetCatalog.DATA_DIR attrs = DatasetCatalog.DATASETS[name] args = dict( root=os.path.join(data_dir, attrs["img_dir"]), ann_file=os.path.join(data_dir, attrs["ann_file"]), ) return dict( factory="COCODataset", args=args, ) elif "voc" in name: data_dir = DatasetCatalog.DATA_DIR attrs = DatasetCatalog.DATASETS[name] args = dict( data_dir=os.path.join(data_dir, attrs["data_dir"]), split=attrs["split"], ) return dict( factory="PascalVOCDataset", args=args, ) raise RuntimeError("Dataset not available: {}".format(name))
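DatasetCatalog.get resolves a dataset name into a factory class plus constructor arguments. Registering an additional COCO-style split is just another dictionary entry; a sketch with hypothetical paths:

```python
from maskrcnn_benchmark.config.paths_catalog import DatasetCatalog

# Hypothetical extra split; the img_dir/ann_file values are illustrative.
DatasetCatalog.DATASETS["coco_my_train"] = {
    "img_dir": "coco/my_images",
    "ann_file": "coco/annotations/my_train.json",
}
spec = DatasetCatalog.get("coco_my_train")
print(spec["factory"])       # COCODataset
print(spec["args"]["root"])  # datasets/coco/my_images
```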
"ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_train.json" }, "cityscapes_fine_instanceonly_seg_val_cocostyle": { "img_dir": "cityscapes/images", "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_val.json" }, "cityscapes_fine_instanceonly_seg_test_cocostyle": { "img_dir": "cityscapes/images", "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_test.json" } } @staticmethod def get(name): if "coco" in name: data_dir = DatasetCatalog.DATA_DIR attrs = DatasetCatalog.DATASETS[name] args = dict( root=os.path.join(data_dir, attrs["img_dir"]), ann_file=os.path.join(data_dir, attrs["ann_file"]), ) return dict( factory="COCODataset", args=args, ) elif "voc" in name: data_dir = DatasetCatalog.DATA_DIR attrs = DatasetCatalog.DATASETS[name] args = dict( data_dir=os.path.join(data_dir, attrs["data_dir"]), split=attrs["split"], ) return dict( factory="PascalVOCDataset", args=args, ) raise RuntimeError("Dataset not available: {}".format(name)) class ModelCatalog(object): S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron" C2_IMAGENET_MODELS = { "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl", "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl", "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl", "MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl", "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl", "FAIR/20171220/X-101-64x4d": "ImageNetPretrained/20171220/X-101-64x4d.pkl", } C2_DETECTRON_SUFFIX = "output/train/{}coco_2014_train%3A{}coco_2014_valminusminival/generalized_rcnn/model_final.pkl" C2_DETECTRON_MODELS = { "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW", "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I", "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7", "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ", "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB", "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC", "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT", "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI", "37129812/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x": "09_35_36.8pzTQKYK", # keypoints "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "08_42_54.kdzV35ao" } @staticmethod def get(name): if name.startswith("Caffe2Detectron/COCO"): return ModelCatalog.get_c2_detectron_12_2017_baselines(name) if name.startswith("ImageNetPretrained"): return ModelCatalog.get_c2_imagenet_pretrained(name) raise RuntimeError("model not present in the catalog {}".format(name)) @staticmethod def get_c2_imagenet_pretrained(name): prefix = ModelCatalog.S3_C2_DETECTRON_URL name = name[len("ImageNetPretrained/"):] name = ModelCatalog.C2_IMAGENET_MODELS[name] url = "/".join([prefix, name]) return url @staticmethod def get_c2_detectron_12_2017_baselines(name): # Detectron C2 models are stored following the structure # prefix//2012_2017_baselines/.yaml./suffix # we use as identifiers in the catalog Caffe2Detectron/COCO// prefix = ModelCatalog.S3_C2_DETECTRON_URL dataset_tag = "keypoints_" if "keypoint" in name else "" suffix = ModelCatalog.C2_DETECTRON_SUFFIX.format(dataset_tag, dataset_tag) # remove identification prefix name = name[len("Caffe2Detectron/COCO/"):] # split in and model_id, model_name = name.split("/") # parsing to make it match the url address from the Caffe2 models model_name = "{}.yaml".format(model_name) signature = ModelCatalog.C2_DETECTRON_MODELS[name] unique_name = ".".join([model_name, signature]) url = "/".join([prefix, model_id, 
"12_2017_baselines", unique_name, suffix]) return url ================================================ FILE: maskrcnn_benchmark/csrc/ROIAlign.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif // Interface for Python at::Tensor ROIAlign_forward(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); #else AT_ERROR("Not compiled with GPU support"); #endif } return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); } at::Tensor ROIAlign_backward(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio) { if (grad.type().is_cuda()) { #ifdef WITH_CUDA return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: maskrcnn_benchmark/csrc/ROIPool.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif std::tuple ROIPool_forward(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } at::Tensor ROIPool_backward(const at::Tensor& grad, const at::Tensor& input, const at::Tensor& rois, const at::Tensor& argmax, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width) { if (grad.type().is_cuda()) { #ifdef WITH_CUDA return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: maskrcnn_benchmark/csrc/SigmoidFocalLoss.h ================================================ #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif // Interface for Python at::Tensor SigmoidFocalLoss_forward( const at::Tensor& logits, const at::Tensor& targets, const int num_classes, const float gamma, const float alpha) { if (logits.type().is_cuda()) { #ifdef WITH_CUDA return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } at::Tensor SigmoidFocalLoss_backward( const at::Tensor& logits, const at::Tensor& targets, const at::Tensor& d_losses, const int num_classes, const float gamma, const float alpha) { if (logits.type().is_cuda()) { #ifdef WITH_CUDA return 
================================================ FILE: maskrcnn_benchmark/csrc/SigmoidFocalLoss.h ================================================ #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif // Interface for Python at::Tensor SigmoidFocalLoss_forward( const at::Tensor& logits, const at::Tensor& targets, const int num_classes, const float gamma, const float alpha) { if (logits.type().is_cuda()) { #ifdef WITH_CUDA return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } at::Tensor SigmoidFocalLoss_backward( const at::Tensor& logits, const at::Tensor& targets, const at::Tensor& d_losses, const int num_classes, const float gamma, const float alpha) { if (logits.type().is_cuda()) { #ifdef WITH_CUDA return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); }
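The header above dispatches to a CUDA kernel implementing sigmoid focal loss (Lin et al.). For reference, a hedged PyTorch sketch of the same formula (sum reduction, one-hot targets; the actual kernel works on integer class targets and handles indexing differently):

```python
import torch
import torch.nn.functional as F

def sigmoid_focal_loss(logits, targets_onehot, gamma=2.0, alpha=0.25):
    p = torch.sigmoid(logits)
    ce = F.binary_cross_entropy_with_logits(logits, targets_onehot,
                                            reduction="none")
    # p_t: probability assigned to the true label; modulates easy examples down
    p_t = p * targets_onehot + (1 - p) * (1 - targets_onehot)
    alpha_t = alpha * targets_onehot + (1 - alpha) * (1 - targets_onehot)
    return (alpha_t * (1 - p_t) ** gamma * ce).sum()
```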
================================================ FILE: maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include "cpu/vision.h" // implementation taken from Caffe2 template <typename T> struct PreCalc { int pos1; int pos2; int pos3; int pos4; T w1; T w2; T w3; T w4; }; template <typename T> void pre_calc_for_bilinear_interpolate( const int height, const int width, const int pooled_height, const int pooled_width, const int iy_upper, const int ix_upper, T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, int roi_bin_grid_h, int roi_bin_grid_w, std::vector<PreCalc<T>>& pre_calc) { int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { for (int iy = 0; iy < iy_upper; iy++) { const T yy = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < ix_upper; ix++) { const T xx = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w); T x = xx; T y = yy; // deal with: inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { // empty PreCalc<T> pc; pc.pos1 = 0; pc.pos2 = 0; pc.pos3 = 0; pc.pos4 = 0; pc.w1 = 0; pc.w2 = 0; pc.w3 = 0; pc.w4 = 0; pre_calc[pre_calc_index] = pc; pre_calc_index += 1; continue; } if (y <= 0) { y = 0; } if (x <= 0) { x = 0; } int y_low = (int)y; int x_low = (int)x; int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T)x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; // save weights and indices PreCalc<T> pc; pc.pos1 = y_low * width + x_low; pc.pos2 = y_low * width + x_high; pc.pos3 = y_high * width + x_low; pc.pos4 = y_high * width + x_high; pc.w1 = w1; pc.w2 = w2; pc.w3 = w3; pc.w4 = w4; pre_calc[pre_calc_index] = pc; pre_calc_index += 1; } } } } }
template <typename T> void ROIAlignForward_cpu_kernel( const int nthreads, const T* bottom_data, const T& spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio, const T* bottom_rois, //int roi_cols, T* top_data) { //AT_ASSERT(roi_cols == 4 || roi_cols == 5); int roi_cols = 5; int n_rois = nthreads / channels / pooled_width / pooled_height; // (n, c, ph, pw) is an element in the pooled output // can be parallelized using omp // #pragma omp parallel for num_threads(32) for (int n = 0; n < n_rois; n++) { int index_n = n * channels * pooled_width * pooled_height; // roi could have 4 or 5 columns const T* offset_bottom_rois = bottom_rois + n * roi_cols; int roi_batch_ind = 0; if (roi_cols == 5) { roi_batch_ind = offset_bottom_rois[0]; offset_bottom_rois++; } // Do not use rounding; this implementation detail is critical T roi_start_w = offset_bottom_rois[0] * spatial_scale; T roi_start_h = offset_bottom_rois[1] * spatial_scale; T roi_end_w = offset_bottom_rois[2] * spatial_scale; T roi_end_h = offset_bottom_rois[3] * spatial_scale; // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale); // Force malformed ROIs to be 1x1 T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 // we want to precalculate indices and weights shared by all channels, // this is the key point of optimization std::vector<PreCalc<T>> pre_calc( roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); pre_calc_for_bilinear_interpolate( height, width, pooled_height, pooled_width, roi_bin_grid_h, roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, roi_bin_grid_h, roi_bin_grid_w, pre_calc); for (int c = 0; c < channels; c++) { int index_n_c = index_n + c * pooled_width * pooled_height; const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { int index = index_n_c + ph * pooled_width + pw; T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { PreCalc<T> pc = pre_calc[pre_calc_index]; output_val += pc.w1 * offset_bottom_data[pc.pos1] + pc.w2 * offset_bottom_data[pc.pos2] + pc.w3 * offset_bottom_data[pc.pos3] + pc.w4 * offset_bottom_data[pc.pos4]; pre_calc_index += 1; } } output_val /= count; top_data[index] = output_val; } // for pw } // for ph } // for c } // for n } at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio) { AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor"); AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor"); auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); auto width = input.size(3); auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); auto output_size = num_rois * pooled_height * pooled_width * channels; if (output.numel() == 0) { return output; } AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { ROIAlignForward_cpu_kernel<scalar_t>( output_size, input.data<scalar_t>(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, rois.data<scalar_t>(), output.data<scalar_t>()); }); return output; }
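From Python, these kernels are reached through the ROIAlign layer in maskrcnn_benchmark.layers. A small usage sketch (shapes and ROI coordinates are illustrative):

```python
import torch
from maskrcnn_benchmark.layers import ROIAlign

align = ROIAlign(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=2)
features = torch.randn(1, 256, 50, 50)                # e.g. a stride-16 feature map
rois = torch.tensor([[0.0, 8.0, 8.0, 200.0, 120.0]])  # (batch_idx, x1, y1, x2, y2)
pooled = align(features, rois)                        # -> (1, 256, 7, 7)
```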
#include "cpu/vision.h" template at::Tensor nms_cpu_kernel(const at::Tensor& dets, const at::Tensor& scores, const float threshold) { AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); if (dets.numel() == 0) { return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); } auto x1_t = dets.select(1, 0).contiguous(); auto y1_t = dets.select(1, 1).contiguous(); auto x2_t = dets.select(1, 2).contiguous(); auto y2_t = dets.select(1, 3).contiguous(); at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto ndets = dets.size(0); at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); auto suppressed = suppressed_t.data(); auto order = order_t.data(); auto x1 = x1_t.data(); auto y1 = y1_t.data(); auto x2 = x2_t.data(); auto y2 = y2_t.data(); auto areas = areas_t.data(); for (int64_t _i = 0; _i < ndets; _i++) { auto i = order[_i]; if (suppressed[i] == 1) continue; auto ix1 = x1[i]; auto iy1 = y1[i]; auto ix2 = x2[i]; auto iy2 = y2[i]; auto iarea = areas[i]; for (int64_t _j = _i + 1; _j < ndets; _j++) { auto j = order[_j]; if (suppressed[j] == 1) continue; auto xx1 = std::max(ix1, x1[j]); auto yy1 = std::max(iy1, y1[j]); auto xx2 = std::min(ix2, x2[j]); auto yy2 = std::min(iy2, y2[j]); auto w = std::max(static_cast(0), xx2 - xx1 + 1); auto h = std::max(static_cast(0), yy2 - yy1 + 1); auto inter = w * h; auto ovr = inter / (iarea + areas[j] - inter); if (ovr >= threshold) suppressed[j] = 1; } } return at::nonzero(suppressed_t == 0).squeeze(1); } at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold) { at::Tensor result; AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { result = nms_cpu_kernel(dets, scores, threshold); }); return result; } ================================================ FILE: maskrcnn_benchmark/csrc/cpu/vision.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio); at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold); ================================================ FILE: maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
================================================
FILE: maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
#include <THC/THCDeviceUtils.cuh>

// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)

template <typename T>
__device__ T bilinear_interpolate(const T* bottom_data,
    const int height, const int width, T y, T x,
    const int index /* index for debug only*/) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    // empty
    return 0;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  int y_low = (int)y;
  int x_low = (int)x;
  int y_high;
  int x_high;

  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly, hx = 1. - lx;
  // do bilinear interpolation
  T v1 = bottom_data[y_low * width + x_low];
  T v2 = bottom_data[y_low * width + x_high];
  T v3 = bottom_data[y_high * width + x_low];
  T v4 = bottom_data[y_high * width + x_high];
  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);

  return val;
}

template <typename T>
__global__ void RoIAlignForward(const int nthreads, const T* bottom_data,
    const T spatial_scale, const int channels, const int height,
    const int width, const int pooled_height, const int pooled_width,
    const int sampling_ratio, const T* bottom_rois, T* top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not use rounding; this implementation detail is critical
    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

    // Force malformed ROIs to be 1x1
    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    const T* offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    T output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1
    {
      const T y = roi_start_h + ph * bin_size_h +
          static_cast<T>(iy + .5f) * bin_size_h /
              static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = roi_start_w + pw * bin_size_w +
            static_cast<T>(ix + .5f) * bin_size_w /
                static_cast<T>(roi_bin_grid_w);

        T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index);
        output_val += val;
      }
    }
    output_val /= count;

    top_data[index] = output_val;
  }
}

template <typename T>
__device__ void bilinear_interpolate_gradient(
    const int height, const int width, T y, T x,
    T& w1, T& w2, T& w3, T& w4,
    int& x_low, int& x_high, int& y_low, int& y_high,
    const int index /* index for debug only*/) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    // empty
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  y_low = (int)y;
  x_low = (int)x;

  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly, hx = 1. - lx;

  // reference in forward
  // T v1 = bottom_data[y_low * width + x_low];
  // T v2 = bottom_data[y_low * width + x_high];
  // T v3 = bottom_data[y_high * width + x_low];
  // T v4 = bottom_data[y_high * width + x_high];
  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);

  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  return;
}

template <typename T>
__global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff,
    const int num_rois, const T spatial_scale, const int channels,
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int sampling_ratio,
    T* bottom_diff, const T* bottom_rois) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not use rounding; this implementation detail is critical
    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

    // Force malformed ROIs to be 1x1
    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    T* offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels + c) * height * width;

    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const T* offset_top_diff = top_diff + top_offset;
    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1
    {
      const T y = roi_start_h + ph * bin_size_h +
          static_cast<T>(iy + .5f) * bin_size_h /
              static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = roi_start_w + pw * bin_size_w +
            static_cast<T>(ix + .5f) * bin_size_w /
                static_cast<T>(roi_bin_grid_w);

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;

        bilinear_interpolate_gradient(height, width, y, x,
            w1, w2, w3, w4, x_low, x_high, y_low, y_high, index);

        T g1 = top_diff_this_bin * w1 / count;
        T g2 = top_diff_this_bin * w2 / count;
        T g3 = top_diff_this_bin * w3 / count;
        T g4 = top_diff_this_bin * w4 / count;

        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
          atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
          atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
          atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
        }  // if
      }  // ix
    }  // iy
  }  // CUDA_1D_KERNEL_LOOP
}  // RoIAlignBackward

at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
                                 const at::Tensor& rois,
                                 const float spatial_scale,
                                 const int pooled_height,
                                 const int pooled_width,
                                 const int sampling_ratio) {
  AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
  AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");

  auto num_rois = rois.size(0);
  auto channels = input.size(1);
  auto height = input.size(2);
  auto width = input.size(3);

  auto output = at::empty({num_rois, channels, pooled_height, pooled_width},
                          input.options());
  auto output_size = num_rois * pooled_height * pooled_width * channels;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
  dim3 block(512);

  if (output.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return output;
  }

  AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] {
    RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
        output_size, input.contiguous().data<scalar_t>(), spatial_scale,
        channels, height, width, pooled_height, pooled_width, sampling_ratio,
        rois.contiguous().data<scalar_t>(), output.data<scalar_t>());
  });
  THCudaCheck(cudaGetLastError());
  return output;
}

// TODO remove the dependency on input and use instead its sizes -> save memory
at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
                                  const at::Tensor& rois,
                                  const float spatial_scale,
                                  const int pooled_height,
                                  const int pooled_width,
                                  const int batch_size,
                                  const int channels,
                                  const int height,
                                  const int width,
                                  const int sampling_ratio) {
  AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
  AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");

  auto num_rois = rois.size(0);
  auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
  dim3 block(512);

  // handle possibly empty gradients
  if (grad.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return grad_input;
  }

  AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] {
    RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
        grad.numel(), grad.contiguous().data<scalar_t>(), num_rois,
        spatial_scale, channels, height, width, pooled_height, pooled_width,
        sampling_ratio, grad_input.data<scalar_t>(),
        rois.contiguous().data<scalar_t>());
  });
  THCudaCheck(cudaGetLastError());
  return grad_input;
}
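The repository wraps these two kernels in an autograd-aware module in `maskrcnn_benchmark/layers/roi_align.py`. A minimal sketch of exercising them end to end (the feature-map shape and the ROI are illustrative assumptions):

```python
import torch
from maskrcnn_benchmark.layers import ROIAlign

align = ROIAlign(output_size=(7, 7), spatial_scale=0.25, sampling_ratio=2)

features = torch.randn(1, 256, 50, 50, device="cuda", requires_grad=True)
# Each ROI is (batch_index, x1, y1, x2, y2) in input-image coordinates;
# spatial_scale=0.25 maps it onto the 4x-downsampled feature map.
rois = torch.tensor([[0., 8., 8., 120., 120.]], device="cuda")

pooled = align(features, rois)   # (1, 256, 7, 7), via RoIAlignForward
pooled.sum().backward()          # via RoIAlignBackwardFeature and atomicAdd
```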
================================================
FILE: maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
#include <THC/THCDeviceUtils.cuh>

// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)

template <typename T>
__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
    const T spatial_scale, const int channels, const int height,
    const int width, const int pooled_height, const int pooled_width,
    const T* bottom_rois, T* top_data, int* argmax_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
    int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
    int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
    int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

    // Force malformed ROIs to be 1x1
    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
    int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));

    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart + roi_start_h, 0), height);
    hend = min(max(hend + roi_start_h, 0), height);
    wstart = min(max(wstart + roi_start_w, 0), width);
    wend = min(max(wend + roi_start_w, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);

    // Define an empty pooling region to be zero
    T maxval = is_empty ? 0 : -FLT_MAX;
    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
    int maxidx = -1;
    const T* offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int bottom_index = h * width + w;
        if (offset_bottom_data[bottom_index] > maxval) {
          maxval = offset_bottom_data[bottom_index];
          maxidx = bottom_index;
        }
      }
    }
    top_data[index] = maxval;
    argmax_data[index] = maxidx;
  }
}

template <typename T>
__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
    const int* argmax_data, const int num_rois, const T spatial_scale,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width,
    T* bottom_diff, const T* bottom_rois) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    int bottom_offset = (roi_batch_ind * channels + c) * height * width;
    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const T* offset_top_diff = top_diff + top_offset;
    T* offset_bottom_diff = bottom_diff + bottom_offset;
    const int* offset_argmax_data = argmax_data + top_offset;

    int argmax = offset_argmax_data[ph * pooled_width + pw];
    if (argmax != -1) {
      atomicAdd(
          offset_bottom_diff + argmax,
          static_cast<T>(offset_top_diff[ph * pooled_width + pw]));
    }
  }
}

std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(
    const at::Tensor& input, const at::Tensor& rois,
    const float spatial_scale, const int pooled_height,
    const int pooled_width) {
  AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
  AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");

  auto num_rois = rois.size(0);
  auto channels = input.size(1);
  auto height = input.size(2);
  auto width = input.size(3);

  auto output = at::empty({num_rois, channels, pooled_height, pooled_width},
                          input.options());
  auto output_size = num_rois * pooled_height * pooled_width * channels;
  auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width},
                          input.options().dtype(at::kInt));

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
  dim3 block(512);

  if (output.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return std::make_tuple(output, argmax);
  }

  AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] {
    RoIPoolFForward<scalar_t><<<grid, block, 0, stream>>>(
        output_size, input.contiguous().data<scalar_t>(), spatial_scale,
        channels, height, width, pooled_height, pooled_width,
        rois.contiguous().data<scalar_t>(), output.data<scalar_t>(),
        argmax.data<int>());
  });
  THCudaCheck(cudaGetLastError());
  return std::make_tuple(output, argmax);
}

// TODO remove the dependency on input and use instead its sizes -> save memory
at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
                                 const at::Tensor& input,
                                 const at::Tensor& rois,
                                 const at::Tensor& argmax,
                                 const float spatial_scale,
                                 const int pooled_height,
                                 const int pooled_width,
                                 const int batch_size,
                                 const int channels,
                                 const int height,
                                 const int width) {
  AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
  AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
  // TODO add more checks

  auto num_rois = rois.size(0);
  auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
  dim3 block(512);

  // handle possibly empty gradients
  if (grad.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return grad_input;
  }

  AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] {
    RoIPoolFBackward<scalar_t><<<grid, block, 0, stream>>>(
        grad.numel(), grad.contiguous().data<scalar_t>(), argmax.data<int>(),
        num_rois, spatial_scale, channels, height, width, pooled_height,
        pooled_width, grad_input.data<scalar_t>(),
        rois.contiguous().data<scalar_t>());
  });
  THCudaCheck(cudaGetLastError());
  return grad_input;
}
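Unlike RoIAlign's bilinear weighting, RoIPool records a single argmax location per output cell in the forward pass, so the backward kernel above routes each gradient to exactly one input cell. A minimal sketch through the wrapper in `maskrcnn_benchmark/layers/roi_pool.py` (shapes assumed):

```python
import torch
from maskrcnn_benchmark.layers import ROIPool

pool = ROIPool(output_size=(7, 7), spatial_scale=0.25)
features = torch.randn(1, 256, 50, 50, device="cuda", requires_grad=True)
rois = torch.tensor([[0., 8., 8., 120., 120.]], device="cuda")

pooled = pool(features, rois)  # (1, 256, 7, 7); argmax indices cached for backward
pooled.sum().backward()        # each output cell backprops to one input cell
```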
================================================
FILE: maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu
// Cheng-Yang Fu
// cyfu@cs.unc.edu
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
#include <THC/THCDeviceUtils.cuh>

#include <cfloat>

// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)

template <typename T>
__global__ void SigmoidFocalLossForward(const int nthreads,
    const T* logits, const int* targets, const int num_classes,
    const float gamma, const float alpha, const int num, T* losses) {
  CUDA_1D_KERNEL_LOOP(i, nthreads) {
    int n = i / num_classes;
    int d = i % num_classes;  // current class [0~79];
    int t = targets[n];       // target class [1~80];

    // Decide it is positive or negative case.
    T c1 = (t == (d + 1));
    T c2 = (t >= 0 & t != (d + 1));

    T zn = (1.0 - alpha);
    T zp = (alpha);

    // p = 1. / 1. + expf(-x); p = sigmoid(x)
    T p = 1. / (1. + expf(-logits[i]));

    // (1-p)**gamma * log(p)
    T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN));

    // p**gamma * log(1-p)
    T term2 = powf(p, gamma) *
        (-1. * logits[i] * (logits[i] >= 0) -
         logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));

    losses[i] = 0.0;
    losses[i] += -c1 * term1 * zp;
    losses[i] += -c2 * term2 * zn;
  }  // CUDA_1D_KERNEL_LOOP
}  // SigmoidFocalLossForward

template <typename T>
__global__ void SigmoidFocalLossBackward(const int nthreads,
    const T* logits, const int* targets, const T* d_losses,
    const int num_classes, const float gamma, const float alpha,
    const int num, T* d_logits) {
  CUDA_1D_KERNEL_LOOP(i, nthreads) {
    int n = i / num_classes;
    int d = i % num_classes;  // current class [0~79];
    int t = targets[n];       // target class [1~80], 0 is background;

    // Decide it is positive or negative case.
    T c1 = (t == (d + 1));
    T c2 = (t >= 0 & t != (d + 1));

    T zn = (1.0 - alpha);
    T zp = (alpha);

    // p = 1. / 1. + expf(-x); p = sigmoid(x)
    T p = 1. / (1. + expf(-logits[i]));

    // (1-p)**g * (1 - p - g*p*log(p))
    T term1 = powf((1. - p), gamma) *
        (1. - p - (p * gamma * logf(max(p, FLT_MIN))));

    // (p**g) * (g*(1-p)*log(1-p) - p)
    T term2 = powf(p, gamma) *
        ((-1. * logits[i] * (logits[i] >= 0) -
          logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
             (1. - p) * gamma -
         p);

    d_logits[i] = 0.0;
    d_logits[i] += -c1 * term1 * zp;
    d_logits[i] += -c2 * term2 * zn;
    d_logits[i] = d_logits[i] * d_losses[i];
  }  // CUDA_1D_KERNEL_LOOP
}  // SigmoidFocalLossBackward

at::Tensor SigmoidFocalLoss_forward_cuda(
    const at::Tensor& logits, const at::Tensor& targets,
    const int num_classes, const float gamma, const float alpha) {
  AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
  AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
  AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");

  const int num_samples = logits.size(0);

  auto losses = at::empty({num_samples, logits.size(1)}, logits.options());
  auto losses_size = num_samples * logits.size(1);

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(THCCeilDiv((long)losses_size, 512L), 4096L));
  dim3 block(512);

  if (losses.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return losses;
  }

  AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] {
    SigmoidFocalLossForward<scalar_t><<<grid, block, 0, stream>>>(
        losses_size, logits.contiguous().data<scalar_t>(),
        targets.contiguous().data<int>(), num_classes, gamma, alpha,
        num_samples, losses.data<scalar_t>());
  });
  THCudaCheck(cudaGetLastError());
  return losses;
}

at::Tensor SigmoidFocalLoss_backward_cuda(
    const at::Tensor& logits, const at::Tensor& targets,
    const at::Tensor& d_losses, const int num_classes,
    const float gamma, const float alpha) {
  AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
  AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
  AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor");
  AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");

  const int num_samples = logits.size(0);
  AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes");

  auto d_logits = at::zeros({num_samples, num_classes}, logits.options());
  auto d_logits_size = num_samples * logits.size(1);

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(THCCeilDiv((long)d_logits_size, 512L), 4096L));
  dim3 block(512);

  if (d_logits.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return d_logits;
  }

  AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] {
    SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, stream>>>(
        d_logits_size, logits.contiguous().data<scalar_t>(),
        targets.contiguous().data<int>(), d_losses.contiguous().data<scalar_t>(),
        num_classes, gamma, alpha, num_samples, d_logits.data<scalar_t>());
  });
  THCudaCheck(cudaGetLastError());
  return d_logits;
}
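The forward kernel computes, per sample n and class column d (with p = sigmoid(logit)): `-alpha * (1-p)^gamma * log(p)` when the target equals d+1, and `-(1-alpha) * p^gamma * log(1-p)` for every other non-negative target. A pure-PyTorch restatement of that formula, as an illustrative sketch (not the repository's code; it uses a numerically naive `log(1-p)` where the kernel uses a stable form):

```python
import torch

def sigmoid_focal_loss_ref(logits, targets, gamma=2.0, alpha=0.25):
    """logits: (N, C); targets: (N,) with labels in 1..C, 0 = background."""
    num_classes = logits.shape[1]
    p = torch.sigmoid(logits)
    # class column d corresponds to label d+1
    class_range = torch.arange(1, num_classes + 1, device=logits.device).unsqueeze(0)
    t = targets.unsqueeze(1)
    term1 = (1 - p) ** gamma * torch.log(p)       # positive-class term
    term2 = p ** gamma * torch.log(1 - p)         # negative-class term
    return -(t == class_range).float() * term1 * alpha \
           - ((t != class_range) * (t >= 0)).float() * term2 * (1 - alpha)

logits = torch.randn(4, 80)
targets = torch.tensor([3, 0, 80, 12])            # 0 = background row
print(sigmoid_focal_loss_ref(logits, targets).shape)  # torch.Size([4, 80])
```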
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>

#include <vector>
#include <iostream>

int const threadsPerBlock = sizeof(unsigned long long) * 8;

__device__ inline float devIoU(float const* const a, float const* const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}

__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float* dev_boxes, unsigned long long* dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float* cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

// boxes is a N x 5 tensor
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
  using scalar_t = float;
  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
  auto scores = boxes.select(1, 4);
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto boxes_sorted = boxes.index_select(0, order_t);

  int boxes_num = boxes.size(0);

  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);

  scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();

  THCState* state = at::globalContext().lazyInitCUDA();  // TODO replace with getTHCState

  unsigned long long* mask_dev = NULL;
  // THCudaCheck(THCudaMalloc(state, (void**)&mask_dev,
  //                          boxes_num * col_blocks * sizeof(unsigned long long)));
  mask_dev = (unsigned long long*)THCudaMalloc(
      state, boxes_num * col_blocks * sizeof(unsigned long long));

  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
              THCCeilDiv(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh,
                                  boxes_dev, mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  THCudaCheck(cudaMemcpy(&mask_host[0], mask_dev,
                         sizeof(unsigned long long) * boxes_num * col_blocks,
                         cudaMemcpyDeviceToHost));

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  at::Tensor keep =
      at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long* p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  THCudaFree(state, mask_dev);
  // TODO improve this part
  return std::get<0>(order_t.index({
      keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
          order_t.device(), keep.scalar_type())}).sort(0, false));
}

================================================
FILE: maskrcnn_benchmark/csrc/cuda/vision.h
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include <torch/extension.h>

at::Tensor SigmoidFocalLoss_forward_cuda(
    const at::Tensor& logits, const at::Tensor& targets,
    const int num_classes, const float gamma, const float alpha);

at::Tensor SigmoidFocalLoss_backward_cuda(
    const at::Tensor& logits, const at::Tensor& targets,
    const at::Tensor& d_losses, const int num_classes,
    const float gamma, const float alpha);

at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
                                 const at::Tensor& rois,
                                 const float spatial_scale,
                                 const int pooled_height,
                                 const int pooled_width,
                                 const int sampling_ratio);

at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
                                  const at::Tensor& rois,
                                  const float spatial_scale,
                                  const int pooled_height,
                                  const int pooled_width,
                                  const int batch_size,
                                  const int channels,
                                  const int height,
                                  const int width,
                                  const int sampling_ratio);

std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(
    const at::Tensor& input, const at::Tensor& rois,
    const float spatial_scale, const int pooled_height,
    const int pooled_width);

at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
                                 const at::Tensor& input,
                                 const at::Tensor& rois,
                                 const at::Tensor& argmax,
                                 const float spatial_scale,
                                 const int pooled_height,
                                 const int pooled_width,
                                 const int batch_size,
                                 const int channels,
                                 const int height,
                                 const int width);

at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);

at::Tensor compute_flow_cuda(const at::Tensor& boxes,
                             const int height,
                             const int width);

================================================
FILE: maskrcnn_benchmark/csrc/nms.h
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include "cpu/vision.h"

#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif

at::Tensor nms(const at::Tensor& dets,
               const at::Tensor& scores,
               const float threshold) {
  if (dets.type().is_cuda()) {
#ifdef WITH_CUDA
    // TODO raise error if not compiled with CUDA
    if (dets.numel() == 0)
      return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
    auto b = at::cat({dets, scores.unsqueeze(1)}, 1);
    return nms_cuda(b, threshold);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  at::Tensor result = nms_cpu(dets, scores, threshold);
  return result;
}
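These sources are normally compiled through the project's setup.py, but for quick experiments the same extension can be JIT-built with `torch.utils.cpp_extension.load`. A sketch under the assumption that it is run from the repository root (the module name is arbitrary):

```python
import glob
import torch
from torch.utils.cpp_extension import load

sources = ["maskrcnn_benchmark/csrc/vision.cpp"] \
    + glob.glob("maskrcnn_benchmark/csrc/cpu/*.cpp")
extra_cflags = []
if torch.cuda.is_available():
    sources += glob.glob("maskrcnn_benchmark/csrc/cuda/*.cu")
    extra_cflags = ["-DWITH_CUDA"]  # enables the CUDA branch in nms.h above

_C = load(name="maskrcnn_benchmark_C", sources=sources,
          extra_cflags=extra_cflags, extra_cuda_cflags=["-DWITH_CUDA"],
          extra_include_paths=["maskrcnn_benchmark/csrc"])
print(_C.nms)  # one of the entry points bound in vision.cpp below
```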
#include "nms.h" #include "ROIAlign.h" #include "ROIPool.h" #include "SigmoidFocalLoss.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("nms", &nms, "non-maximum suppression"); m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); } ================================================ FILE: maskrcnn_benchmark/data/README.md ================================================ # Setting Up Datasets This file describes how to perform training on other datasets. Only Pascal VOC dataset can be loaded from its original format and be outputted to Pascal style results currently. We expect the annotations from other datasets be converted to COCO json format, and the output will be in COCO-style. (i.e. AP, AP50, AP75, APs, APm, APl for bbox and segm) ## Creating Symlinks for PASCAL VOC We assume that your symlinked `datasets/voc/VOC` directory has the following structure: ``` VOC |_ JPEGImages | |_ .jpg | |_ ... | |_ .jpg |_ Annotations | |_ pascal_train.json (optional) | |_ pascal_val.json (optional) | |_ pascal_test.json (optional) | |_ .xml | |_ ... | |_ .xml |_ VOCdevkit ``` Create symlinks for `voc/VOC`: ``` cd ~/github/maskrcnn-benchmark mkdir -p datasets/voc/VOC ln -s /path/to/VOC /datasets/voc/VOC ``` Example configuration files for PASCAL VOC could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/pascal_voc/). ### PASCAL VOC Annotations in COCO Format To output COCO-style evaluation result, PASCAL VOC annotations in COCO json format is required and could be downloaded from [here](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip) via http://cocodataset.org/#external. ## Creating Symlinks for Cityscapes: We assume that your symlinked `datasets/cityscapes` directory has the following structure: ``` cityscapes |_ images | |_ .jpg | |_ ... | |_ .jpg |_ annotations | |_ instanceonly_gtFile_train.json | |_ ... |_ raw |_ gtFine |_ ... |_ README.md ``` Create symlinks for `cityscapes`: ``` cd ~/github/maskrcnn-benchmark mkdir -p datasets/cityscapes ln -s /path/to/cityscapes datasets/data/cityscapes ``` ### Steps to convert Cityscapes Annotations to COCO Format 1. Download gtFine_trainvaltest.zip from https://www.cityscapes-dataset.com/downloads/ (login required) 2. Extract it to /path/to/gtFine_trainvaltest ``` cityscapes |_ gtFine_trainvaltest.zip |_ gtFine_trainvaltest |_ gtFine ``` 3. Run the below commands to convert the annotations ``` cd ~/github git clone https://github.com/mcordts/cityscapesScripts.git cd cityscapesScripts cp ~/github/maskrcnn-benchmark/tools/cityscapes/instances2dict_with_polygons.py cityscapesscripts/evaluation python setup.py install cd ~/github/maskrcnn-benchmark python tools/cityscapes/convert_cityscapes_to_coco.py --datadir /path/to/cityscapes --outdir /path/to/cityscapes/annotations ``` Example configuration files for Cityscapes could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/cityscapes/). ================================================ FILE: maskrcnn_benchmark/data/__init__.py ================================================ # Copyright (c) Facebook, Inc. 
from .build import make_data_loader

================================================
FILE: maskrcnn_benchmark/data/build.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import bisect
import copy
import logging

import torch.utils.data
from maskrcnn_benchmark.utils.comm import get_world_size
from maskrcnn_benchmark.utils.imports import import_file

from . import datasets as D
from . import samplers
from .collate_batch import BatchCollator
from .transforms import build_transforms


def build_dataset(dataset_list, transforms, dataset_catalog, is_train=True):
    """
    Arguments:
        dataset_list (list[str]): Contains the names of the datasets, e.g.,
            coco_2014_train, coco_2014_val, etc
        transforms (callable): transforms to apply to each (image, target) sample
        dataset_catalog (DatasetCatalog): contains the information on how to
            construct a dataset.
        is_train (bool): whether to set up the dataset for training or testing
    """
    if not isinstance(dataset_list, (list, tuple)):
        raise RuntimeError(
            "dataset_list should be a list of strings, got {}".format(dataset_list)
        )
    datasets = []
    for dataset_name in dataset_list:
        data = dataset_catalog.get(dataset_name)
        factory = getattr(D, data["factory"])
        args = data["args"]
        # for COCODataset, we want to remove images without annotations
        # during training
        if data["factory"] == "COCODataset":
            args["remove_images_without_annotations"] = is_train
        if data["factory"] == "PascalVOCDataset":
            args["use_difficult"] = not is_train
        args["transforms"] = transforms
        # make dataset from factory
        dataset = factory(**args)
        datasets.append(dataset)

    # for testing, return a list of datasets
    if not is_train:
        return datasets

    # for training, concatenate all datasets into a single one
    dataset = datasets[0]
    if len(datasets) > 1:
        dataset = D.ConcatDataset(datasets)

    return [dataset]


def make_data_sampler(dataset, shuffle, distributed):
    if distributed:
        return samplers.DistributedSampler(dataset, shuffle=shuffle)
    if shuffle:
        sampler = torch.utils.data.sampler.RandomSampler(dataset)
    else:
        sampler = torch.utils.data.sampler.SequentialSampler(dataset)
    return sampler


def _quantize(x, bins):
    bins = copy.copy(bins)
    bins = sorted(bins)
    quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
    return quantized


def _compute_aspect_ratios(dataset):
    aspect_ratios = []
    for i in range(len(dataset)):
        img_info = dataset.get_img_info(i)
        aspect_ratio = float(img_info["height"]) / float(img_info["width"])
        aspect_ratios.append(aspect_ratio)
    return aspect_ratios


def make_batch_data_sampler(
    dataset, sampler, aspect_grouping, images_per_batch, num_iters=None, start_iter=0
):
    if aspect_grouping:
        if not isinstance(aspect_grouping, (list, tuple)):
            aspect_grouping = [aspect_grouping]
        aspect_ratios = _compute_aspect_ratios(dataset)
        group_ids = _quantize(aspect_ratios, aspect_grouping)
        batch_sampler = samplers.GroupedBatchSampler(
            sampler, group_ids, images_per_batch, drop_uneven=False
        )
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_batch, drop_last=False
        )
    if num_iters is not None:
        batch_sampler = samplers.IterationBasedBatchSampler(
            batch_sampler, num_iters, start_iter
        )
    return batch_sampler


def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        assert images_per_batch % num_gpus == 0, (
            "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        assert images_per_batch % num_gpus == 0, (
            "TEST.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = False if not is_distributed else True
        num_iters = None
        start_iter = 0

    if images_per_gpu > 1:
        logger = logging.getLogger(__name__)
        logger.warning(
            "When using more than one image per GPU you may encounter "
            "an out-of-memory (OOM) error if your GPU does not have "
            "sufficient memory. If this happens, you can reduce "
            "SOLVER.IMS_PER_BATCH (for training) or "
            "TEST.IMS_PER_BATCH (for inference). For training, you must "
            "also adjust the learning rate and schedule length according "
            "to the linear scaling rule. See for example: "
            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
        )

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    paths_catalog = import_file(
        "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True
    )
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    transforms = build_transforms(cfg, is_train)
    datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train)

    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(
            dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter
        )
        collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders
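A sketch of driving this module from a config, as a training script would (it assumes a COCO-style dataset has already been set up under `datasets/` per the README above; the batch it yields is collated by the BatchCollator below):

```python
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.data import make_data_loader

cfg.merge_from_file("configs/fcos/fcos_R_50_FPN_1x.yaml")
cfg.freeze()

# is_train=True returns a single (possibly concatenated) loader
data_loader = make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0)
images, targets, _ = next(iter(data_loader))
print(images.tensors.shape)  # padded batch, divisible by DATALOADER.SIZE_DIVISIBILITY
```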
used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = True num_iters = cfg.SOLVER.MAX_ITER else: images_per_batch = cfg.TEST.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = False if not is_distributed else True num_iters = None start_iter = 0 if images_per_gpu > 1: logger = logging.getLogger(__name__) logger.warning( "When using more than one image per GPU you may encounter " "an out-of-memory (OOM) error if your GPU does not have " "sufficient memory. If this happens, you can reduce " "SOLVER.IMS_PER_BATCH (for training) or " "TEST.IMS_PER_BATCH (for inference). For training, you must " "also adjust the learning rate and schedule length according " "to the linear scaling rule. See for example: " "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" ) # group images which have similar aspect ratio. In this case, we only # group in two cases: those with width / height > 1, and the other way around, # but the code supports more general grouping strategy aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] paths_catalog = import_file( "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True ) DatasetCatalog = paths_catalog.DatasetCatalog dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST transforms = build_transforms(cfg, is_train) datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train) data_loaders = [] for dataset in datasets: sampler = make_data_sampler(dataset, shuffle, is_distributed) batch_sampler = make_batch_data_sampler( dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter ) collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) num_workers = cfg.DATALOADER.NUM_WORKERS data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=collator, ) data_loaders.append(data_loader) if is_train: # during training, a single (possibly concatenated) data_loader is returned assert len(data_loaders) == 1 return data_loaders[0] return data_loaders ================================================ FILE: maskrcnn_benchmark/data/collate_batch.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from maskrcnn_benchmark.structures.image_list import to_image_list class BatchCollator(object): """ From a list of samples from the dataset, returns the batched images and targets. This should be passed to the DataLoader """ def __init__(self, size_divisible=0): self.size_divisible = size_divisible def __call__(self, batch): transposed_batch = list(zip(*batch)) images = to_image_list(transposed_batch[0], self.size_divisible) targets = transposed_batch[1] img_ids = transposed_batch[2] return images, targets, img_ids ================================================ FILE: maskrcnn_benchmark/data/datasets/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
from .coco import COCODataset from .voc import PascalVOCDataset from .concat_dataset import ConcatDataset __all__ = ["COCODataset", "ConcatDataset", "PascalVOCDataset"] ================================================ FILE: maskrcnn_benchmark/data/datasets/coco.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch import torchvision from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask from maskrcnn_benchmark.structures.keypoint import PersonKeypoints min_keypoints_per_image = 10 def _count_visible_keypoints(anno): return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) def _has_only_empty_bbox(anno): return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) def has_valid_annotation(anno): # if it's empty, there is no annotation if len(anno) == 0: return False # if all boxes have close to zero area, there is no annotation if _has_only_empty_bbox(anno): return False # keypoints task have a slight different critera for considering # if an annotation is valid if "keypoints" not in anno[0]: return True # for keypoint detection tasks, only consider valid images those # containing at least min_keypoints_per_image if _count_visible_keypoints(anno) >= min_keypoints_per_image: return True return False class COCODataset(torchvision.datasets.coco.CocoDetection): def __init__( self, ann_file, root, remove_images_without_annotations, transforms=None ): super(COCODataset, self).__init__(root, ann_file) # sort indices for reproducible results self.ids = sorted(self.ids) # filter images without detection annotations if remove_images_without_annotations: ids = [] for img_id in self.ids: ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None) anno = self.coco.loadAnns(ann_ids) if has_valid_annotation(anno): ids.append(img_id) self.ids = ids self.json_category_id_to_contiguous_id = { v: i + 1 for i, v in enumerate(self.coco.getCatIds()) } self.contiguous_category_id_to_json_id = { v: k for k, v in self.json_category_id_to_contiguous_id.items() } self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} self.transforms = transforms def __getitem__(self, idx): img, anno = super(COCODataset, self).__getitem__(idx) # filter crowd annotations # TODO might be better to add an extra field anno = [obj for obj in anno if obj["iscrowd"] == 0] boxes = [obj["bbox"] for obj in anno] boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes target = BoxList(boxes, img.size, mode="xywh").convert("xyxy") classes = [obj["category_id"] for obj in anno] classes = [self.json_category_id_to_contiguous_id[c] for c in classes] classes = torch.tensor(classes) target.add_field("labels", classes) masks = [obj["segmentation"] for obj in anno] masks = SegmentationMask(masks, img.size, mode='poly') target.add_field("masks", masks) if anno and "keypoints" in anno[0]: keypoints = [obj["keypoints"] for obj in anno] keypoints = PersonKeypoints(keypoints, img.size) target.add_field("keypoints", keypoints) target = target.clip_to_image(remove_empty=True) if self.transforms is not None: img, target = self.transforms(img, target) return img, target, idx def get_img_info(self, index): img_id = self.id_to_img_map[index] img_data = self.coco.imgs[img_id] return img_data ================================================ FILE: maskrcnn_benchmark/data/datasets/concat_dataset.py ================================================ # Copyright (c) 
Facebook, Inc. and its affiliates. All Rights Reserved. import bisect from torch.utils.data.dataset import ConcatDataset as _ConcatDataset class ConcatDataset(_ConcatDataset): """ Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra method for querying the sizes of the image """ def get_idxs(self, idx): dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) if dataset_idx == 0: sample_idx = idx else: sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] return dataset_idx, sample_idx def get_img_info(self, idx): dataset_idx, sample_idx = self.get_idxs(idx) return self.datasets[dataset_idx].get_img_info(sample_idx) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/__init__.py ================================================ from maskrcnn_benchmark.data import datasets from .coco import coco_evaluation from .voc import voc_evaluation def evaluate(dataset, predictions, output_folder, **kwargs): """evaluate dataset using different methods based on dataset type. Args: dataset: Dataset object predictions(list[BoxList]): each item in the list represents the prediction results for one image. output_folder: output folder, to save evaluation files or results. **kwargs: other args. Returns: evaluation result """ args = dict( dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs ) if isinstance(dataset, datasets.COCODataset): return coco_evaluation(**args) elif isinstance(dataset, datasets.PascalVOCDataset): return voc_evaluation(**args) else: dataset_name = dataset.__class__.__name__ raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/coco/__init__.py ================================================ from .coco_eval import do_coco_evaluation def coco_evaluation( dataset, predictions, output_folder, box_only, iou_types, expected_results, expected_results_sigma_tol, ): return do_coco_evaluation( dataset=dataset, predictions=predictions, box_only=box_only, output_folder=output_folder, iou_types=iou_types, expected_results=expected_results, expected_results_sigma_tol=expected_results_sigma_tol, ) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/coco/coco_eval.py ================================================ import logging import tempfile import os import torch from collections import OrderedDict from tqdm import tqdm from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou def do_coco_evaluation( dataset, predictions, box_only, output_folder, iou_types, expected_results, expected_results_sigma_tol, ): logger = logging.getLogger("maskrcnn_benchmark.inference") if box_only: logger.info("Evaluating bbox proposals") areas = {"all": "", "small": "s", "medium": "m", "large": "l"} res = COCOResults("box_proposal") for limit in [100, 1000]: for area, suffix in areas.items(): stats = evaluate_box_proposals( predictions, dataset, area=area, limit=limit ) key = "AR{}@{:d}".format(suffix, limit) res.results["box_proposal"][key] = stats["ar"].item() logger.info(res) check_expected_results(res, expected_results, expected_results_sigma_tol) if output_folder: torch.save(res, os.path.join(output_folder, "box_proposals.pth")) return logger.info("Preparing results for COCO format") coco_results = {} if 
"bbox" in iou_types: logger.info("Preparing bbox results") coco_results["bbox"] = prepare_for_coco_detection(predictions, dataset) if "segm" in iou_types: logger.info("Preparing segm results") coco_results["segm"] = prepare_for_coco_segmentation(predictions, dataset) if 'keypoints' in iou_types: logger.info('Preparing keypoints results') coco_results['keypoints'] = prepare_for_coco_keypoint(predictions, dataset) results = COCOResults(*iou_types) logger.info("Evaluating predictions") for iou_type in iou_types: with tempfile.NamedTemporaryFile() as f: file_path = f.name if output_folder: file_path = os.path.join(output_folder, iou_type + ".json") res = evaluate_predictions_on_coco( dataset.coco, coco_results[iou_type], file_path, iou_type ) results.update(res) logger.info(results) check_expected_results(results, expected_results, expected_results_sigma_tol) if output_folder: torch.save(results, os.path.join(output_folder, "coco_results.pth")) return results, coco_results def prepare_for_coco_detection(predictions, dataset): # assert isinstance(dataset, COCODataset) coco_results = [] for image_id, prediction in enumerate(predictions): original_id = dataset.id_to_img_map[image_id] if len(prediction) == 0: continue img_info = dataset.get_img_info(image_id) image_width = img_info["width"] image_height = img_info["height"] prediction = prediction.resize((image_width, image_height)) prediction = prediction.convert("xywh") boxes = prediction.bbox.tolist() scores = prediction.get_field("scores").tolist() labels = prediction.get_field("labels").tolist() mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] coco_results.extend( [ { "image_id": original_id, "category_id": mapped_labels[k], "bbox": box, "score": scores[k], } for k, box in enumerate(boxes) ] ) return coco_results def prepare_for_coco_segmentation(predictions, dataset): import pycocotools.mask as mask_util import numpy as np masker = Masker(threshold=0.5, padding=1) # assert isinstance(dataset, COCODataset) coco_results = [] for image_id, prediction in tqdm(enumerate(predictions)): original_id = dataset.id_to_img_map[image_id] if len(prediction) == 0: continue img_info = dataset.get_img_info(image_id) image_width = img_info["width"] image_height = img_info["height"] prediction = prediction.resize((image_width, image_height)) masks = prediction.get_field("mask") # t = time.time() # Masker is necessary only if masks haven't been already resized. 
if list(masks.shape[-2:]) != [image_height, image_width]: masks = masker(masks.expand(1, -1, -1, -1, -1), prediction) masks = masks[0] # logger.info('Time mask: {}'.format(time.time() - t)) # prediction = prediction.convert('xywh') # boxes = prediction.bbox.tolist() scores = prediction.get_field("scores").tolist() labels = prediction.get_field("labels").tolist() # rles = prediction.get_field('mask') rles = [ mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0] for mask in masks ] for rle in rles: rle["counts"] = rle["counts"].decode("utf-8") mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] coco_results.extend( [ { "image_id": original_id, "category_id": mapped_labels[k], "segmentation": rle, "score": scores[k], } for k, rle in enumerate(rles) ] ) return coco_results def prepare_for_coco_keypoint(predictions, dataset): # assert isinstance(dataset, COCODataset) coco_results = [] for image_id, prediction in enumerate(predictions): original_id = dataset.id_to_img_map[image_id] if len(prediction.bbox) == 0: continue # TODO replace with get_img_info? image_width = dataset.coco.imgs[original_id]['width'] image_height = dataset.coco.imgs[original_id]['height'] prediction = prediction.resize((image_width, image_height)) prediction = prediction.convert('xywh') boxes = prediction.bbox.tolist() scores = prediction.get_field('scores').tolist() labels = prediction.get_field('labels').tolist() keypoints = prediction.get_field('keypoints') keypoints = keypoints.resize((image_width, image_height)) keypoints = keypoints.keypoints.view(keypoints.keypoints.shape[0], -1).tolist() mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] coco_results.extend([{ 'image_id': original_id, 'category_id': mapped_labels[k], 'keypoints': keypoint, 'score': scores[k]} for k, keypoint in enumerate(keypoints)]) return coco_results # inspired from Detectron def evaluate_box_proposals( predictions, dataset, thresholds=None, area="all", limit=None ): """Evaluate detection proposal recall metrics. This function is a much faster alternative to the official COCO API recall evaluation code. However, it produces slightly different results. 
""" # Record max overlap value for each gt box # Return vector of overlap values areas = { "all": 0, "small": 1, "medium": 2, "large": 3, "96-128": 4, "128-256": 5, "256-512": 6, "512-inf": 7, } area_ranges = [ [0 ** 2, 1e5 ** 2], # all [0 ** 2, 32 ** 2], # small [32 ** 2, 96 ** 2], # medium [96 ** 2, 1e5 ** 2], # large [96 ** 2, 128 ** 2], # 96-128 [128 ** 2, 256 ** 2], # 128-256 [256 ** 2, 512 ** 2], # 256-512 [512 ** 2, 1e5 ** 2], ] # 512-inf assert area in areas, "Unknown area range: {}".format(area) area_range = area_ranges[areas[area]] gt_overlaps = [] num_pos = 0 for image_id, prediction in enumerate(predictions): original_id = dataset.id_to_img_map[image_id] img_info = dataset.get_img_info(image_id) image_width = img_info["width"] image_height = img_info["height"] prediction = prediction.resize((image_width, image_height)) # sort predictions in descending order # TODO maybe remove this and make it explicit in the documentation inds = prediction.get_field("objectness").sort(descending=True)[1] prediction = prediction[inds] ann_ids = dataset.coco.getAnnIds(imgIds=original_id) anno = dataset.coco.loadAnns(ann_ids) gt_boxes = [obj["bbox"] for obj in anno if obj["iscrowd"] == 0] gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes gt_boxes = BoxList(gt_boxes, (image_width, image_height), mode="xywh").convert( "xyxy" ) gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0]) if len(gt_boxes) == 0: continue valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) gt_boxes = gt_boxes[valid_gt_inds] num_pos += len(gt_boxes) if len(gt_boxes) == 0: continue if len(prediction) == 0: continue if limit is not None and len(prediction) > limit: prediction = prediction[:limit] overlaps = boxlist_iou(prediction, gt_boxes) _gt_overlaps = torch.zeros(len(gt_boxes)) for j in range(min(len(prediction), len(gt_boxes))): # find which proposal box maximally covers each gt box # and get the iou amount of coverage for each gt box max_overlaps, argmax_overlaps = overlaps.max(dim=0) # find which gt box is 'best' covered (i.e. 
'best' = most iou) gt_ovr, gt_ind = max_overlaps.max(dim=0) assert gt_ovr >= 0 # find the proposal box that covers the best covered gt box box_ind = argmax_overlaps[gt_ind] # record the iou coverage of this gt box _gt_overlaps[j] = overlaps[box_ind, gt_ind] assert _gt_overlaps[j] == gt_ovr # mark the proposal box and the gt box as used overlaps[box_ind, :] = -1 overlaps[:, gt_ind] = -1 # append recorded iou coverage level gt_overlaps.append(_gt_overlaps) gt_overlaps = torch.cat(gt_overlaps, dim=0) gt_overlaps, _ = torch.sort(gt_overlaps) if thresholds is None: step = 0.05 thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) recalls = torch.zeros_like(thresholds) # compute recall for each iou threshold for i, t in enumerate(thresholds): recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) # ar = 2 * np.trapz(recalls, thresholds) ar = recalls.mean() return { "ar": ar, "recalls": recalls, "thresholds": thresholds, "gt_overlaps": gt_overlaps, "num_pos": num_pos, } def evaluate_predictions_on_coco( coco_gt, coco_results, json_result_file, iou_type="bbox" ): import json with open(json_result_file, "w") as f: json.dump(coco_results, f) from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval coco_dt = coco_gt.loadRes(str(json_result_file)) if coco_results else COCO() # coco_dt = coco_gt.loadRes(coco_results) coco_eval = COCOeval(coco_gt, coco_dt, iou_type) coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() compute_thresholds_for_classes(coco_eval) return coco_eval def compute_thresholds_for_classes(coco_eval): ''' The function is used to compute the thresholds corresponding to best f-measure. The resulting thresholds are used in fcos_demo.py. :param coco_eval: :return: ''' import numpy as np # dimension of precision: [TxRxKxAxM] precision = coco_eval.eval['precision'] # we compute thresholds with IOU being 0.5 precision = precision[0, :, :, 0, -1] scores = coco_eval.eval['scores'] scores = scores[0, :, :, 0, -1] recall = np.linspace(0, 1, num=precision.shape[0]) recall = recall[:, None] f_measure = (2 * precision * recall) / (np.maximum(precision + recall, 1e-6)) max_f_measure = f_measure.max(axis=0) max_f_measure_inds = f_measure.argmax(axis=0) scores = scores[max_f_measure_inds, range(len(max_f_measure_inds))] print("Maximum f-measures for classes:") print(list(max_f_measure)) print("Score thresholds for classes (used in demos for visualization purposes):") print(list(scores)) class COCOResults(object): METRICS = { "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"], "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"], "box_proposal": [ "AR@100", "ARs@100", "ARm@100", "ARl@100", "AR@1000", "ARs@1000", "ARm@1000", "ARl@1000", ], "keypoints": ["AP", "AP50", "AP75", "APm", "APl"], } def __init__(self, *iou_types): allowed_types = ("box_proposal", "bbox", "segm", "keypoints") assert all(iou_type in allowed_types for iou_type in iou_types) results = OrderedDict() for iou_type in iou_types: results[iou_type] = OrderedDict( [(metric, -1) for metric in COCOResults.METRICS[iou_type]] ) self.results = results def update(self, coco_eval): if coco_eval is None: return from pycocotools.cocoeval import COCOeval assert isinstance(coco_eval, COCOeval) s = coco_eval.stats iou_type = coco_eval.params.iouType res = self.results[iou_type] metrics = COCOResults.METRICS[iou_type] for idx, metric in enumerate(metrics): res[metric] = s[idx] def __repr__(self): # TODO make it pretty return repr(self.results) def check_expected_results(results, 
expected_results, sigma_tol): if not expected_results: return logger = logging.getLogger("maskrcnn_benchmark.inference") for task, metric, (mean, std) in expected_results: actual_val = results.results[task][metric] lo = mean - sigma_tol * std hi = mean + sigma_tol * std ok = (lo < actual_val) and (actual_val < hi) msg = ( "{} > {} sanity check (actual vs. expected): " "{:.3f} vs. mean={:.4f}, std={:.4}, range=({:.4f}, {:.4f})" ).format(task, metric, actual_val, mean, std, lo, hi) if not ok: msg = "FAIL: " + msg logger.error(msg) else: msg = "PASS: " + msg logger.info(msg) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/voc/__init__.py ================================================ import logging from .voc_eval import do_voc_evaluation def voc_evaluation(dataset, predictions, output_folder, box_only, **_): logger = logging.getLogger("maskrcnn_benchmark.inference") if box_only: logger.warning("voc evaluation doesn't support box_only, ignored.") logger.info("performing voc evaluation, ignored iou_types.") return do_voc_evaluation( dataset=dataset, predictions=predictions, output_folder=output_folder, logger=logger, ) ================================================ FILE: maskrcnn_benchmark/data/datasets/evaluation/voc/voc_eval.py ================================================ # A modification version from chainercv repository. # (See https://github.com/chainer/chainercv/blob/master/chainercv/evaluations/eval_detection_voc.py) from __future__ import division import os from collections import defaultdict import numpy as np from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou def do_voc_evaluation(dataset, predictions, output_folder, logger): # TODO need to make the use_07_metric format available # for the user to choose pred_boxlists = [] gt_boxlists = [] for image_id, prediction in enumerate(predictions): img_info = dataset.get_img_info(image_id) if len(prediction) == 0: continue image_width = img_info["width"] image_height = img_info["height"] prediction = prediction.resize((image_width, image_height)) pred_boxlists.append(prediction) gt_boxlist = dataset.get_groundtruth(image_id) gt_boxlists.append(gt_boxlist) result = eval_detection_voc( pred_boxlists=pred_boxlists, gt_boxlists=gt_boxlists, iou_thresh=0.5, use_07_metric=True, ) result_str = "mAP: {:.4f}\n".format(result["map"]) for i, ap in enumerate(result["ap"]): if i == 0: # skip background continue result_str += "{:<16}: {:.4f}\n".format( dataset.map_class_id_to_class_name(i), ap ) logger.info(result_str) if output_folder: with open(os.path.join(output_folder, "result.txt"), "w") as fid: fid.write(result_str) return result def eval_detection_voc(pred_boxlists, gt_boxlists, iou_thresh=0.5, use_07_metric=False): """Evaluate on voc dataset. Args: pred_boxlists(list[BoxList]): pred boxlist, has labels and scores fields. gt_boxlists(list[BoxList]): ground truth boxlist, has labels field. iou_thresh: iou thresh use_07_metric: boolean Returns: dict represents the results """ assert len(gt_boxlists) == len( pred_boxlists ), "Length of gt and pred lists need to be same." 
prec, rec = calc_detection_voc_prec_rec( pred_boxlists=pred_boxlists, gt_boxlists=gt_boxlists, iou_thresh=iou_thresh ) ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric) return {"ap": ap, "map": np.nanmean(ap)} def calc_detection_voc_prec_rec(gt_boxlists, pred_boxlists, iou_thresh=0.5): """Calculate precision and recall based on evaluation code of PASCAL VOC. This function calculates precision and recall of predicted bounding boxes obtained from a dataset which has :math:`N` images. The code is based on the evaluation code used in PASCAL VOC Challenge. """ n_pos = defaultdict(int) score = defaultdict(list) match = defaultdict(list) for gt_boxlist, pred_boxlist in zip(gt_boxlists, pred_boxlists): pred_bbox = pred_boxlist.bbox.numpy() pred_label = pred_boxlist.get_field("labels").numpy() pred_score = pred_boxlist.get_field("scores").numpy() gt_bbox = gt_boxlist.bbox.numpy() gt_label = gt_boxlist.get_field("labels").numpy() gt_difficult = gt_boxlist.get_field("difficult").numpy() for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)): pred_mask_l = pred_label == l pred_bbox_l = pred_bbox[pred_mask_l] pred_score_l = pred_score[pred_mask_l] # sort by score order = pred_score_l.argsort()[::-1] pred_bbox_l = pred_bbox_l[order] pred_score_l = pred_score_l[order] gt_mask_l = gt_label == l gt_bbox_l = gt_bbox[gt_mask_l] gt_difficult_l = gt_difficult[gt_mask_l] n_pos[l] += np.logical_not(gt_difficult_l).sum() score[l].extend(pred_score_l) if len(pred_bbox_l) == 0: continue if len(gt_bbox_l) == 0: match[l].extend((0,) * pred_bbox_l.shape[0]) continue # VOC evaluation follows integer typed bounding boxes. pred_bbox_l = pred_bbox_l.copy() pred_bbox_l[:, 2:] += 1 gt_bbox_l = gt_bbox_l.copy() gt_bbox_l[:, 2:] += 1 iou = boxlist_iou( BoxList(pred_bbox_l, gt_boxlist.size), BoxList(gt_bbox_l, gt_boxlist.size), ).numpy() gt_index = iou.argmax(axis=1) # set -1 if there is no matching ground truth gt_index[iou.max(axis=1) < iou_thresh] = -1 del iou selec = np.zeros(gt_bbox_l.shape[0], dtype=bool) for gt_idx in gt_index: if gt_idx >= 0: if gt_difficult_l[gt_idx]: match[l].append(-1) else: if not selec[gt_idx]: match[l].append(1) else: match[l].append(0) selec[gt_idx] = True else: match[l].append(0) n_fg_class = max(n_pos.keys()) + 1 prec = [None] * n_fg_class rec = [None] * n_fg_class for l in n_pos.keys(): score_l = np.array(score[l]) match_l = np.array(match[l], dtype=np.int8) order = score_l.argsort()[::-1] match_l = match_l[order] tp = np.cumsum(match_l == 1) fp = np.cumsum(match_l == 0) # If an element of fp + tp is 0, # the corresponding element of prec[l] is nan. prec[l] = tp / (fp + tp) # If n_pos[l] is 0, rec[l] is None. if n_pos[l] > 0: rec[l] = tp / n_pos[l] return prec, rec def calc_detection_voc_ap(prec, rec, use_07_metric=False): """Calculate average precisions based on evaluation code of PASCAL VOC. This function calculates average precisions from given precisions and recalls. The code is based on the evaluation code used in PASCAL VOC Challenge. Args: prec (list of numpy.array): A list of arrays. :obj:`prec[l]` indicates precision for class :math:`l`. If :obj:`prec[l]` is :obj:`None`, this function returns :obj:`numpy.nan` for class :math:`l`. rec (list of numpy.array): A list of arrays. :obj:`rec[l]` indicates recall for class :math:`l`. If :obj:`rec[l]` is :obj:`None`, this function returns :obj:`numpy.nan` for class :math:`l`. use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric for calculating average precision. 
The default value is :obj:`False`. Returns: ~numpy.ndarray: This function returns an array of average precisions. The :math:`l`-th value corresponds to the average precision for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is :obj:`None`, the corresponding value is set to :obj:`numpy.nan`. """ n_fg_class = len(prec) ap = np.empty(n_fg_class) for l in range(n_fg_class): if prec[l] is None or rec[l] is None: ap[l] = np.nan continue if use_07_metric: # 11 point metric ap[l] = 0 for t in np.arange(0.0, 1.1, 0.1): if np.sum(rec[l] >= t) == 0: p = 0 else: p = np.max(np.nan_to_num(prec[l])[rec[l] >= t]) ap[l] += p / 11 else: # correct AP calculation # first append sentinel values at the end mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0])) mrec = np.concatenate(([0], rec[l], [1])) mpre = np.maximum.accumulate(mpre[::-1])[::-1] # to calculate area under PR curve, look for points # where X axis (recall) changes value i = np.where(mrec[1:] != mrec[:-1])[0] # and sum (\Delta recall) * prec ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) return ap ================================================ FILE: maskrcnn_benchmark/data/datasets/list_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ Simple dataset class that wraps a list of path names """ from PIL import Image from maskrcnn_benchmark.structures.bounding_box import BoxList class ListDataset(object): def __init__(self, image_lists, transforms=None): self.image_lists = image_lists self.transforms = transforms def __getitem__(self, item): img = Image.open(self.image_lists[item]).convert("RGB") # dummy target w, h = img.size target = BoxList([[0, 0, w, h]], img.size, mode="xyxy") if self.transforms is not None: img, target = self.transforms(img, target) return img, target def __len__(self): return len(self.image_lists) def get_img_info(self, item): """ Return the image dimensions for the image, without loading and pre-processing it """ pass ================================================ FILE: maskrcnn_benchmark/data/datasets/voc.py ================================================ import os import torch import torch.utils.data from PIL import Image import sys if sys.version_info[0] == 2: import xml.etree.cElementTree as ET else: import xml.etree.ElementTree as ET from maskrcnn_benchmark.structures.bounding_box import BoxList class PascalVOCDataset(torch.utils.data.Dataset): CLASSES = ( "__background__ ", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor", ) def __init__(self, data_dir, split, use_difficult=False, transforms=None): self.root = data_dir self.image_set = split self.keep_difficult = use_difficult self.transforms = transforms self._annopath = os.path.join(self.root, "Annotations", "%s.xml") self._imgpath = os.path.join(self.root, "JPEGImages", "%s.jpg") self._imgsetpath = os.path.join(self.root, "ImageSets", "Main", "%s.txt") with open(self._imgsetpath % self.image_set) as f: self.ids = f.readlines() self.ids = [x.strip("\n") for x in self.ids] self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} cls = PascalVOCDataset.CLASSES self.class_to_ind = dict(zip(cls, range(len(cls)))) def __getitem__(self, index): img_id = self.ids[index] img = Image.open(self._imgpath % img_id).convert("RGB") target = self.get_groundtruth(index) target = target.clip_to_image(remove_empty=True) if 
self.transforms is not None: img, target = self.transforms(img, target) return img, target, index def __len__(self): return len(self.ids) def get_groundtruth(self, index): img_id = self.ids[index] anno = ET.parse(self._annopath % img_id).getroot() anno = self._preprocess_annotation(anno) height, width = anno["im_info"] target = BoxList(anno["boxes"], (width, height), mode="xyxy") target.add_field("labels", anno["labels"]) target.add_field("difficult", anno["difficult"]) return target def _preprocess_annotation(self, target): boxes = [] gt_classes = [] difficult_boxes = [] TO_REMOVE = 1 for obj in target.iter("object"): difficult = int(obj.find("difficult").text) == 1 if not self.keep_difficult and difficult: continue name = obj.find("name").text.lower().strip() bb = obj.find("bndbox") # Make pixel indexes 0-based # Refer to "https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/pascal_voc.py#L208-L211" box = [ bb.find("xmin").text, bb.find("ymin").text, bb.find("xmax").text, bb.find("ymax").text, ] bndbox = tuple( map(lambda x: x - TO_REMOVE, list(map(int, box))) ) boxes.append(bndbox) gt_classes.append(self.class_to_ind[name]) difficult_boxes.append(difficult) size = target.find("size") im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) res = { "boxes": torch.tensor(boxes, dtype=torch.float32), "labels": torch.tensor(gt_classes), "difficult": torch.tensor(difficult_boxes), "im_info": im_info, } return res def get_img_info(self, index): img_id = self.ids[index] anno = ET.parse(self._annopath % img_id).getroot() size = anno.find("size") im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) return {"height": im_info[0], "width": im_info[1]} def map_class_id_to_class_name(self, class_id): return PascalVOCDataset.CLASSES[class_id] ================================================ FILE: maskrcnn_benchmark/data/samplers/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .distributed import DistributedSampler from .grouped_batch_sampler import GroupedBatchSampler from .iteration_based_batch_sampler import IterationBasedBatchSampler __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] ================================================ FILE: maskrcnn_benchmark/data/samplers/distributed.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # Code is copy-pasted exactly as in torch.utils.data.distributed. # FIXME remove this once c10d fixes the bug it has import math import torch import torch.distributed as dist from torch.utils.data.sampler import Sampler class DistributedSampler(Sampler): """Sampler that restricts data loading to a subset of the dataset. It is especially useful in conjunction with :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each process can pass a DistributedSampler instance as a DataLoader sampler, and load a subset of the original dataset that is exclusive to it. .. note:: Dataset is assumed to be of constant size. Arguments: dataset: Dataset used for sampling. num_replicas (optional): Number of processes participating in distributed training. rank (optional): Rank of the current process within num_replicas. 
""" def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): if num_replicas is None: if not dist.is_available(): raise RuntimeError("Requires distributed package to be available") num_replicas = dist.get_world_size() if rank is None: if not dist.is_available(): raise RuntimeError("Requires distributed package to be available") rank = dist.get_rank() self.dataset = dataset self.num_replicas = num_replicas self.rank = rank self.epoch = 0 self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) self.total_size = self.num_samples * self.num_replicas self.shuffle = shuffle def __iter__(self): if self.shuffle: # deterministically shuffle based on epoch g = torch.Generator() g.manual_seed(self.epoch) indices = torch.randperm(len(self.dataset), generator=g).tolist() else: indices = torch.arange(len(self.dataset)).tolist() # add extra samples to make it evenly divisible indices += indices[: (self.total_size - len(indices))] assert len(indices) == self.total_size # subsample offset = self.num_samples * self.rank indices = indices[offset : offset + self.num_samples] assert len(indices) == self.num_samples return iter(indices) def __len__(self): return self.num_samples def set_epoch(self, epoch): self.epoch = epoch ================================================ FILE: maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import itertools import torch from torch.utils.data.sampler import BatchSampler from torch.utils.data.sampler import Sampler class GroupedBatchSampler(BatchSampler): """ Wraps another sampler to yield a mini-batch of indices. It enforces that elements from the same group should appear in groups of batch_size. It also tries to provide mini-batches which follows an ordering which is as close as possible to the ordering from the original sampler. Arguments: sampler (Sampler): Base sampler. batch_size (int): Size of mini-batch. drop_uneven (bool): If ``True``, the sampler will drop the batches whose size is less than ``batch_size`` """ def __init__(self, sampler, group_ids, batch_size, drop_uneven=False): if not isinstance(sampler, Sampler): raise ValueError( "sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler) ) self.sampler = sampler self.group_ids = torch.as_tensor(group_ids) assert self.group_ids.dim() == 1 self.batch_size = batch_size self.drop_uneven = drop_uneven self.groups = torch.unique(self.group_ids).sort(0)[0] self._can_reuse_batches = False def _prepare_batches(self): dataset_size = len(self.group_ids) # get the sampled indices from the sampler sampled_ids = torch.as_tensor(list(self.sampler)) # potentially not all elements of the dataset were sampled # by the sampler (e.g., DistributedSampler). # construct a tensor which contains -1 if the element was # not sampled, and a non-negative number indicating the # order where the element was sampled. # for example. 
if sampled_ids = [3, 1] and dataset_size = 5, # the order is [-1, 1, -1, 0, -1] order = torch.full((dataset_size,), -1, dtype=torch.int64) order[sampled_ids] = torch.arange(len(sampled_ids)) # get a mask with the elements that were sampled mask = order >= 0 # find the elements that belong to each individual cluster clusters = [(self.group_ids == i) & mask for i in self.groups] # get relative order of the elements inside each cluster # that follows the order from the sampler relative_order = [order[cluster] for cluster in clusters] # with the relative order, find the absolute order in the # sampled space permutation_ids = [s[s.sort()[1]] for s in relative_order] # permute each cluster so that they follow the order from # the sampler permuted_clusters = [sampled_ids[idx] for idx in permutation_ids] # splits each cluster in batch_size, and merge as a list of tensors splits = [c.split(self.batch_size) for c in permuted_clusters] merged = tuple(itertools.chain.from_iterable(splits)) # now each batch internally has the right order, but # they are grouped by clusters. Find the permutation between # different batches that brings them as close as possible to # the order that we have in the sampler. For that, we will consider the # ordering as coming from the first element of each batch, and sort # correspondingly first_element_of_batch = [t[0].item() for t in merged] # get and inverse mapping from sampled indices and the position where # they occur (as returned by the sampler) inv_sampled_ids_map = {v: k for k, v in enumerate(sampled_ids.tolist())} # from the first element in each batch, get a relative ordering first_index_of_batch = torch.as_tensor( [inv_sampled_ids_map[s] for s in first_element_of_batch] ) # permute the batches so that they approximately follow the order # from the sampler permutation_order = first_index_of_batch.sort(0)[1].tolist() # finally, permute the batches batches = [merged[i].tolist() for i in permutation_order] if self.drop_uneven: kept = [] for batch in batches: if len(batch) == self.batch_size: kept.append(batch) batches = kept return batches def __iter__(self): if self._can_reuse_batches: batches = self._batches self._can_reuse_batches = False else: batches = self._prepare_batches() self._batches = batches return iter(batches) def __len__(self): if not hasattr(self, "_batches"): self._batches = self._prepare_batches() self._can_reuse_batches = True return len(self._batches) ================================================ FILE: maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
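# Minimal usage sketch (illustrative only; SequentialSampler and the sizes
# below are assumptions for the example, not code used elsewhere in this repo):
#
#   from torch.utils.data.sampler import BatchSampler, SequentialSampler
#
#   sampler = SequentialSampler(range(10))
#   batch_sampler = BatchSampler(sampler, batch_size=4, drop_last=False)
#   batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations=5)
#   batches = list(batch_sampler)  # exactly 5 batches, cycling over the data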
from torch.utils.data.sampler import BatchSampler class IterationBasedBatchSampler(BatchSampler): """ Wraps a BatchSampler, resampling from it until a specified number of iterations have been sampled """ def __init__(self, batch_sampler, num_iterations, start_iter=0): self.batch_sampler = batch_sampler self.num_iterations = num_iterations self.start_iter = start_iter def __iter__(self): iteration = self.start_iter while iteration <= self.num_iterations: # if the underlying sampler has a set_epoch method, like # DistributedSampler, used for making each process see # a different split of the dataset, then set it if hasattr(self.batch_sampler.sampler, "set_epoch"): self.batch_sampler.sampler.set_epoch(iteration) for batch in self.batch_sampler: iteration += 1 if iteration > self.num_iterations: break yield batch def __len__(self): return self.num_iterations ================================================ FILE: maskrcnn_benchmark/data/transforms/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .transforms import Compose from .transforms import Resize from .transforms import RandomHorizontalFlip from .transforms import ToTensor from .transforms import Normalize from .build import build_transforms ================================================ FILE: maskrcnn_benchmark/data/transforms/build.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from . import transforms as T def build_transforms(cfg, is_train=True): if is_train: if cfg.INPUT.MIN_SIZE_RANGE_TRAIN[0] == -1: min_size = cfg.INPUT.MIN_SIZE_TRAIN else: assert len(cfg.INPUT.MIN_SIZE_RANGE_TRAIN) == 2, \ "MIN_SIZE_RANGE_TRAIN must have two elements (lower bound, upper bound)" min_size = list(range( cfg.INPUT.MIN_SIZE_RANGE_TRAIN[0], cfg.INPUT.MIN_SIZE_RANGE_TRAIN[1] + 1 )) max_size = cfg.INPUT.MAX_SIZE_TRAIN flip_prob = 0.5 # cfg.INPUT.FLIP_PROB_TRAIN else: min_size = cfg.INPUT.MIN_SIZE_TEST max_size = cfg.INPUT.MAX_SIZE_TEST flip_prob = 0 to_bgr255 = cfg.INPUT.TO_BGR255 normalize_transform = T.Normalize( mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=to_bgr255 ) transform = T.Compose( [ T.Resize(min_size, max_size), T.RandomHorizontalFlip(flip_prob), T.ToTensor(), normalize_transform, ] ) return transform ================================================ FILE: maskrcnn_benchmark/data/transforms/transforms.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
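# Minimal usage sketch (illustrative only; the image size and box below are
# made-up values, and build_transforms() in build.py is the real entry point):
#
#   from PIL import Image
#   from maskrcnn_benchmark.structures.bounding_box import BoxList
#
#   img = Image.new("RGB", (640, 480))
#   target = BoxList([[10, 10, 100, 100]], img.size, mode="xyxy")
#   transform = Compose([Resize(800, 1333), RandomHorizontalFlip(0.5), ToTensor()])
#   img, target = transform(img, target)  # the target is resized and flipped
#                                         # in lockstep with the image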
import random import torch import torchvision from torchvision.transforms import functional as F class Compose(object): def __init__(self, transforms): self.transforms = transforms def __call__(self, image, target): for t in self.transforms: image, target = t(image, target) return image, target def __repr__(self): format_string = self.__class__.__name__ + "(" for t in self.transforms: format_string += "\n" format_string += " {0}".format(t) format_string += "\n)" return format_string class Resize(object): def __init__(self, min_size, max_size): if not isinstance(min_size, (list, tuple)): min_size = (min_size,) self.min_size = min_size self.max_size = max_size # modified from torchvision to add support for max size def get_size(self, image_size): w, h = image_size size = random.choice(self.min_size) max_size = self.max_size if max_size is not None: min_original_size = float(min((w, h))) max_original_size = float(max((w, h))) if max_original_size / min_original_size * size > max_size: size = int(round(max_size * min_original_size / max_original_size)) if (w <= h and w == size) or (h <= w and h == size): return (h, w) if w < h: ow = size oh = int(size * h / w) else: oh = size ow = int(size * w / h) return (oh, ow) def __call__(self, image, target): size = self.get_size(image.size) image = F.resize(image, size) target = target.resize(image.size) return image, target class RandomHorizontalFlip(object): def __init__(self, prob=0.5): self.prob = prob def __call__(self, image, target): if random.random() < self.prob: image = F.hflip(image) target = target.transpose(0) return image, target class ToTensor(object): def __call__(self, image, target): return F.to_tensor(image), target class Normalize(object): def __init__(self, mean, std, to_bgr255=True): self.mean = mean self.std = std self.to_bgr255 = to_bgr255 def __call__(self, image, target): if self.to_bgr255: image = image[[2, 1, 0]] * 255 image = F.normalize(image, mean=self.mean, std=self.std) return image, target ================================================ FILE: maskrcnn_benchmark/engine/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. ================================================ FILE: maskrcnn_benchmark/engine/inference.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
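# Flow overview: compute_on_dataset() runs the model under torch.no_grad() and
# returns {image_id: BoxList}; with multiple processes each one holds only a
# shard of the dataset, so the per-GPU dicts are all-gathered and merged back
# into a single list ordered by image id before evaluate() is called.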
import logging import time import os import torch from tqdm import tqdm from maskrcnn_benchmark.data.datasets.evaluation import evaluate from ..utils.comm import is_main_process, get_world_size from ..utils.comm import all_gather from ..utils.comm import synchronize from ..utils.timer import Timer, get_time_str def compute_on_dataset(model, data_loader, device, timer=None): model.eval() results_dict = {} cpu_device = torch.device("cpu") for _, batch in enumerate(tqdm(data_loader)): images, targets, image_ids = batch images = images.to(device) with torch.no_grad(): if timer: timer.tic() output = model(images) if timer: torch.cuda.synchronize() timer.toc() output = [o.to(cpu_device) for o in output] results_dict.update( {img_id: result for img_id, result in zip(image_ids, output)} ) return results_dict def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): all_predictions = all_gather(predictions_per_gpu) if not is_main_process(): return # merge the list of dicts predictions = {} for p in all_predictions: predictions.update(p) # convert a dict where the key is the index in a list image_ids = list(sorted(predictions.keys())) if len(image_ids) != image_ids[-1] + 1: logger = logging.getLogger("maskrcnn_benchmark.inference") logger.warning( "Number of images that were gathered from multiple processes is not " "a contiguous set. Some images might be missing from the evaluation" ) # convert to a list predictions = [predictions[i] for i in image_ids] return predictions def inference( model, data_loader, dataset_name, iou_types=("bbox",), box_only=False, device="cuda", expected_results=(), expected_results_sigma_tol=4, output_folder=None, ): # convert to a torch.device for efficiency device = torch.device(device) num_devices = get_world_size() logger = logging.getLogger("maskrcnn_benchmark.inference") dataset = data_loader.dataset logger.info("Start evaluation on {} dataset({} images).".format(dataset_name, len(dataset))) total_timer = Timer() inference_timer = Timer() total_timer.tic() predictions = compute_on_dataset(model, data_loader, device, inference_timer) # wait for all processes to complete before measuring the time synchronize() total_time = total_timer.toc() total_time_str = get_time_str(total_time) logger.info( "Total run time: {} ({} s / img per device, on {} devices)".format( total_time_str, total_time * num_devices / len(dataset), num_devices ) ) total_infer_time = get_time_str(inference_timer.total_time) logger.info( "Model inference time: {} ({} s / img per device, on {} devices)".format( total_infer_time, inference_timer.total_time * num_devices / len(dataset), num_devices, ) ) predictions = _accumulate_predictions_from_multiple_gpus(predictions) if not is_main_process(): return if output_folder: torch.save(predictions, os.path.join(output_folder, "predictions.pth")) extra_args = dict( box_only=box_only, iou_types=iou_types, expected_results=expected_results, expected_results_sigma_tol=expected_results_sigma_tol, ) return evaluate(dataset=dataset, predictions=predictions, output_folder=output_folder, **extra_args) ================================================ FILE: maskrcnn_benchmark/engine/trainer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
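# Note on reduce_loss_dict() below: losses are combined with dist.reduce(dst=0),
# so only rank 0 ends up holding the average over all processes; the dict
# returned on other ranks still contains their local, unreduced values and is
# only meaningful for logging on the main process.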
import datetime import logging import time import torch import torch.distributed as dist from maskrcnn_benchmark.utils.comm import get_world_size, is_pytorch_1_1_0_or_later from maskrcnn_benchmark.utils.metric_logger import MetricLogger def reduce_loss_dict(loss_dict): """ Reduce the loss dictionary from all processes so that process with rank 0 has the averaged results. Returns a dict with the same fields as loss_dict, after reduction. """ world_size = get_world_size() if world_size < 2: return loss_dict with torch.no_grad(): loss_names = [] all_losses = [] for k in sorted(loss_dict.keys()): loss_names.append(k) all_losses.append(loss_dict[k]) all_losses = torch.stack(all_losses, dim=0) dist.reduce(all_losses, dst=0) if dist.get_rank() == 0: # only main process gets accumulated, so only divide by # world_size in this case all_losses /= world_size reduced_losses = {k: v for k, v in zip(loss_names, all_losses)} return reduced_losses def do_train( model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() pytorch_1_1_0_or_later = is_pytorch_1_1_0_or_later() for iteration, (images, targets, _) in enumerate(data_loader, start_iter): data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration # in pytorch >= 1.1.0, scheduler.step() should be run after optimizer.step() if not pytorch_1_1_0_or_later: scheduler.step() images = images.to(device) targets = [target.to(device) for target in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() if pytorch_1_1_0_or_later: scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join( [ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ] ).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, ) ) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info( "Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter) ) ) ================================================ FILE: maskrcnn_benchmark/layers/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
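# Usage sketch for the exported ops (illustrative values; nms expects
# [x1, y1, x2, y2] boxes plus scores and returns the indices of kept boxes):
#
#   import torch
#   from maskrcnn_benchmark.layers import nms
#
#   boxes = torch.tensor([[0., 0., 10., 10.], [1., 1., 11., 11.]])
#   scores = torch.tensor([0.9, 0.8])
#   keep = nms(boxes, scores, 0.5)  # suppresses the second, overlapping box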
import torch from .batch_norm import FrozenBatchNorm2d from .misc import Conv2d from .misc import ConvTranspose2d from .misc import BatchNorm2d from .misc import interpolate from .nms import nms from .roi_align import ROIAlign from .roi_align import roi_align from .roi_pool import ROIPool from .roi_pool import roi_pool from .smooth_l1_loss import smooth_l1_loss from .sigmoid_focal_loss import SigmoidFocalLoss from .iou_loss import IOULoss from .scale import Scale __all__ = ["nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", "smooth_l1_loss", "Conv2d", "ConvTranspose2d", "interpolate", "BatchNorm2d", "FrozenBatchNorm2d", "SigmoidFocalLoss", "IOULoss", "Scale"] ================================================ FILE: maskrcnn_benchmark/layers/_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import glob import os.path import torch try: from torch.utils.cpp_extension import load as load_ext from torch.utils.cpp_extension import CUDA_HOME except ImportError: raise ImportError("The cpp layer extensions requires PyTorch 0.4 or higher") def _load_C_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) this_dir = os.path.dirname(this_dir) this_dir = os.path.join(this_dir, "csrc") main_file = glob.glob(os.path.join(this_dir, "*.cpp")) source_cpu = glob.glob(os.path.join(this_dir, "cpu", "*.cpp")) source_cuda = glob.glob(os.path.join(this_dir, "cuda", "*.cu")) source = main_file + source_cpu extra_cflags = [] if torch.cuda.is_available() and CUDA_HOME is not None: source.extend(source_cuda) extra_cflags = ["-DWITH_CUDA"] source = [os.path.join(this_dir, s) for s in source] extra_include_paths = [this_dir] return load_ext( "torchvision", source, extra_cflags=extra_cflags, extra_include_paths=extra_include_paths, ) _C = _load_C_extensions() ================================================ FILE: maskrcnn_benchmark/layers/batch_norm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
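# FrozenBatchNorm2d folds the fixed statistics into one affine transform:
# y = (x - running_mean) / sqrt(running_var) * weight + bias, computed below as
# y = x * scale + shift with scale = weight * rsqrt(running_var). No eps is
# added, so the op relies on running_var being strictly positive.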
import torch from torch import nn class FrozenBatchNorm2d(nn.Module): """ BatchNorm2d where the batch statistics and the affine parameters are fixed """ def __init__(self, n): super(FrozenBatchNorm2d, self).__init__() self.register_buffer("weight", torch.ones(n)) self.register_buffer("bias", torch.zeros(n)) self.register_buffer("running_mean", torch.zeros(n)) self.register_buffer("running_var", torch.ones(n)) def forward(self, x): scale = self.weight * self.running_var.rsqrt() bias = self.bias - self.running_mean * scale scale = scale.reshape(1, -1, 1, 1) bias = bias.reshape(1, -1, 1, 1) return x * scale + bias ================================================ FILE: maskrcnn_benchmark/layers/iou_loss.py ================================================ import torch from torch import nn class IOULoss(nn.Module): def __init__(self, loc_loss_type): super(IOULoss, self).__init__() self.loc_loss_type = loc_loss_type def forward(self, pred, target, weight=None): pred_left = pred[:, 0] pred_top = pred[:, 1] pred_right = pred[:, 2] pred_bottom = pred[:, 3] target_left = target[:, 0] target_top = target[:, 1] target_right = target[:, 2] target_bottom = target[:, 3] target_area = (target_left + target_right) * \ (target_top + target_bottom) pred_area = (pred_left + pred_right) * \ (pred_top + pred_bottom) w_intersect = torch.min(pred_left, target_left) + torch.min(pred_right, target_right) g_w_intersect = torch.max(pred_left, target_left) + torch.max( pred_right, target_right) h_intersect = torch.min(pred_bottom, target_bottom) + torch.min(pred_top, target_top) g_h_intersect = torch.max(pred_bottom, target_bottom) + torch.max(pred_top, target_top) ac_uion = g_w_intersect * g_h_intersect + 1e-7 area_intersect = w_intersect * h_intersect area_union = target_area + pred_area - area_intersect ious = (area_intersect + 1.0) / (area_union + 1.0) gious = ious - (ac_uion - area_union) / ac_uion if self.loc_loss_type == 'iou': losses = -torch.log(ious) elif self.loc_loss_type == 'linear_iou': losses = 1 - ious elif self.loc_loss_type == 'giou': losses = 1 - gious else: raise NotImplementedError if weight is not None and weight.sum() > 0: return (losses * weight).sum() / weight.sum() else: assert losses.numel() != 0 return losses.mean() ================================================ FILE: maskrcnn_benchmark/layers/misc.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ helper class that supports empty tensors on some nn functions. Ideally, add support directly in PyTorch to empty tensors in those functions. 
This can be removed once https://github.com/pytorch/pytorch/issues/12013 is implemented """ import math import torch from torch.nn.modules.utils import _ntuple class _NewEmptyTensorOp(torch.autograd.Function): @staticmethod def forward(ctx, x, new_shape): ctx.shape = x.shape return x.new_empty(new_shape) @staticmethod def backward(ctx, grad): shape = ctx.shape return _NewEmptyTensorOp.apply(grad, shape), None class Conv2d(torch.nn.Conv2d): def forward(self, x): if x.numel() > 0: return super(Conv2d, self).forward(x) # get output shape output_shape = [ (i + 2 * p - (di * (k - 1) + 1)) // d + 1 for i, p, di, k, d in zip( x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride ) ] output_shape = [x.shape[0], self.weight.shape[0]] + output_shape return _NewEmptyTensorOp.apply(x, output_shape) class ConvTranspose2d(torch.nn.ConvTranspose2d): def forward(self, x): if x.numel() > 0: return super(ConvTranspose2d, self).forward(x) # get output shape output_shape = [ (i - 1) * d - 2 * p + (di * (k - 1) + 1) + op for i, p, di, k, d, op in zip( x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride, self.output_padding, ) ] output_shape = [x.shape[0], self.bias.shape[0]] + output_shape return _NewEmptyTensorOp.apply(x, output_shape) class BatchNorm2d(torch.nn.BatchNorm2d): def forward(self, x): if x.numel() > 0: return super(BatchNorm2d, self).forward(x) # get output shape output_shape = x.shape return _NewEmptyTensorOp.apply(x, output_shape) def interpolate( input, size=None, scale_factor=None, mode="nearest", align_corners=None ): if input.numel() > 0: return torch.nn.functional.interpolate( input, size, scale_factor, mode, align_corners ) def _check_size_scale_factor(dim): if size is None and scale_factor is None: raise ValueError("either size or scale_factor should be defined") if size is not None and scale_factor is not None: raise ValueError("only one of size or scale_factor should be defined") if ( scale_factor is not None and isinstance(scale_factor, tuple) and len(scale_factor) != dim ): raise ValueError( "scale_factor shape must match input shape. " "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor)) ) def _output_size(dim): _check_size_scale_factor(dim) if size is not None: return size scale_factors = _ntuple(dim)(scale_factor) # math.floor might return float in py2.7 return [ int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim) ] output_shape = tuple(_output_size(2)) output_shape = input.shape[:-2] + output_shape return _NewEmptyTensorOp.apply(input, output_shape) ================================================ FILE: maskrcnn_benchmark/layers/nms.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # from ._utils import _C from maskrcnn_benchmark import _C nms = _C.nms # nms.__doc__ = """ # This function performs Non-maximum suppresion""" ================================================ FILE: maskrcnn_benchmark/layers/roi_align.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
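# Input convention: `rois` is an (N, 5) tensor of [batch_index, x1, y1, x2, y2]
# rows in input-image coordinates; spatial_scale maps them onto the feature
# map, and sampling_ratio sets how many bilinear samples are averaged per
# output bin (non-positive values pick an adaptive count).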
import torch from torch import nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from maskrcnn_benchmark import _C class _ROIAlign(Function): @staticmethod def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): ctx.save_for_backward(roi) ctx.output_size = _pair(output_size) ctx.spatial_scale = spatial_scale ctx.sampling_ratio = sampling_ratio ctx.input_shape = input.size() output = _C.roi_align_forward( input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio ) return output @staticmethod @once_differentiable def backward(ctx, grad_output): rois, = ctx.saved_tensors output_size = ctx.output_size spatial_scale = ctx.spatial_scale sampling_ratio = ctx.sampling_ratio bs, ch, h, w = ctx.input_shape grad_input = _C.roi_align_backward( grad_output, rois, spatial_scale, output_size[0], output_size[1], bs, ch, h, w, sampling_ratio, ) return grad_input, None, None, None, None roi_align = _ROIAlign.apply class ROIAlign(nn.Module): def __init__(self, output_size, spatial_scale, sampling_ratio): super(ROIAlign, self).__init__() self.output_size = output_size self.spatial_scale = spatial_scale self.sampling_ratio = sampling_ratio def forward(self, input, rois): return roi_align( input, rois, self.output_size, self.spatial_scale, self.sampling_ratio ) def __repr__(self): tmpstr = self.__class__.__name__ + "(" tmpstr += "output_size=" + str(self.output_size) tmpstr += ", spatial_scale=" + str(self.spatial_scale) tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) tmpstr += ")" return tmpstr ================================================ FILE: maskrcnn_benchmark/layers/roi_pool.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
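# ROIPool differs from ROIAlign above in that it quantizes each ROI to the
# feature-map grid and takes a hard max per bin (the argmax is saved for the
# backward pass), whereas ROIAlign samples bilinearly without quantization;
# Mask R-CNN prefers ROIAlign for the better localization this buys.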
import torch from torch import nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from maskrcnn_benchmark import _C class _ROIPool(Function): @staticmethod def forward(ctx, input, roi, output_size, spatial_scale): ctx.output_size = _pair(output_size) ctx.spatial_scale = spatial_scale ctx.input_shape = input.size() output, argmax = _C.roi_pool_forward( input, roi, spatial_scale, output_size[0], output_size[1] ) ctx.save_for_backward(input, roi, argmax) return output @staticmethod @once_differentiable def backward(ctx, grad_output): input, rois, argmax = ctx.saved_tensors output_size = ctx.output_size spatial_scale = ctx.spatial_scale bs, ch, h, w = ctx.input_shape grad_input = _C.roi_pool_backward( grad_output, input, rois, argmax, spatial_scale, output_size[0], output_size[1], bs, ch, h, w, ) return grad_input, None, None, None roi_pool = _ROIPool.apply class ROIPool(nn.Module): def __init__(self, output_size, spatial_scale): super(ROIPool, self).__init__() self.output_size = output_size self.spatial_scale = spatial_scale def forward(self, input, rois): return roi_pool(input, rois, self.output_size, self.spatial_scale) def __repr__(self): tmpstr = self.__class__.__name__ + "(" tmpstr += "output_size=" + str(self.output_size) tmpstr += ", spatial_scale=" + str(self.spatial_scale) tmpstr += ")" return tmpstr ================================================ FILE: maskrcnn_benchmark/layers/scale.py ================================================ import torch from torch import nn class Scale(nn.Module): def __init__(self, init_value=1.0): super(Scale, self).__init__() self.scale = nn.Parameter(torch.FloatTensor([init_value])) def forward(self, input): return input * self.scale ================================================ FILE: maskrcnn_benchmark/layers/sigmoid_focal_loss.py ================================================ import torch from torch import nn from torch.autograd import Function from torch.autograd.function import once_differentiable from maskrcnn_benchmark import _C # TODO: Use JIT to replace CUDA implementation in the future. 
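# Both the CUDA and CPU paths below implement the focal loss of Lin et al.,
# FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t), where p_t is the predicted
# probability of the true class; targets are 1-based class ids, 0 denotes
# background, and only the negative term is applied to background positions.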
class _SigmoidFocalLoss(Function): @staticmethod def forward(ctx, logits, targets, gamma, alpha): ctx.save_for_backward(logits, targets) num_classes = logits.shape[1] ctx.num_classes = num_classes ctx.gamma = gamma ctx.alpha = alpha losses = _C.sigmoid_focalloss_forward( logits, targets, num_classes, gamma, alpha ) return losses @staticmethod @once_differentiable def backward(ctx, d_loss): logits, targets = ctx.saved_tensors num_classes = ctx.num_classes gamma = ctx.gamma alpha = ctx.alpha d_loss = d_loss.contiguous() d_logits = _C.sigmoid_focalloss_backward( logits, targets, d_loss, num_classes, gamma, alpha ) return d_logits, None, None, None, None sigmoid_focal_loss_cuda = _SigmoidFocalLoss.apply def sigmoid_focal_loss_cpu(logits, targets, gamma, alpha): num_classes = logits.shape[1] gamma = gamma[0] alpha = alpha[0] dtype = targets.dtype device = targets.device class_range = torch.arange(1, num_classes+1, dtype=dtype, device=device).unsqueeze(0) t = targets.unsqueeze(1) p = torch.sigmoid(logits) term1 = (1 - p) ** gamma * torch.log(p) term2 = p ** gamma * torch.log(1 - p) return -(t == class_range).float() * term1 * alpha - ((t != class_range) * (t >= 0)).float() * term2 * (1 - alpha) class SigmoidFocalLoss(nn.Module): def __init__(self, gamma, alpha): super(SigmoidFocalLoss, self).__init__() self.gamma = gamma self.alpha = alpha def forward(self, logits, targets): device = logits.device if logits.is_cuda: loss_func = sigmoid_focal_loss_cuda else: loss_func = sigmoid_focal_loss_cpu loss = loss_func(logits, targets, self.gamma, self.alpha) return loss.sum() def __repr__(self): tmpstr = self.__class__.__name__ + "(" tmpstr += "gamma=" + str(self.gamma) tmpstr += ", alpha=" + str(self.alpha) tmpstr += ")" return tmpstr ================================================ FILE: maskrcnn_benchmark/layers/smooth_l1_loss.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch # TODO maybe push this to nn? def smooth_l1_loss(input, target, beta=1. / 9, size_average=True): """ very similar to the smooth_l1_loss from pytorch, but with the extra beta parameter """ n = torch.abs(input - target) cond = n < beta loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) if size_average: return loss.mean() return loss.sum() ================================================ FILE: maskrcnn_benchmark/modeling/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/backbone/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .backbone import build_backbone from . import fbnet ================================================ FILE: maskrcnn_benchmark/modeling/backbone/backbone.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from collections import OrderedDict from torch import nn from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform from . import fpn as fpn_module from . import resnet from . 
import mobilenet @registry.BACKBONES.register("R-50-C4") @registry.BACKBONES.register("R-50-C5") @registry.BACKBONES.register("R-101-C4") @registry.BACKBONES.register("R-101-C5") def build_resnet_backbone(cfg): body = resnet.ResNet(cfg) model = nn.Sequential(OrderedDict([("body", body)])) model.out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS return model @registry.BACKBONES.register("R-50-FPN") @registry.BACKBONES.register("R-101-FPN") @registry.BACKBONES.register("R-152-FPN") def build_resnet_fpn_backbone(cfg): body = resnet.ResNet(cfg) in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS fpn = fpn_module.FPN( in_channels_list=[ in_channels_stage2, in_channels_stage2 * 2, in_channels_stage2 * 4, in_channels_stage2 * 8, ], out_channels=out_channels, conv_block=conv_with_kaiming_uniform( cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU ), top_blocks=fpn_module.LastLevelMaxPool(), ) model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) model.out_channels = out_channels return model @registry.BACKBONES.register("R-50-FPN-RETINANET") @registry.BACKBONES.register("R-101-FPN-RETINANET") def build_resnet_fpn_p3p7_backbone(cfg): body = resnet.ResNet(cfg) in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS in_channels_p6p7 = in_channels_stage2 * 8 if cfg.MODEL.RETINANET.USE_C5 \ else out_channels fpn = fpn_module.FPN( in_channels_list=[ 0, in_channels_stage2 * 2, in_channels_stage2 * 4, in_channels_stage2 * 8, ], out_channels=out_channels, conv_block=conv_with_kaiming_uniform( cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU ), top_blocks=fpn_module.LastLevelP6P7(in_channels_p6p7, out_channels), ) model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) model.out_channels = out_channels return model @registry.BACKBONES.register("MNV2-FPN-RETINANET") def build_mnv2_fpn_backbone(cfg): body = mobilenet.MobileNetV2(cfg) in_channels_stage2 = body.return_features_num_channels out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS fpn = fpn_module.FPN( in_channels_list=[ 0, in_channels_stage2[1], in_channels_stage2[2], in_channels_stage2[3], ], out_channels=out_channels, conv_block=conv_with_kaiming_uniform( cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU ), top_blocks=fpn_module.LastLevelP6P7(out_channels, out_channels), ) model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) model.out_channels = out_channels return model def build_backbone(cfg): assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \ "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format( cfg.MODEL.BACKBONE.CONV_BODY ) return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg) ================================================ FILE: maskrcnn_benchmark/modeling/backbone/fbnet.py ================================================ from __future__ import absolute_import, division, print_function, unicode_literals import copy import json import logging from collections import OrderedDict from . 
import ( fbnet_builder as mbuilder, fbnet_modeldef as modeldef, ) import torch.nn as nn from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.modeling.rpn import rpn from maskrcnn_benchmark.modeling import poolers logger = logging.getLogger(__name__) def create_builder(cfg): bn_type = cfg.MODEL.FBNET.BN_TYPE if bn_type == "gn": bn_type = (bn_type, cfg.GROUP_NORM.NUM_GROUPS) factor = cfg.MODEL.FBNET.SCALE_FACTOR arch = cfg.MODEL.FBNET.ARCH arch_def = cfg.MODEL.FBNET.ARCH_DEF if len(arch_def) > 0: arch_def = json.loads(arch_def) if arch in modeldef.MODEL_ARCH: if len(arch_def) > 0: assert ( arch_def == modeldef.MODEL_ARCH[arch] ), "Two architectures with the same name {},\n{},\n{}".format( arch, arch_def, modeldef.MODEL_ARCH[arch] ) arch_def = modeldef.MODEL_ARCH[arch] else: assert arch_def is not None and len(arch_def) > 0 arch_def = mbuilder.unify_arch_def(arch_def) rpn_stride = arch_def.get("rpn_stride", None) if rpn_stride is not None: assert ( cfg.MODEL.RPN.ANCHOR_STRIDE[0] == rpn_stride ), "Needs to set cfg.MODEL.RPN.ANCHOR_STRIDE to {}, got {}".format( rpn_stride, cfg.MODEL.RPN.ANCHOR_STRIDE ) width_divisor = cfg.MODEL.FBNET.WIDTH_DIVISOR dw_skip_bn = cfg.MODEL.FBNET.DW_CONV_SKIP_BN dw_skip_relu = cfg.MODEL.FBNET.DW_CONV_SKIP_RELU logger.info( "Building fbnet model with arch {} (without scaling):\n{}".format( arch, arch_def ) ) builder = mbuilder.FBNetBuilder( width_ratio=factor, bn_type=bn_type, width_divisor=width_divisor, dw_skip_bn=dw_skip_bn, dw_skip_relu=dw_skip_relu, ) return builder, arch_def def _get_trunk_cfg(arch_def): """ Get all stages except the last one """ num_stages = mbuilder.get_num_stages(arch_def) trunk_stages = arch_def.get("backbone", range(num_stages - 1)) ret = mbuilder.get_blocks(arch_def, stage_indices=trunk_stages) return ret class FBNetTrunk(nn.Module): def __init__( self, builder, arch_def, dim_in, ): super(FBNetTrunk, self).__init__() self.first = builder.add_first(arch_def["first"], dim_in=dim_in) trunk_cfg = _get_trunk_cfg(arch_def) self.stages = builder.add_blocks(trunk_cfg["stages"]) # return features for each stage def forward(self, x): y = self.first(x) y = self.stages(y) ret = [y] return ret @registry.BACKBONES.register("FBNet") def add_conv_body(cfg, dim_in=3): builder, arch_def = create_builder(cfg) body = FBNetTrunk(builder, arch_def, dim_in) model = nn.Sequential(OrderedDict([("body", body)])) model.out_channels = builder.last_depth return model def _get_rpn_stage(arch_def, num_blocks): rpn_stage = arch_def.get("rpn") ret = mbuilder.get_blocks(arch_def, stage_indices=rpn_stage) if num_blocks > 0: logger.warn('Use last {} blocks in {} as rpn'.format(num_blocks, ret)) block_count = len(ret["stages"]) assert num_blocks <= block_count, "use block {}, block count {}".format( num_blocks, block_count ) blocks = range(block_count - num_blocks, block_count) ret = mbuilder.get_blocks(ret, block_indices=blocks) return ret["stages"] class FBNetRPNHead(nn.Module): def __init__( self, cfg, in_channels, builder, arch_def, ): super(FBNetRPNHead, self).__init__() assert in_channels == builder.last_depth rpn_bn_type = cfg.MODEL.FBNET.RPN_BN_TYPE if len(rpn_bn_type) > 0: builder.bn_type = rpn_bn_type use_blocks = cfg.MODEL.FBNET.RPN_HEAD_BLOCKS stages = _get_rpn_stage(arch_def, use_blocks) self.head = builder.add_blocks(stages) self.out_channels = builder.last_depth def forward(self, x): x = [self.head(y) for y in x] return x @registry.RPN_HEADS.register("FBNet.rpn_head") def add_rpn_head(cfg, in_channels, num_anchors): builder, model_arch = 
create_builder(cfg) builder.last_depth = in_channels assert in_channels == builder.last_depth # builder.name_prefix = "[rpn]" rpn_feature = FBNetRPNHead(cfg, in_channels, builder, model_arch) rpn_regressor = rpn.RPNHeadConvRegressor( cfg, rpn_feature.out_channels, num_anchors) return nn.Sequential(rpn_feature, rpn_regressor) def _get_head_stage(arch, head_name, blocks): # use default name 'head' if the specific name 'head_name' does not existed if head_name not in arch: head_name = "head" head_stage = arch.get(head_name) ret = mbuilder.get_blocks(arch, stage_indices=head_stage, block_indices=blocks) return ret["stages"] # name mapping for head names in arch def and cfg ARCH_CFG_NAME_MAPPING = { "bbox": "ROI_BOX_HEAD", "kpts": "ROI_KEYPOINT_HEAD", "mask": "ROI_MASK_HEAD", } class FBNetROIHead(nn.Module): def __init__( self, cfg, in_channels, builder, arch_def, head_name, use_blocks, stride_init, last_layer_scale, ): super(FBNetROIHead, self).__init__() assert in_channels == builder.last_depth assert isinstance(use_blocks, list) head_cfg_name = ARCH_CFG_NAME_MAPPING[head_name] self.pooler = poolers.make_pooler(cfg, head_cfg_name) stage = _get_head_stage(arch_def, head_name, use_blocks) assert stride_init in [0, 1, 2] if stride_init != 0: stage[0]["block"][3] = stride_init blocks = builder.add_blocks(stage) last_info = copy.deepcopy(arch_def["last"]) last_info[1] = last_layer_scale last = builder.add_last(last_info) self.head = nn.Sequential(OrderedDict([ ("blocks", blocks), ("last", last) ])) self.out_channels = builder.last_depth def forward(self, x, proposals): x = self.pooler(x, proposals) x = self.head(x) return x @registry.ROI_BOX_FEATURE_EXTRACTORS.register("FBNet.roi_head") def add_roi_head(cfg, in_channels): builder, model_arch = create_builder(cfg) builder.last_depth = in_channels # builder.name_prefix = "_[bbox]_" return FBNetROIHead( cfg, in_channels, builder, model_arch, head_name="bbox", use_blocks=cfg.MODEL.FBNET.DET_HEAD_BLOCKS, stride_init=cfg.MODEL.FBNET.DET_HEAD_STRIDE, last_layer_scale=cfg.MODEL.FBNET.DET_HEAD_LAST_SCALE, ) @registry.ROI_KEYPOINT_FEATURE_EXTRACTORS.register("FBNet.roi_head_keypoints") def add_roi_head_keypoints(cfg, in_channels): builder, model_arch = create_builder(cfg) builder.last_depth = in_channels # builder.name_prefix = "_[kpts]_" return FBNetROIHead( cfg, in_channels, builder, model_arch, head_name="kpts", use_blocks=cfg.MODEL.FBNET.KPTS_HEAD_BLOCKS, stride_init=cfg.MODEL.FBNET.KPTS_HEAD_STRIDE, last_layer_scale=cfg.MODEL.FBNET.KPTS_HEAD_LAST_SCALE, ) @registry.ROI_MASK_FEATURE_EXTRACTORS.register("FBNet.roi_head_mask") def add_roi_head_mask(cfg, in_channels): builder, model_arch = create_builder(cfg) builder.last_depth = in_channels # builder.name_prefix = "_[mask]_" return FBNetROIHead( cfg, in_channels, builder, model_arch, head_name="mask", use_blocks=cfg.MODEL.FBNET.MASK_HEAD_BLOCKS, stride_init=cfg.MODEL.FBNET.MASK_HEAD_STRIDE, last_layer_scale=cfg.MODEL.FBNET.MASK_HEAD_LAST_SCALE, ) ================================================ FILE: maskrcnn_benchmark/modeling/backbone/fbnet_builder.py ================================================ """ FBNet model builder """ from __future__ import absolute_import, division, print_function, unicode_literals import copy import logging import math from collections import OrderedDict import torch import torch.nn as nn from maskrcnn_benchmark.layers import ( BatchNorm2d, Conv2d, FrozenBatchNorm2d, interpolate, ) from maskrcnn_benchmark.layers.misc import _NewEmptyTensorOp logger = 
logging.getLogger(__name__) def _py2_round(x): return math.floor(x + 0.5) if x >= 0.0 else math.ceil(x - 0.5) def _get_divisible_by(num, divisible_by, min_val): ret = int(num) if divisible_by > 0 and num % divisible_by != 0: ret = int((_py2_round(num / divisible_by) or min_val) * divisible_by) return ret PRIMITIVES = { "skip": lambda C_in, C_out, expansion, stride, **kwargs: Identity( C_in, C_out, stride ), "ir_k3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, **kwargs ), "ir_k5": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, kernel=5, **kwargs ), "ir_k7": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, kernel=7, **kwargs ), "ir_k1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, kernel=1, **kwargs ), "shuffle": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, shuffle_type="mid", pw_group=4, **kwargs ), "basic_block": lambda C_in, C_out, expansion, stride, **kwargs: CascadeConv3x3( C_in, C_out, stride ), "shift_5x5": lambda C_in, C_out, expansion, stride, **kwargs: ShiftBlock5x5( C_in, C_out, expansion, stride ), # layer search 2 "ir_k3_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=3, **kwargs ), "ir_k3_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=3, **kwargs ), "ir_k3_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=3, **kwargs ), "ir_k3_s4": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 4, stride, kernel=3, shuffle_type="mid", pw_group=4, **kwargs ), "ir_k5_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=5, **kwargs ), "ir_k5_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=5, **kwargs ), "ir_k5_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=5, **kwargs ), "ir_k5_s4": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 4, stride, kernel=5, shuffle_type="mid", pw_group=4, **kwargs ), # layer search se "ir_k3_e1_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=3, se=True, **kwargs ), "ir_k3_e3_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=3, se=True, **kwargs ), "ir_k3_e6_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=3, se=True, **kwargs ), "ir_k3_s4_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 4, stride, kernel=3, shuffle_type="mid", pw_group=4, se=True, **kwargs ), "ir_k5_e1_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=5, se=True, **kwargs ), "ir_k5_e3_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=5, se=True, **kwargs ), "ir_k5_e6_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=5, se=True, **kwargs ), "ir_k5_s4_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 4, stride, kernel=5, shuffle_type="mid", pw_group=4, se=True, **kwargs ), # layer search 3 (in addition to layer search 2) "ir_k3_s2": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=3, shuffle_type="mid", 
pw_group=2, **kwargs ), "ir_k5_s2": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=5, shuffle_type="mid", pw_group=2, **kwargs ), "ir_k3_s2_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=3, shuffle_type="mid", pw_group=2, se=True, **kwargs ), "ir_k5_s2_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=5, shuffle_type="mid", pw_group=2, se=True, **kwargs ), # layer search 4 (in addition to layer search 3) "ir_k3_sep": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, kernel=3, cdw=True, **kwargs ), "ir_k33_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=3, cdw=True, **kwargs ), "ir_k33_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=3, cdw=True, **kwargs ), "ir_k33_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=3, cdw=True, **kwargs ), # layer search 5 (in addition to layer search 4) "ir_k7_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=7, **kwargs ), "ir_k7_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=7, **kwargs ), "ir_k7_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=7, **kwargs ), "ir_k7_sep": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, expansion, stride, kernel=7, cdw=True, **kwargs ), "ir_k7_sep_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 1, stride, kernel=7, cdw=True, **kwargs ), "ir_k7_sep_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 3, stride, kernel=7, cdw=True, **kwargs ), "ir_k7_sep_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( C_in, C_out, 6, stride, kernel=7, cdw=True, **kwargs ), } class Identity(nn.Module): def __init__(self, C_in, C_out, stride): super(Identity, self).__init__() self.conv = ( ConvBNRelu( C_in, C_out, kernel=1, stride=stride, pad=0, no_bias=1, use_relu="relu", bn_type="bn", ) if C_in != C_out or stride != 1 else None ) def forward(self, x): if self.conv: out = self.conv(x) else: out = x return out class CascadeConv3x3(nn.Sequential): def __init__(self, C_in, C_out, stride): assert stride in [1, 2] ops = [ Conv2d(C_in, C_in, 3, stride, 1, bias=False), BatchNorm2d(C_in), nn.ReLU(inplace=True), Conv2d(C_in, C_out, 3, 1, 1, bias=False), BatchNorm2d(C_out), ] super(CascadeConv3x3, self).__init__(*ops) self.res_connect = (stride == 1) and (C_in == C_out) def forward(self, x): y = super(CascadeConv3x3, self).forward(x) if self.res_connect: y += x return y class Shift(nn.Module): def __init__(self, C, kernel_size, stride, padding): super(Shift, self).__init__() self.C = C kernel = torch.zeros((C, 1, kernel_size, kernel_size), dtype=torch.float32) ch_idx = 0 assert stride in [1, 2] self.stride = stride self.padding = padding self.kernel_size = kernel_size self.dilation = 1 hks = kernel_size // 2 ksq = kernel_size ** 2 for i in range(kernel_size): for j in range(kernel_size): if i == hks and j == hks: num_ch = C // ksq + C % ksq else: num_ch = C // ksq kernel[ch_idx : ch_idx + num_ch, 0, i, j] = 1 ch_idx += num_ch self.register_parameter("bias", None) self.kernel = nn.Parameter(kernel, requires_grad=False) def forward(self, x): if x.numel() > 0: return nn.functional.conv2d( x, self.kernel, self.bias, 
(self.stride, self.stride), (self.padding, self.padding), self.dilation, self.C, # groups ) output_shape = [ (i + 2 * p - (di * (k - 1) + 1)) // d + 1 for i, p, di, k, d in zip( x.shape[-2:], (self.padding, self.dilation), (self.dilation, self.dilation), (self.kernel_size, self.kernel_size), (self.stride, self.stride), ) ] output_shape = [x.shape[0], self.C] + output_shape return _NewEmptyTensorOp.apply(x, output_shape) class ShiftBlock5x5(nn.Sequential): def __init__(self, C_in, C_out, expansion, stride): assert stride in [1, 2] self.res_connect = (stride == 1) and (C_in == C_out) C_mid = _get_divisible_by(C_in * expansion, 8, 8) ops = [ # pw Conv2d(C_in, C_mid, 1, 1, 0, bias=False), BatchNorm2d(C_mid), nn.ReLU(inplace=True), # shift Shift(C_mid, 5, stride, 2), # pw-linear Conv2d(C_mid, C_out, 1, 1, 0, bias=False), BatchNorm2d(C_out), ] super(ShiftBlock5x5, self).__init__(*ops) def forward(self, x): y = super(ShiftBlock5x5, self).forward(x) if self.res_connect: y += x return y class ChannelShuffle(nn.Module): def __init__(self, groups): super(ChannelShuffle, self).__init__() self.groups = groups def forward(self, x): """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]""" N, C, H, W = x.size() g = self.groups assert C % g == 0, "Incompatible group size {} for input channel {}".format( g, C ) return ( x.view(N, g, int(C / g), H, W) .permute(0, 2, 1, 3, 4) .contiguous() .view(N, C, H, W) ) class ConvBNRelu(nn.Sequential): def __init__( self, input_depth, output_depth, kernel, stride, pad, no_bias, use_relu, bn_type, group=1, *args, **kwargs ): super(ConvBNRelu, self).__init__() assert use_relu in ["relu", None] if isinstance(bn_type, (list, tuple)): assert len(bn_type) == 2 assert bn_type[0] == "gn" gn_group = bn_type[1] bn_type = bn_type[0] assert bn_type in ["bn", "af", "gn", None] assert stride in [1, 2, 4] op = Conv2d( input_depth, output_depth, kernel_size=kernel, stride=stride, padding=pad, bias=not no_bias, groups=group, *args, **kwargs ) nn.init.kaiming_normal_(op.weight, mode="fan_out", nonlinearity="relu") if op.bias is not None: nn.init.constant_(op.bias, 0.0) self.add_module("conv", op) if bn_type == "bn": bn_op = BatchNorm2d(output_depth) elif bn_type == "gn": bn_op = nn.GroupNorm(num_groups=gn_group, num_channels=output_depth) elif bn_type == "af": bn_op = FrozenBatchNorm2d(output_depth) if bn_type is not None: self.add_module("bn", bn_op) if use_relu == "relu": self.add_module("relu", nn.ReLU(inplace=True)) class SEModule(nn.Module): reduction = 4 def __init__(self, C): super(SEModule, self).__init__() mid = max(C // self.reduction, 8) conv1 = Conv2d(C, mid, 1, 1, 0) conv2 = Conv2d(mid, C, 1, 1, 0) self.op = nn.Sequential( nn.AdaptiveAvgPool2d(1), conv1, nn.ReLU(inplace=True), conv2, nn.Sigmoid() ) def forward(self, x): return x * self.op(x) class Upsample(nn.Module): def __init__(self, scale_factor, mode, align_corners=None): super(Upsample, self).__init__() self.scale = scale_factor self.mode = mode self.align_corners = align_corners def forward(self, x): return interpolate( x, scale_factor=self.scale, mode=self.mode, align_corners=self.align_corners ) def _get_upsample_op(stride): assert ( stride in [1, 2, 4] or stride in [-1, -2, -4] or (isinstance(stride, tuple) and all(x in [-1, -2, -4] for x in stride)) ) scales = stride ret = None if isinstance(stride, tuple) or stride < 0: scales = [-x for x in stride] if isinstance(stride, tuple) else -stride stride = 1 ret = Upsample(scale_factor=scales, mode="nearest", align_corners=None) return ret, stride 
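# IRFBlock is the MobileNetV2-style inverted residual: a 1x1 pointwise
# expansion, a depthwise convolution (cascaded into two when cdw=True, or
# preceded by upsampling when the stride is negative), and a 1x1 linear
# projection, with a residual connection when stride == 1 and the input and
# output depths match.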
class IRFBlock(nn.Module): def __init__( self, input_depth, output_depth, expansion, stride, bn_type="bn", kernel=3, width_divisor=1, shuffle_type=None, pw_group=1, se=False, cdw=False, dw_skip_bn=False, dw_skip_relu=False, ): super(IRFBlock, self).__init__() assert kernel in [1, 3, 5, 7], kernel self.use_res_connect = stride == 1 and input_depth == output_depth self.output_depth = output_depth mid_depth = int(input_depth * expansion) mid_depth = _get_divisible_by(mid_depth, width_divisor, width_divisor) # pw self.pw = ConvBNRelu( input_depth, mid_depth, kernel=1, stride=1, pad=0, no_bias=1, use_relu="relu", bn_type=bn_type, group=pw_group, ) # negative stride to do upsampling self.upscale, stride = _get_upsample_op(stride) # dw if kernel == 1: self.dw = nn.Sequential() elif cdw: dw1 = ConvBNRelu( mid_depth, mid_depth, kernel=kernel, stride=stride, pad=(kernel // 2), group=mid_depth, no_bias=1, use_relu="relu", bn_type=bn_type, ) dw2 = ConvBNRelu( mid_depth, mid_depth, kernel=kernel, stride=1, pad=(kernel // 2), group=mid_depth, no_bias=1, use_relu="relu" if not dw_skip_relu else None, bn_type=bn_type if not dw_skip_bn else None, ) self.dw = nn.Sequential(OrderedDict([("dw1", dw1), ("dw2", dw2)])) else: self.dw = ConvBNRelu( mid_depth, mid_depth, kernel=kernel, stride=stride, pad=(kernel // 2), group=mid_depth, no_bias=1, use_relu="relu" if not dw_skip_relu else None, bn_type=bn_type if not dw_skip_bn else None, ) # pw-linear self.pwl = ConvBNRelu( mid_depth, output_depth, kernel=1, stride=1, pad=0, no_bias=1, use_relu=None, bn_type=bn_type, group=pw_group, ) self.shuffle_type = shuffle_type if shuffle_type is not None: self.shuffle = ChannelShuffle(pw_group) self.se4 = SEModule(output_depth) if se else nn.Sequential() self.output_depth = output_depth def forward(self, x): y = self.pw(x) if self.shuffle_type == "mid": y = self.shuffle(y) if self.upscale is not None: y = self.upscale(y) y = self.dw(y) y = self.pwl(y) if self.use_res_connect: y += x y = self.se4(y) return y def _expand_block_cfg(block_cfg): assert isinstance(block_cfg, list) ret = [] for idx in range(block_cfg[2]): cur = copy.deepcopy(block_cfg) cur[2] = 1 cur[3] = 1 if idx >= 1 else cur[3] ret.append(cur) return ret def expand_stage_cfg(stage_cfg): """ For a single stage """ assert isinstance(stage_cfg, list) ret = [] for x in stage_cfg: ret += _expand_block_cfg(x) return ret def expand_stages_cfg(stage_cfgs): """ For a list of stages """ assert isinstance(stage_cfgs, list) ret = [] for x in stage_cfgs: ret.append(expand_stage_cfg(x)) return ret def _block_cfgs_to_list(block_cfgs): assert isinstance(block_cfgs, list) ret = [] for stage_idx, stage in enumerate(block_cfgs): stage = expand_stage_cfg(stage) for block_idx, block in enumerate(stage): cur = {"stage_idx": stage_idx, "block_idx": block_idx, "block": block} ret.append(cur) return ret def _add_to_arch(arch, info, name): """ arch = [{block_0}, {block_1}, ...] info = [ # stage 0 [ block0_info, block1_info, ... ], ... ] convert to: arch = [ { block_0, name: block0_info, }, { block_1, name: block1_info, }, ... 
] """ assert isinstance(arch, list) and all(isinstance(x, dict) for x in arch) assert isinstance(info, list) and all(isinstance(x, list) for x in info) idx = 0 for stage_idx, stage in enumerate(info): for block_idx, block in enumerate(stage): assert ( arch[idx]["stage_idx"] == stage_idx and arch[idx]["block_idx"] == block_idx ), "Index ({}, {}) does not match for block {}".format( stage_idx, block_idx, arch[idx] ) assert name not in arch[idx] arch[idx][name] = block idx += 1 def unify_arch_def(arch_def): """ unify the arch_def to: { ..., "arch": [ { "stage_idx": idx, "block_idx": idx, ... }, {}, ... ] } """ ret = copy.deepcopy(arch_def) assert "block_cfg" in arch_def and "stages" in arch_def["block_cfg"] assert "stages" not in ret # copy 'first', 'last' etc. inside arch_def['block_cfg'] to ret ret.update({x: arch_def["block_cfg"][x] for x in arch_def["block_cfg"]}) ret["stages"] = _block_cfgs_to_list(arch_def["block_cfg"]["stages"]) del ret["block_cfg"] assert "block_op_type" in arch_def _add_to_arch(ret["stages"], arch_def["block_op_type"], "block_op_type") del ret["block_op_type"] return ret def get_num_stages(arch_def): ret = 0 for x in arch_def["stages"]: ret = max(x["stage_idx"], ret) ret = ret + 1 return ret def get_blocks(arch_def, stage_indices=None, block_indices=None): ret = copy.deepcopy(arch_def) ret["stages"] = [] for block in arch_def["stages"]: keep = True if stage_indices not in (None, []) and block["stage_idx"] not in stage_indices: keep = False if block_indices not in (None, []) and block["block_idx"] not in block_indices: keep = False if keep: ret["stages"].append(block) return ret class FBNetBuilder(object): def __init__( self, width_ratio, bn_type="bn", width_divisor=1, dw_skip_bn=False, dw_skip_relu=False, ): self.width_ratio = width_ratio self.last_depth = -1 self.bn_type = bn_type self.width_divisor = width_divisor self.dw_skip_bn = dw_skip_bn self.dw_skip_relu = dw_skip_relu def add_first(self, stage_info, dim_in=3, pad=True): # stage_info: [c, s, kernel] assert len(stage_info) >= 2 channel = stage_info[0] stride = stage_info[1] out_depth = self._get_divisible_width(int(channel * self.width_ratio)) kernel = 3 if len(stage_info) > 2: kernel = stage_info[2] out = ConvBNRelu( dim_in, out_depth, kernel=kernel, stride=stride, pad=kernel // 2 if pad else 0, no_bias=1, use_relu="relu", bn_type=self.bn_type, ) self.last_depth = out_depth return out def add_blocks(self, blocks): """ blocks: [{}, {}, ...] 
""" assert isinstance(blocks, list) and all( isinstance(x, dict) for x in blocks ), blocks modules = OrderedDict() for block in blocks: stage_idx = block["stage_idx"] block_idx = block["block_idx"] block_op_type = block["block_op_type"] tcns = block["block"] n = tcns[2] assert n == 1 nnblock = self.add_ir_block(tcns, [block_op_type]) nn_name = "xif{}_{}".format(stage_idx, block_idx) assert nn_name not in modules modules[nn_name] = nnblock ret = nn.Sequential(modules) return ret def add_last(self, stage_info): """ skip last layer if channel_scale == 0 use the same output channel if channel_scale < 0 """ assert len(stage_info) == 2 channels = stage_info[0] channel_scale = stage_info[1] if channel_scale == 0.0: return nn.Sequential() if channel_scale > 0: last_channel = ( int(channels * self.width_ratio) if self.width_ratio > 1.0 else channels ) last_channel = int(last_channel * channel_scale) else: last_channel = int(self.last_depth * (-channel_scale)) last_channel = self._get_divisible_width(last_channel) if last_channel == 0: return nn.Sequential() dim_in = self.last_depth ret = ConvBNRelu( dim_in, last_channel, kernel=1, stride=1, pad=0, no_bias=1, use_relu="relu", bn_type=self.bn_type, ) self.last_depth = last_channel return ret # def add_final_pool(self, model, blob_in, kernel_size): # ret = model.AveragePool(blob_in, "final_avg", kernel=kernel_size, stride=1) # return ret def _add_ir_block( self, dim_in, dim_out, stride, expand_ratio, block_op_type, **kwargs ): ret = PRIMITIVES[block_op_type]( dim_in, dim_out, expansion=expand_ratio, stride=stride, bn_type=self.bn_type, width_divisor=self.width_divisor, dw_skip_bn=self.dw_skip_bn, dw_skip_relu=self.dw_skip_relu, **kwargs ) return ret, ret.output_depth def add_ir_block(self, tcns, block_op_types, **kwargs): t, c, n, s = tcns assert n == 1 out_depth = self._get_divisible_width(int(c * self.width_ratio)) dim_in = self.last_depth op, ret_depth = self._add_ir_block( dim_in, out_depth, stride=s, expand_ratio=t, block_op_type=block_op_types[0], **kwargs ) self.last_depth = ret_depth return op def _get_divisible_width(self, width): ret = _get_divisible_by(int(width), self.width_divisor, self.width_divisor) return ret ================================================ FILE: maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py ================================================ from __future__ import absolute_import, division, print_function, unicode_literals def add_archs(archs): global MODEL_ARCH for x in archs: assert x not in MODEL_ARCH, "Duplicated model name {} existed".format(x) MODEL_ARCH[x] = archs[x] MODEL_ARCH = { "default": { "block_op_type": [ # stage 0 ["ir_k3"], # stage 1 ["ir_k3"] * 2, # stage 2 ["ir_k3"] * 3, # stage 3 ["ir_k3"] * 7, # stage 4, bbox head ["ir_k3"] * 4, # stage 5, rpn ["ir_k3"] * 3, # stage 5, mask head ["ir_k3"] * 5, ], "block_cfg": { "first": [32, 2], "stages": [ # [t, c, n, s] # stage 0 [[1, 16, 1, 1]], # stage 1 [[6, 24, 2, 2]], # stage 2 [[6, 32, 3, 2]], # stage 3 [[6, 64, 4, 2], [6, 96, 3, 1]], # stage 4, bbox head [[4, 160, 1, 2], [6, 160, 2, 1], [6, 240, 1, 1]], # [[6, 160, 3, 2], [6, 320, 1, 1]], # stage 5, rpn head [[6, 96, 3, 1]], # stage 6, mask head [[4, 160, 1, 1], [6, 160, 3, 1], [3, 80, 1, -2]], ], # [c, channel_scale] "last": [0, 0.0], "backbone": [0, 1, 2, 3], "rpn": [5], "bbox": [4], "mask": [6], }, }, "xirb16d_dsmask": { "block_op_type": [ # stage 0 ["ir_k3"], # stage 1 ["ir_k3"] * 2, # stage 2 ["ir_k3"] * 3, # stage 3 ["ir_k3"] * 7, # stage 4, bbox head ["ir_k3"] * 4, # stage 5, mask head 
["ir_k3"] * 5, # stage 6, rpn ["ir_k3"] * 3, ], "block_cfg": { "first": [16, 2], "stages": [ # [t, c, n, s] # stage 0 [[1, 16, 1, 1]], # stage 1 [[6, 32, 2, 2]], # stage 2 [[6, 48, 3, 2]], # stage 3 [[6, 96, 4, 2], [6, 128, 3, 1]], # stage 4, bbox head [[4, 128, 1, 2], [6, 128, 2, 1], [6, 160, 1, 1]], # stage 5, mask head [[4, 128, 1, 2], [6, 128, 2, 1], [6, 128, 1, -2], [3, 64, 1, -2]], # stage 6, rpn head [[6, 128, 3, 1]], ], # [c, channel_scale] "last": [0, 0.0], "backbone": [0, 1, 2, 3], "rpn": [6], "bbox": [4], "mask": [5], }, }, "mobilenet_v2": { "block_op_type": [ # stage 0 ["ir_k3"], # stage 1 ["ir_k3"] * 2, # stage 2 ["ir_k3"] * 3, # stage 3 ["ir_k3"] * 7, # stage 4 ["ir_k3"] * 4, ], "block_cfg": { "first": [32, 2], "stages": [ # [t, c, n, s] # stage 0 [[1, 16, 1, 1]], # stage 1 [[6, 24, 2, 2]], # stage 2 [[6, 32, 3, 2]], # stage 3 [[6, 64, 4, 2], [6, 96, 3, 1]], # stage 4 [[6, 160, 3, 1], [6, 320, 1, 1]], ], # [c, channel_scale] "last": [0, 0.0], "backbone": [0, 1, 2, 3], "bbox": [4], }, }, } MODEL_ARCH_CHAM = { "cham_v1a": { "block_op_type": [ # stage 0 ["ir_k3"], # stage 1 ["ir_k7"] * 2, # stage 2 ["ir_k3"] * 5, # stage 3 ["ir_k5"] * 7 + ["ir_k3"] * 5, # stage 4, bbox head ["ir_k3"] * 5, # stage 5, rpn ["ir_k3"] * 3, ], "block_cfg": { "first": [32, 2], "stages": [ # [t, c, n, s] # stage 0 [[1, 24, 1, 1]], # stage 1 [[4, 48, 2, 2]], # stage 2 [[7, 64, 5, 2]], # stage 3 [[12, 56, 7, 2], [8, 88, 5, 1]], # stage 4, bbox head [[7, 152, 4, 2], [10, 104, 1, 1]], # stage 5, rpn head [[8, 88, 3, 1]], ], # [c, channel_scale] "last": [0, 0.0], "backbone": [0, 1, 2, 3], "rpn": [5], "bbox": [4], }, }, "cham_v2": { "block_op_type": [ # stage 0 ["ir_k3"], # stage 1 ["ir_k5"] * 4, # stage 2 ["ir_k7"] * 6, # stage 3 ["ir_k5"] * 3 + ["ir_k3"] * 6, # stage 4, bbox head ["ir_k3"] * 7, # stage 5, rpn ["ir_k3"] * 1, ], "block_cfg": { "first": [32, 2], "stages": [ # [t, c, n, s] # stage 0 [[1, 24, 1, 1]], # stage 1 [[8, 32, 4, 2]], # stage 2 [[5, 48, 6, 2]], # stage 3 [[9, 56, 3, 2], [6, 56, 6, 1]], # stage 4, bbox head [[2, 160, 6, 2], [6, 112, 1, 1]], # stage 5, rpn head [[6, 56, 1, 1]], ], # [c, channel_scale] "last": [0, 0.0], "backbone": [0, 1, 2, 3], "rpn": [5], "bbox": [4], }, }, } add_archs(MODEL_ARCH_CHAM) ================================================ FILE: maskrcnn_benchmark/modeling/backbone/fpn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch import torch.nn.functional as F from torch import nn class FPN(nn.Module): """ Module that adds FPN on top of a list of feature maps. 
The feature maps are currently supposed to be in increasing depth order, and must be consecutive """ def __init__( self, in_channels_list, out_channels, conv_block, top_blocks=None ): """ Arguments: in_channels_list (list[int]): number of channels for each feature map that will be fed out_channels (int): number of channels of the FPN representation top_blocks (nn.Module or None): if provided, an extra operation will be performed on the output of the last (smallest resolution) FPN output, and the result will extend the result list """ super(FPN, self).__init__() self.inner_blocks = [] self.layer_blocks = [] for idx, in_channels in enumerate(in_channels_list, 1): inner_block = "fpn_inner{}".format(idx) layer_block = "fpn_layer{}".format(idx) if in_channels == 0: continue inner_block_module = conv_block(in_channels, out_channels, 1) layer_block_module = conv_block(out_channels, out_channels, 3, 1) self.add_module(inner_block, inner_block_module) self.add_module(layer_block, layer_block_module) self.inner_blocks.append(inner_block) self.layer_blocks.append(layer_block) self.top_blocks = top_blocks def forward(self, x): """ Arguments: x (list[Tensor]): feature maps for each feature level. Returns: results (tuple[Tensor]): feature maps after FPN layers. They are ordered from highest resolution first. """ last_inner = getattr(self, self.inner_blocks[-1])(x[-1]) results = [] results.append(getattr(self, self.layer_blocks[-1])(last_inner)) for feature, inner_block, layer_block in zip( x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1] ): if not inner_block: continue inner_top_down = F.interpolate(last_inner, scale_factor=2, mode="nearest") inner_lateral = getattr(self, inner_block)(feature) # TODO use size instead of scale to make it robust to different sizes # inner_top_down = F.upsample(last_inner, size=inner_lateral.shape[-2:], # mode='bilinear', align_corners=False) last_inner = inner_lateral + inner_top_down results.insert(0, getattr(self, layer_block)(last_inner)) if isinstance(self.top_blocks, LastLevelP6P7): last_results = self.top_blocks(x[-1], results[-1]) results.extend(last_results) elif isinstance(self.top_blocks, LastLevelMaxPool): last_results = self.top_blocks(results[-1]) results.extend(last_results) return tuple(results) class LastLevelMaxPool(nn.Module): def forward(self, x): return [F.max_pool2d(x, 1, 2, 0)] class LastLevelP6P7(nn.Module): """ This module is used in RetinaNet to generate extra layers, P6 and P7. 
""" def __init__(self, in_channels, out_channels): super(LastLevelP6P7, self).__init__() self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) for module in [self.p6, self.p7]: nn.init.kaiming_uniform_(module.weight, a=1) nn.init.constant_(module.bias, 0) self.use_P5 = in_channels == out_channels def forward(self, c5, p5): x = p5 if self.use_P5 else c5 p6 = self.p6(x) p7 = self.p7(F.relu(p6)) return [p6, p7] ================================================ FILE: maskrcnn_benchmark/modeling/backbone/mobilenet.py ================================================ # taken from https://github.com/tonylins/pytorch-mobilenet-v2/ # Published by Ji Lin, tonylins # licensed under the Apache License, Version 2.0, January 2004 from torch import nn from torch.nn import BatchNorm2d #from maskrcnn_benchmark.layers import FrozenBatchNorm2d as BatchNorm2d from maskrcnn_benchmark.layers import Conv2d def conv_bn(inp, oup, stride): return nn.Sequential( Conv2d(inp, oup, 3, stride, 1, bias=False), BatchNorm2d(oup), nn.ReLU6(inplace=True) ) def conv_1x1_bn(inp, oup): return nn.Sequential( Conv2d(inp, oup, 1, 1, 0, bias=False), BatchNorm2d(oup), nn.ReLU6(inplace=True) ) class InvertedResidual(nn.Module): def __init__(self, inp, oup, stride, expand_ratio): super(InvertedResidual, self).__init__() self.stride = stride assert stride in [1, 2] hidden_dim = int(round(inp * expand_ratio)) self.use_res_connect = self.stride == 1 and inp == oup if expand_ratio == 1: self.conv = nn.Sequential( # dw Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), BatchNorm2d(hidden_dim), nn.ReLU6(inplace=True), # pw-linear Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), BatchNorm2d(oup), ) else: self.conv = nn.Sequential( # pw Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), BatchNorm2d(hidden_dim), nn.ReLU6(inplace=True), # dw Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), BatchNorm2d(hidden_dim), nn.ReLU6(inplace=True), # pw-linear Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), BatchNorm2d(oup), ) def forward(self, x): if self.use_res_connect: return x + self.conv(x) else: return self.conv(x) class MobileNetV2(nn.Module): """ Should freeze bn """ def __init__(self, cfg, n_class=1000, input_size=224, width_mult=1.): super(MobileNetV2, self).__init__() block = InvertedResidual input_channel = 32 interverted_residual_setting = [ # t, c, n, s [1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2], [6, 320, 1, 1], ] # building first layer assert input_size % 32 == 0 input_channel = int(input_channel * width_mult) self.return_features_indices = [3, 6, 13, 17] self.return_features_num_channels = [] self.features = nn.ModuleList([conv_bn(3, input_channel, 2)]) # building inverted residual blocks for t, c, n, s in interverted_residual_setting: output_channel = int(c * width_mult) for i in range(n): if i == 0: self.features.append(block(input_channel, output_channel, s, expand_ratio=t)) else: self.features.append(block(input_channel, output_channel, 1, expand_ratio=t)) input_channel = output_channel if len(self.features) - 1 in self.return_features_indices: self.return_features_num_channels.append(output_channel) self._initialize_weights() self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT) def _freeze_backbone(self, freeze_at): for layer_index in range(freeze_at): for p in self.features[layer_index].parameters(): p.requires_grad = False def forward(self, x): res = [] for i, m in 
enumerate(self.features): x = m(x) if i in self.return_features_indices: res.append(x) return res def _initialize_weights(self): for m in self.modules(): if isinstance(m, Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, (2. / n) ** 0.5) if m.bias is not None: m.bias.data.zero_() elif isinstance(m, BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() elif isinstance(m, nn.Linear): n = m.weight.size(1) m.weight.data.normal_(0, 0.01) m.bias.data.zero_() ================================================ FILE: maskrcnn_benchmark/modeling/backbone/resnet.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ Variant of the resnet module that takes cfg as an argument. Example usage. Strings may be specified in the config file. model = ResNet( "StemWithFixedBatchNorm", "BottleneckWithFixedBatchNorm", "ResNet50StagesTo4", ) OR: model = ResNet( "StemWithGN", "BottleneckWithGN", "ResNet50StagesTo4", ) Custom implementations may be written in user code and hooked in via the `register_*` functions. """ from collections import namedtuple import torch import torch.nn.functional as F from torch import nn from maskrcnn_benchmark.layers import FrozenBatchNorm2d from maskrcnn_benchmark.layers import Conv2d from maskrcnn_benchmark.modeling.make_layers import group_norm from maskrcnn_benchmark.utils.registry import Registry # ResNet stage specification StageSpec = namedtuple( "StageSpec", [ "index", # Index of the stage, eg 1, 2, ..,. 5 "block_count", # Number of residual blocks in the stage "return_features", # True => return the last feature map from this stage ], ) # ----------------------------------------------------------------------------- # Standard ResNet models # ----------------------------------------------------------------------------- # ResNet-50 (including all stages) ResNet50StagesTo5 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, False), (4, 3, True)) ) # ResNet-50 up to stage 4 (excludes stage 5) ResNet50StagesTo4 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, True)) ) # ResNet-101 (including all stages) ResNet101StagesTo5 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, False), (4, 3, True)) ) # ResNet-101 up to stage 4 (excludes stage 5) ResNet101StagesTo4 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, True)) ) # ResNet-50-FPN (including all stages) ResNet50FPNStagesTo5 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 6, True), (4, 3, True)) ) # ResNet-101-FPN (including all stages) ResNet101FPNStagesTo5 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 23, True), (4, 3, True)) ) # ResNet-152-FPN (including all stages) ResNet152FPNStagesTo5 = tuple( StageSpec(index=i, block_count=c, return_features=r) for (i, c, r) in ((1, 3, True), (2, 8, True), (3, 36, True), (4, 3, True)) ) class ResNet(nn.Module): def __init__(self, cfg): super(ResNet, self).__init__() # If we want to use the cfg in forward(), then we should make a copy # of it and store it for later use: # self.cfg = cfg.clone() # Translate string names to implementations stem_module = 
_STEM_MODULES[cfg.MODEL.RESNETS.STEM_FUNC] stage_specs = _STAGE_SPECS[cfg.MODEL.BACKBONE.CONV_BODY] transformation_module = _TRANSFORMATION_MODULES[cfg.MODEL.RESNETS.TRANS_FUNC] # Construct the stem module self.stem = stem_module(cfg) # Construct the specified ResNet stages num_groups = cfg.MODEL.RESNETS.NUM_GROUPS width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS stage2_bottleneck_channels = num_groups * width_per_group stage2_out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS self.stages = [] self.return_features = {} for stage_spec in stage_specs: name = "layer" + str(stage_spec.index) stage2_relative_factor = 2 ** (stage_spec.index - 1) bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor out_channels = stage2_out_channels * stage2_relative_factor module = _make_stage( transformation_module, in_channels, bottleneck_channels, out_channels, stage_spec.block_count, num_groups, cfg.MODEL.RESNETS.STRIDE_IN_1X1, first_stride=int(stage_spec.index > 1) + 1, ) in_channels = out_channels self.add_module(name, module) self.stages.append(name) self.return_features[name] = stage_spec.return_features # Optionally freeze (requires_grad=False) parts of the backbone self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT) def _freeze_backbone(self, freeze_at): if freeze_at < 0: return for stage_index in range(freeze_at): if stage_index == 0: m = self.stem # stage 0 is the stem else: m = getattr(self, "layer" + str(stage_index)) for p in m.parameters(): p.requires_grad = False def forward(self, x): outputs = [] x = self.stem(x) for stage_name in self.stages: x = getattr(self, stage_name)(x) if self.return_features[stage_name]: outputs.append(x) return outputs class ResNetHead(nn.Module): def __init__( self, block_module, stages, num_groups=1, width_per_group=64, stride_in_1x1=True, stride_init=None, res2_out_channels=256, dilation=1 ): super(ResNetHead, self).__init__() stage2_relative_factor = 2 ** (stages[0].index - 1) stage2_bottleneck_channels = num_groups * width_per_group out_channels = res2_out_channels * stage2_relative_factor in_channels = out_channels // 2 bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor block_module = _TRANSFORMATION_MODULES[block_module] self.stages = [] stride = stride_init for stage in stages: name = "layer" + str(stage.index) if not stride: stride = int(stage.index > 1) + 1 module = _make_stage( block_module, in_channels, bottleneck_channels, out_channels, stage.block_count, num_groups, stride_in_1x1, first_stride=stride, dilation=dilation ) stride = None self.add_module(name, module) self.stages.append(name) self.out_channels = out_channels def forward(self, x): for stage in self.stages: x = getattr(self, stage)(x) return x def _make_stage( transformation_module, in_channels, bottleneck_channels, out_channels, block_count, num_groups, stride_in_1x1, first_stride, dilation=1 ): blocks = [] stride = first_stride for _ in range(block_count): blocks.append( transformation_module( in_channels, bottleneck_channels, out_channels, num_groups, stride_in_1x1, stride, dilation=dilation ) ) stride = 1 in_channels = out_channels return nn.Sequential(*blocks) class Bottleneck(nn.Module): def __init__( self, in_channels, bottleneck_channels, out_channels, num_groups, stride_in_1x1, stride, dilation, norm_func ): super(Bottleneck, self).__init__() self.downsample = None if in_channels != out_channels: down_stride = stride if dilation == 1 else 1 self.downsample = nn.Sequential( Conv2d(
in_channels, out_channels, kernel_size=1, stride=down_stride, bias=False ), norm_func(out_channels), ) for modules in [self.downsample,]: for l in modules.modules(): if isinstance(l, Conv2d): nn.init.kaiming_uniform_(l.weight, a=1) if dilation > 1: stride = 1 # reset to be 1 # The original MSRA ResNet models have stride in the first 1x1 conv # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have # stride in the 3x3 conv stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) self.conv1 = Conv2d( in_channels, bottleneck_channels, kernel_size=1, stride=stride_1x1, bias=False, ) self.bn1 = norm_func(bottleneck_channels) # TODO: specify init for the above self.conv2 = Conv2d( bottleneck_channels, bottleneck_channels, kernel_size=3, stride=stride_3x3, padding=dilation, bias=False, groups=num_groups, dilation=dilation ) self.bn2 = norm_func(bottleneck_channels) self.conv3 = Conv2d( bottleneck_channels, out_channels, kernel_size=1, bias=False ) self.bn3 = norm_func(out_channels) for l in [self.conv1, self.conv2, self.conv3,]: nn.init.kaiming_uniform_(l.weight, a=1) def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = F.relu_(out) out = self.conv2(out) out = self.bn2(out) out = F.relu_(out) out0 = self.conv3(out) out = self.bn3(out0) if self.downsample is not None: identity = self.downsample(x) out += identity out = F.relu_(out) return out class BaseStem(nn.Module): def __init__(self, cfg, norm_func): super(BaseStem, self).__init__() out_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS self.conv1 = Conv2d( 3, out_channels, kernel_size=7, stride=2, padding=3, bias=False ) self.bn1 = norm_func(out_channels) for l in [self.conv1,]: nn.init.kaiming_uniform_(l.weight, a=1) def forward(self, x): x = self.conv1(x) x = self.bn1(x) x = F.relu_(x) x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) return x class BottleneckWithFixedBatchNorm(Bottleneck): def __init__( self, in_channels, bottleneck_channels, out_channels, num_groups=1, stride_in_1x1=True, stride=1, dilation=1 ): super(BottleneckWithFixedBatchNorm, self).__init__( in_channels=in_channels, bottleneck_channels=bottleneck_channels, out_channels=out_channels, num_groups=num_groups, stride_in_1x1=stride_in_1x1, stride=stride, dilation=dilation, norm_func=FrozenBatchNorm2d ) class StemWithFixedBatchNorm(BaseStem): def __init__(self, cfg): super(StemWithFixedBatchNorm, self).__init__( cfg, norm_func=FrozenBatchNorm2d ) class BottleneckWithGN(Bottleneck): def __init__( self, in_channels, bottleneck_channels, out_channels, num_groups=1, stride_in_1x1=True, stride=1, dilation=1 ): super(BottleneckWithGN, self).__init__( in_channels=in_channels, bottleneck_channels=bottleneck_channels, out_channels=out_channels, num_groups=num_groups, stride_in_1x1=stride_in_1x1, stride=stride, dilation=dilation, norm_func=group_norm ) class StemWithGN(BaseStem): def __init__(self, cfg): super(StemWithGN, self).__init__(cfg, norm_func=group_norm) _TRANSFORMATION_MODULES = Registry({ "BottleneckWithFixedBatchNorm": BottleneckWithFixedBatchNorm, "BottleneckWithGN": BottleneckWithGN, }) _STEM_MODULES = Registry({ "StemWithFixedBatchNorm": StemWithFixedBatchNorm, "StemWithGN": StemWithGN, }) _STAGE_SPECS = Registry({ "R-50-C4": ResNet50StagesTo4, "R-50-C5": ResNet50StagesTo5, "R-101-C4": ResNet101StagesTo4, "R-101-C5": ResNet101StagesTo5, "R-50-FPN": ResNet50FPNStagesTo5, "R-50-FPN-RETINANET": ResNet50FPNStagesTo5, "R-101-FPN": ResNet101FPNStagesTo5, "R-101-FPN-RETINANET": ResNet101FPNStagesTo5, "R-152-FPN": 
ResNet152FPNStagesTo5, }) ================================================ FILE: maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch class BalancedPositiveNegativeSampler(object): """ This class samples batches, ensuring that they contain a fixed proportion of positives """ def __init__(self, batch_size_per_image, positive_fraction): """ Arguments: batch_size_per_image (int): number of elements to be selected per image positive_fraction (float): percentage of positive elements per batch """ self.batch_size_per_image = batch_size_per_image self.positive_fraction = positive_fraction def __call__(self, matched_idxs): """ Arguments: matched_idxs: list of tensors containing -1, 0 or positive values. Each tensor corresponds to a specific image. -1 values are ignored, 0 are considered as negatives and > 0 as positives. Returns: pos_idx (list[tensor]) neg_idx (list[tensor]) Returns two lists of binary masks for each image. The first list contains the positive elements that were selected, and the second list the negative examples. """ pos_idx = [] neg_idx = [] for matched_idxs_per_image in matched_idxs: positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1) negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1) num_pos = int(self.batch_size_per_image * self.positive_fraction) # protect against not enough positive examples num_pos = min(positive.numel(), num_pos) num_neg = self.batch_size_per_image - num_pos # protect against not enough negative examples num_neg = min(negative.numel(), num_neg) # randomly select positive and negative examples perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] pos_idx_per_image = positive[perm1] neg_idx_per_image = negative[perm2] # create binary mask from indices pos_idx_per_image_mask = torch.zeros_like( matched_idxs_per_image, dtype=torch.uint8 ) neg_idx_per_image_mask = torch.zeros_like( matched_idxs_per_image, dtype=torch.uint8 ) pos_idx_per_image_mask[pos_idx_per_image] = 1 neg_idx_per_image_mask[neg_idx_per_image] = 1 pos_idx.append(pos_idx_per_image_mask) neg_idx.append(neg_idx_per_image_mask) return pos_idx, neg_idx ================================================ FILE: maskrcnn_benchmark/modeling/box_coder.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import math import torch class BoxCoder(object): """ This class encodes and decodes a set of bounding boxes into the representation used for training the regressors. """ def __init__(self, weights, bbox_xform_clip=math.log(1000.
/ 16)): """ Arguments: weights (4-element tuple) bbox_xform_clip (float) """ self.weights = weights self.bbox_xform_clip = bbox_xform_clip def encode(self, reference_boxes, proposals): """ Encode a set of proposals with respect to some reference boxes Arguments: reference_boxes (Tensor): reference boxes proposals (Tensor): boxes to be encoded """ TO_REMOVE = 1 # TODO remove ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights wx, wy, ww, wh = self.weights targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights targets_dw = ww * torch.log(gt_widths / ex_widths) targets_dh = wh * torch.log(gt_heights / ex_heights) targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) return targets def decode(self, rel_codes, boxes): """ From a set of original boxes and encoded relative box offsets, get the decoded boxes. Arguments: rel_codes (Tensor): encoded boxes boxes (Tensor): reference boxes. """ boxes = boxes.to(rel_codes.dtype) TO_REMOVE = 1 # TODO remove widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE ctr_x = boxes[:, 0] + 0.5 * widths ctr_y = boxes[:, 1] + 0.5 * heights wx, wy, ww, wh = self.weights dx = rel_codes[:, 0::4] / wx dy = rel_codes[:, 1::4] / wy dw = rel_codes[:, 2::4] / ww dh = rel_codes[:, 3::4] / wh # Prevent sending too large values into torch.exp() dw = torch.clamp(dw, max=self.bbox_xform_clip) dh = torch.clamp(dh, max=self.bbox_xform_clip) pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] pred_w = torch.exp(dw) * widths[:, None] pred_h = torch.exp(dh) * heights[:, None] pred_boxes = torch.zeros_like(rel_codes) # x1 pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # y1 pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 return pred_boxes ================================================ FILE: maskrcnn_benchmark/modeling/detector/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .detectors import build_detection_model ================================================ FILE: maskrcnn_benchmark/modeling/detector/detectors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .generalized_rcnn import GeneralizedRCNN _DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN} def build_detection_model(cfg): meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE] return meta_arch(cfg) ================================================ FILE: maskrcnn_benchmark/modeling/detector/generalized_rcnn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
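# A hypothetical usage sketch (not part of the original file): a model produced by
# build_detection_model(cfg) above is typically driven as follows, assuming a valid
# config path such as "configs/e2e_faster_rcnn_R_50_FPN_1x.yaml".
#
#     from maskrcnn_benchmark.config import cfg
#     from maskrcnn_benchmark.modeling.detector import build_detection_model
#
#     cfg.merge_from_file("configs/e2e_faster_rcnn_R_50_FPN_1x.yaml")
#     model = build_detection_model(cfg)
#     model.eval()            # inference: model(images) -> list[BoxList]
#     # model.train()         # training: model(images, targets) -> dict of losses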
""" Implements the Generalized R-CNN framework """ import torch from torch import nn from maskrcnn_benchmark.structures.image_list import to_image_list from ..backbone import build_backbone from ..rpn.rpn import build_rpn from ..roi_heads.roi_heads import build_roi_heads class GeneralizedRCNN(nn.Module): """ Main class for Generalized R-CNN. Currently supports boxes and masks. It consists of three main parts: - backbone - rpn - heads: takes the features + the proposals from the RPN and computes detections / masks from it. """ def __init__(self, cfg): super(GeneralizedRCNN, self).__init__() self.backbone = build_backbone(cfg) self.rpn = build_rpn(cfg, self.backbone.out_channels) self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels) def forward(self, images, targets=None): """ Arguments: images (list[Tensor] or ImageList): images to be processed targets (list[BoxList]): ground-truth boxes present in the image (optional) Returns: result (list[BoxList] or dict[Tensor]): the output from the model. During training, it returns a dict[Tensor] which contains the losses. During testing, it returns list[BoxList] contains additional fields like `scores`, `labels` and `mask` (for Mask R-CNN models). """ if self.training and targets is None: raise ValueError("In training mode, targets should be passed") images = to_image_list(images) features = self.backbone(images.tensors) proposals, proposal_losses = self.rpn(images, features, targets) if self.roi_heads: x, result, detector_losses = self.roi_heads(features, proposals, targets) else: # RPN-only models don't have roi_heads x = features result = proposals detector_losses = {} if self.training: losses = {} losses.update(detector_losses) losses.update(proposal_losses) return losses return result ================================================ FILE: maskrcnn_benchmark/modeling/make_layers.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ Miscellaneous utility functions """ import torch from torch import nn from torch.nn import functional as F from maskrcnn_benchmark.config import cfg from maskrcnn_benchmark.layers import Conv2d from maskrcnn_benchmark.modeling.poolers import Pooler def get_group_gn(dim, dim_per_gp, num_groups): """get number of groups used by GroupNorm, based on number of channels.""" assert dim_per_gp == -1 or num_groups == -1, \ "GroupNorm: can only specify G or C/G." 
if dim_per_gp > 0: assert dim % dim_per_gp == 0, \ "dim: {}, dim_per_gp: {}".format(dim, dim_per_gp) group_gn = dim // dim_per_gp else: assert dim % num_groups == 0, \ "dim: {}, num_groups: {}".format(dim, num_groups) group_gn = num_groups return group_gn def group_norm(out_channels, affine=True, divisor=1): out_channels = out_channels // divisor dim_per_gp = cfg.MODEL.GROUP_NORM.DIM_PER_GP // divisor num_groups = cfg.MODEL.GROUP_NORM.NUM_GROUPS // divisor eps = cfg.MODEL.GROUP_NORM.EPSILON # default: 1e-5 return torch.nn.GroupNorm( get_group_gn(out_channels, dim_per_gp, num_groups), out_channels, eps, affine ) def make_conv3x3( in_channels, out_channels, dilation=1, stride=1, use_gn=False, use_relu=False, kaiming_init=True ): conv = Conv2d( in_channels, out_channels, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, bias=False if use_gn else True ) if kaiming_init: nn.init.kaiming_normal_( conv.weight, mode="fan_out", nonlinearity="relu" ) else: torch.nn.init.normal_(conv.weight, std=0.01) if not use_gn: nn.init.constant_(conv.bias, 0) module = [conv,] if use_gn: module.append(group_norm(out_channels)) if use_relu: module.append(nn.ReLU(inplace=True)) if len(module) > 1: return nn.Sequential(*module) return conv def make_fc(dim_in, hidden_dim, use_gn=False): ''' Caffe2 implementation uses XavierFill, which in fact corresponds to kaiming_uniform_ in PyTorch ''' if use_gn: fc = nn.Linear(dim_in, hidden_dim, bias=False) nn.init.kaiming_uniform_(fc.weight, a=1) return nn.Sequential(fc, group_norm(hidden_dim)) fc = nn.Linear(dim_in, hidden_dim) nn.init.kaiming_uniform_(fc.weight, a=1) nn.init.constant_(fc.bias, 0) return fc def conv_with_kaiming_uniform(use_gn=False, use_relu=False): def make_conv( in_channels, out_channels, kernel_size, stride=1, dilation=1 ): conv = Conv2d( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=dilation * (kernel_size - 1) // 2, dilation=dilation, bias=False if use_gn else True ) # Caffe2 implementation uses XavierFill, which in fact # corresponds to kaiming_uniform_ in PyTorch nn.init.kaiming_uniform_(conv.weight, a=1) if not use_gn: nn.init.constant_(conv.bias, 0) module = [conv,] if use_gn: module.append(group_norm(out_channels)) if use_relu: module.append(nn.ReLU(inplace=True)) if len(module) > 1: return nn.Sequential(*module) return conv return make_conv ================================================ FILE: maskrcnn_benchmark/modeling/matcher.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch class Matcher(object): """ This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will have exactly zero or one matches; each ground-truth element may be assigned to zero or more predicted elements. Matching is based on the MxN match_quality_matrix, that characterizes how well each (ground-truth, predicted)-pair match. For example, if the elements are boxes, the matrix may contain box IoU overlap values. The matcher returns a tensor of size N containing the index of the ground-truth element m that matches to prediction n. If there is no match, a negative value is returned. """ BELOW_LOW_THRESHOLD = -1 BETWEEN_THRESHOLDS = -2 def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False): """ Args: high_threshold (float): quality values greater than or equal to this value are candidate matches. 
low_threshold (float): a lower quality threshold used to stratify matches into three levels: 1) matches >= high_threshold 2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold) 3) BELOW_LOW_THRESHOLD matches in [0, low_threshold) allow_low_quality_matches (bool): if True, produce additional matches for predictions that have only low-quality match candidates. See set_low_quality_matches_ for more details. """ assert low_threshold <= high_threshold self.high_threshold = high_threshold self.low_threshold = low_threshold self.allow_low_quality_matches = allow_low_quality_matches def __call__(self, match_quality_matrix): """ Args: match_quality_matrix (Tensor[float]): an MxN tensor, containing the pairwise quality between M ground-truth elements and N predicted elements. Returns: matches (Tensor[int64]): an N tensor where N[i] is a matched gt in [0, M - 1] or a negative value indicating that prediction i could not be matched. """ if match_quality_matrix.numel() == 0: # empty targets or proposals not supported during training if match_quality_matrix.shape[0] == 0: raise ValueError( "No ground-truth boxes available for one of the images " "during training") else: raise ValueError( "No proposal boxes available for one of the images " "during training") # match_quality_matrix is M (gt) x N (predicted) # Max over gt elements (dim 0) to find best gt candidate for each prediction matched_vals, matches = match_quality_matrix.max(dim=0) if self.allow_low_quality_matches: all_matches = matches.clone() # Assign candidate matches with low quality to negative (unassigned) values below_low_threshold = matched_vals < self.low_threshold between_thresholds = (matched_vals >= self.low_threshold) & ( matched_vals < self.high_threshold ) matches[below_low_threshold] = Matcher.BELOW_LOW_THRESHOLD matches[between_thresholds] = Matcher.BETWEEN_THRESHOLDS if self.allow_low_quality_matches: self.set_low_quality_matches_(matches, all_matches, match_quality_matrix) return matches def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix): """ Produce additional matches for predictions that have only low-quality matches. Specifically, for each ground-truth find the set of predictions that have maximum overlap with it (including ties); for each prediction in that set, if it is unmatched, then match it to the ground-truth with which it has the highest quality value. """ # For each gt, find the prediction with which it has highest quality highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) # Find highest quality match available, even if it is low, including ties gt_pred_pairs_of_highest_quality = torch.nonzero( match_quality_matrix == highest_quality_foreach_gt[:, None] ) # Example gt_pred_pairs_of_highest_quality: # tensor([[ 0, 39796], # [ 1, 32055], # [ 1, 32070], # [ 2, 39190], # [ 2, 40255], # [ 3, 40390], # [ 3, 41455], # [ 4, 45470], # [ 5, 45325], # [ 5, 46390]]) # Each row is a (gt index, prediction index) # Note how gt items 1, 2, 3, and 5 each have two ties pred_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1] matches[pred_inds_to_update] = all_matches[pred_inds_to_update] ================================================ FILE: maskrcnn_benchmark/modeling/poolers.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
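# LevelMapper below implements Eqn. (1) of the FPN paper,
#     k = floor(k0 + log2(sqrt(w * h) / 224)), clamped to [k_min, k_max].
# For example, with the canonical level k0 = 4, a 112x112 RoI maps to
# floor(4 + log2(112 / 224)) = 3, i.e. one level finer than the canonical
# 224x224 scale; the trailing "- self.k_min" shifts the ids to start at 0 so
# they index the per-level pooler list directly.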
import torch import torch.nn.functional as F from torch import nn from maskrcnn_benchmark.layers import ROIAlign from .utils import cat class LevelMapper(object): """Determine which FPN level each RoI in a set of RoIs should map to based on the heuristic in the FPN paper. """ def __init__(self, k_min, k_max, canonical_scale=224, canonical_level=4, eps=1e-6): """ Arguments: k_min (int) k_max (int) canonical_scale (int) canonical_level (int) eps (float) """ self.k_min = k_min self.k_max = k_max self.s0 = canonical_scale self.lvl0 = canonical_level self.eps = eps def __call__(self, boxlists): """ Arguments: boxlists (list[BoxList]) """ # Compute level ids s = torch.sqrt(cat([boxlist.area() for boxlist in boxlists])) # Eqn.(1) in FPN paper target_lvls = torch.floor(self.lvl0 + torch.log2(s / self.s0 + self.eps)) target_lvls = torch.clamp(target_lvls, min=self.k_min, max=self.k_max) return target_lvls.to(torch.int64) - self.k_min class Pooler(nn.Module): """ Pooler for Detection with or without FPN. It currently hard-codes ROIAlign in the implementation, but that can be made more generic later on. Also, the requirement of passing the scales is not strictly necessary, as they can be inferred from the size of the feature map / size of original image, which is available thanks to the BoxList. """ def __init__(self, output_size, scales, sampling_ratio): """ Arguments: output_size (list[tuple[int]] or list[int]): output size for the pooled region scales (list[float]): scales for each Pooler sampling_ratio (int): sampling ratio for ROIAlign """ super(Pooler, self).__init__() poolers = [] for scale in scales: poolers.append( ROIAlign( output_size, spatial_scale=scale, sampling_ratio=sampling_ratio ) ) self.poolers = nn.ModuleList(poolers) self.output_size = output_size # get the levels in the feature map by leveraging the fact that the network always # downsamples by a factor of 2 at each level. lvl_min = -torch.log2(torch.tensor(scales[0], dtype=torch.float32)).item() lvl_max = -torch.log2(torch.tensor(scales[-1], dtype=torch.float32)).item() self.map_levels = LevelMapper(lvl_min, lvl_max) def convert_to_roi_format(self, boxes): concat_boxes = cat([b.bbox for b in boxes], dim=0) device, dtype = concat_boxes.device, concat_boxes.dtype ids = cat( [ torch.full((len(b), 1), i, dtype=dtype, device=device) for i, b in enumerate(boxes) ], dim=0, ) rois = torch.cat([ids, concat_boxes], dim=1) return rois def forward(self, x, boxes): """ Arguments: x (list[Tensor]): feature maps for each level boxes (list[BoxList]): boxes to be used to perform the pooling operation.
Returns: result (Tensor) """ num_levels = len(self.poolers) rois = self.convert_to_roi_format(boxes) if num_levels == 1: return self.poolers[0](x[0], rois) levels = self.map_levels(boxes) num_rois = len(rois) num_channels = x[0].shape[1] output_size = self.output_size[0] dtype, device = x[0].dtype, x[0].device result = torch.zeros( (num_rois, num_channels, output_size, output_size), dtype=dtype, device=device, ) for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)): idx_in_level = torch.nonzero(levels == level).squeeze(1) rois_per_level = rois[idx_in_level] result[idx_in_level] = pooler(per_level_feature, rois_per_level) return result def make_pooler(cfg, head_name): resolution = cfg.MODEL[head_name].POOLER_RESOLUTION scales = cfg.MODEL[head_name].POOLER_SCALES sampling_ratio = cfg.MODEL[head_name].POOLER_SAMPLING_RATIO pooler = Pooler( output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, ) return pooler ================================================ FILE: maskrcnn_benchmark/modeling/registry.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from maskrcnn_benchmark.utils.registry import Registry BACKBONES = Registry() RPN_HEADS = Registry() ROI_BOX_FEATURE_EXTRACTORS = Registry() ROI_BOX_PREDICTOR = Registry() ROI_KEYPOINT_FEATURE_EXTRACTORS = Registry() ROI_KEYPOINT_PREDICTOR = Registry() ROI_MASK_FEATURE_EXTRACTORS = Registry() ROI_MASK_PREDICTOR = Registry() ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from torch import nn from .roi_box_feature_extractors import make_roi_box_feature_extractor from .roi_box_predictors import make_roi_box_predictor from .inference import make_roi_box_post_processor from .loss import make_roi_box_loss_evaluator class ROIBoxHead(torch.nn.Module): """ Generic Box Head class. """ def __init__(self, cfg, in_channels): super(ROIBoxHead, self).__init__() self.feature_extractor = make_roi_box_feature_extractor(cfg, in_channels) self.predictor = make_roi_box_predictor( cfg, self.feature_extractor.out_channels) self.post_processor = make_roi_box_post_processor(cfg) self.loss_evaluator = make_roi_box_loss_evaluator(cfg) def forward(self, features, proposals, targets=None): """ Arguments: features (list[Tensor]): feature-maps from possibly several levels proposals (list[BoxList]): proposal boxes targets (list[BoxList], optional): the ground-truth targets. Returns: x (Tensor): the result of the feature extractor proposals (list[BoxList]): during training, the subsampled proposals are returned. During testing, the predicted boxlists are returned losses (dict[Tensor]): During training, returns the losses for the head. During testing, returns an empty dict. """ if self.training: # Faster R-CNN subsamples during training the proposals with a fixed # positive / negative ratio with torch.no_grad(): proposals = self.loss_evaluator.subsample(proposals, targets) # extract features that will be fed to the final classifier. 
The # feature_extractor generally corresponds to the pooler + heads x = self.feature_extractor(features, proposals) # final classifier that converts the features into predictions class_logits, box_regression = self.predictor(x) if not self.training: result = self.post_processor((class_logits, box_regression), proposals) return x, result, {} loss_classifier, loss_box_reg = self.loss_evaluator( [class_logits], [box_regression] ) return ( x, proposals, dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg), ) def build_roi_box_head(cfg, in_channels): """ Constructs a new box head. By default, uses ROIBoxHead, but if it turns out not to be enough, just register a new class and make it a parameter in the config """ return ROIBoxHead(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch import torch.nn.functional as F from torch import nn from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist from maskrcnn_benchmark.modeling.box_coder import BoxCoder class PostProcessor(nn.Module): """ From a set of classification scores, box regression and proposals, computes the post-processed boxes, and applies NMS to obtain the final results """ def __init__( self, score_thresh=0.05, nms=0.5, detections_per_img=100, box_coder=None, cls_agnostic_bbox_reg=False ): """ Arguments: score_thresh (float) nms (float) detections_per_img (int) box_coder (BoxCoder) """ super(PostProcessor, self).__init__() self.score_thresh = score_thresh self.nms = nms self.detections_per_img = detections_per_img if box_coder is None: box_coder = BoxCoder(weights=(10., 10., 5., 5.)) self.box_coder = box_coder self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg def forward(self, x, boxes): """ Arguments: x (tuple[tensor, tensor]): x contains the class logits and the box_regression from the model. 
boxes (list[BoxList]): bounding boxes that are used as reference, one for each image Returns: results (list[BoxList]): one BoxList for each image, containing the extra fields labels and scores """ class_logits, box_regression = x class_prob = F.softmax(class_logits, -1) # TODO think about a representation of batch of boxes image_shapes = [box.size for box in boxes] boxes_per_image = [len(box) for box in boxes] concat_boxes = torch.cat([a.bbox for a in boxes], dim=0) if self.cls_agnostic_bbox_reg: box_regression = box_regression[:, -4:] proposals = self.box_coder.decode( box_regression.view(sum(boxes_per_image), -1), concat_boxes ) if self.cls_agnostic_bbox_reg: proposals = proposals.repeat(1, class_prob.shape[1]) num_classes = class_prob.shape[1] proposals = proposals.split(boxes_per_image, dim=0) class_prob = class_prob.split(boxes_per_image, dim=0) results = [] for prob, boxes_per_img, image_shape in zip( class_prob, proposals, image_shapes ): boxlist = self.prepare_boxlist(boxes_per_img, prob, image_shape) boxlist = boxlist.clip_to_image(remove_empty=False) boxlist = self.filter_results(boxlist, num_classes) results.append(boxlist) return results def prepare_boxlist(self, boxes, scores, image_shape): """ Returns BoxList from `boxes` and adds probability scores information as an extra field `boxes` has shape (#detections, 4 * #classes), where each row represents a list of predicted bounding boxes for each of the object classes in the dataset (including the background class). The detections in each row originate from the same object proposal. `scores` has shape (#detections, #classes), where each row represents a list of object detection confidence scores for each of the object classes in the dataset (including the background class). `scores[i, j]` corresponds to the box at `boxes[i, j * 4:(j + 1) * 4]`. """ boxes = boxes.reshape(-1, 4) scores = scores.reshape(-1) boxlist = BoxList(boxes, image_shape, mode="xyxy") boxlist.add_field("scores", scores) return boxlist def filter_results(self, boxlist, num_classes): """Returns bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). """ # unwrap the boxlist to avoid additional overhead.
# if we had multi-class NMS, we could perform this directly on the boxlist boxes = boxlist.bbox.reshape(-1, num_classes * 4) scores = boxlist.get_field("scores").reshape(-1, num_classes) device = scores.device result = [] # Apply threshold on detection probabilities and apply NMS # Skip j = 0, because it's the background class inds_all = scores > self.score_thresh for j in range(1, num_classes): inds = inds_all[:, j].nonzero().squeeze(1) scores_j = scores[inds, j] boxes_j = boxes[inds, j * 4 : (j + 1) * 4] boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") boxlist_for_class.add_field("scores", scores_j) boxlist_for_class = boxlist_nms( boxlist_for_class, self.nms ) num_labels = len(boxlist_for_class) boxlist_for_class.add_field( "labels", torch.full((num_labels,), j, dtype=torch.int64, device=device) ) result.append(boxlist_for_class) result = cat_boxlist(result) number_of_detections = len(result) # Limit to max_per_image detections **over all classes** if number_of_detections > self.detections_per_img > 0: cls_scores = result.get_field("scores") image_thresh, _ = torch.kthvalue( cls_scores.cpu(), number_of_detections - self.detections_per_img + 1 ) keep = cls_scores >= image_thresh.item() keep = torch.nonzero(keep).squeeze(1) result = result[keep] return result def make_roi_box_post_processor(cfg): use_fpn = cfg.MODEL.ROI_HEADS.USE_FPN bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS box_coder = BoxCoder(weights=bbox_reg_weights) score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH nms_thresh = cfg.MODEL.ROI_HEADS.NMS detections_per_img = cfg.MODEL.ROI_HEADS.DETECTIONS_PER_IMG cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG postprocessor = PostProcessor( score_thresh, nms_thresh, detections_per_img, box_coder, cls_agnostic_bbox_reg ) return postprocessor ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from torch.nn import functional as F from maskrcnn_benchmark.layers import smooth_l1_loss from maskrcnn_benchmark.modeling.box_coder import BoxCoder from maskrcnn_benchmark.modeling.matcher import Matcher from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import ( BalancedPositiveNegativeSampler ) from maskrcnn_benchmark.modeling.utils import cat class FastRCNNLossComputation(object): """ Computes the loss for Faster R-CNN. 
    Also supports FPN
    """

    def __init__(
        self,
        proposal_matcher,
        fg_bg_sampler,
        box_coder,
        cls_agnostic_bbox_reg=False
    ):
        """
        Arguments:
            proposal_matcher (Matcher)
            fg_bg_sampler (BalancedPositiveNegativeSampler)
            box_coder (BoxCoder)
        """
        self.proposal_matcher = proposal_matcher
        self.fg_bg_sampler = fg_bg_sampler
        self.box_coder = box_coder
        self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg

    def match_targets_to_proposals(self, proposal, target):
        match_quality_matrix = boxlist_iou(target, proposal)
        matched_idxs = self.proposal_matcher(match_quality_matrix)
        # Fast R-CNN only needs the "labels" field for selecting the targets
        target = target.copy_with_fields("labels")
        # get the ground-truth target corresponding to each proposal
        # NB: need to clamp the indices because we can have a single
        # GT in the image, and matched_idxs can be -2, which goes
        # out of bounds
        matched_targets = target[matched_idxs.clamp(min=0)]
        matched_targets.add_field("matched_idxs", matched_idxs)
        return matched_targets

    def prepare_targets(self, proposals, targets):
        labels = []
        regression_targets = []
        for proposals_per_image, targets_per_image in zip(proposals, targets):
            matched_targets = self.match_targets_to_proposals(
                proposals_per_image, targets_per_image
            )
            matched_idxs = matched_targets.get_field("matched_idxs")

            labels_per_image = matched_targets.get_field("labels")
            labels_per_image = labels_per_image.to(dtype=torch.int64)

            # Label background (below the low threshold)
            bg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD
            labels_per_image[bg_inds] = 0

            # Label ignore proposals (between low and high thresholds)
            ignore_inds = matched_idxs == Matcher.BETWEEN_THRESHOLDS
            labels_per_image[ignore_inds] = -1  # -1 is ignored by sampler

            # compute regression targets
            regression_targets_per_image = self.box_coder.encode(
                matched_targets.bbox, proposals_per_image.bbox
            )

            labels.append(labels_per_image)
            regression_targets.append(regression_targets_per_image)

        return labels, regression_targets

    def subsample(self, proposals, targets):
        """
        This method performs the positive/negative sampling and returns
        the sampled proposals.
        Note: this function keeps a state.

        Arguments:
            proposals (list[BoxList])
            targets (list[BoxList])
        """
        labels, regression_targets = self.prepare_targets(proposals, targets)
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)

        proposals = list(proposals)
        # add corresponding label and regression_targets information to the bounding boxes
        for labels_per_image, regression_targets_per_image, proposals_per_image in zip(
            labels, regression_targets, proposals
        ):
            proposals_per_image.add_field("labels", labels_per_image)
            proposals_per_image.add_field(
                "regression_targets", regression_targets_per_image
            )

        # distribute the sampled proposals, that were obtained on all feature maps
        # concatenated via the fg_bg_sampler, into individual feature map levels
        for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
            zip(sampled_pos_inds, sampled_neg_inds)
        ):
            img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
            proposals_per_image = proposals[img_idx][img_sampled_inds]
            proposals[img_idx] = proposals_per_image

        self._proposals = proposals
        return proposals

    def __call__(self, class_logits, box_regression):
        """
        Computes the loss for Faster R-CNN.
        This requires that the subsample method has been called beforehand.
Arguments: class_logits (list[Tensor]) box_regression (list[Tensor]) Returns: classification_loss (Tensor) box_loss (Tensor) """ class_logits = cat(class_logits, dim=0) box_regression = cat(box_regression, dim=0) device = class_logits.device if not hasattr(self, "_proposals"): raise RuntimeError("subsample needs to be called before") proposals = self._proposals labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) regression_targets = cat( [proposal.get_field("regression_targets") for proposal in proposals], dim=0 ) classification_loss = F.cross_entropy(class_logits, labels) # get indices that correspond to the regression targets for # the corresponding ground truth labels, to be used with # advanced indexing sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1) labels_pos = labels[sampled_pos_inds_subset] if self.cls_agnostic_bbox_reg: map_inds = torch.tensor([4, 5, 6, 7], device=device) else: map_inds = 4 * labels_pos[:, None] + torch.tensor( [0, 1, 2, 3], device=device) box_loss = smooth_l1_loss( box_regression[sampled_pos_inds_subset[:, None], map_inds], regression_targets[sampled_pos_inds_subset], size_average=False, beta=1, ) box_loss = box_loss / labels.numel() return classification_loss, box_loss def make_roi_box_loss_evaluator(cfg): matcher = Matcher( cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, allow_low_quality_matches=False, ) bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS box_coder = BoxCoder(weights=bbox_reg_weights) fg_bg_sampler = BalancedPositiveNegativeSampler( cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION ) cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG loss_evaluator = FastRCNNLossComputation( matcher, fg_bg_sampler, box_coder, cls_agnostic_bbox_reg ) return loss_evaluator ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
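# A minimal, self-contained sketch (toy values, not part of the original
# files) of two indexing tricks used above: the class-specific regression
# column selection in FastRCNNLossComputation.__call__, and the per-image
# detection cap applied with torch.kthvalue in PostProcessor.filter_results.
#
# >>> import torch
# >>> # each positive label j selects regression columns [4j, 4j+1, 4j+2, 4j+3]
# >>> labels_pos = torch.tensor([1, 3])
# >>> 4 * labels_pos[:, None] + torch.tensor([0, 1, 2, 3])
# tensor([[ 4,  5,  6,  7],
#         [12, 13, 14, 15]])
# >>> # keeping the top-k scores via the (n - k + 1)-th smallest value;
# >>> # note that ties at the threshold can keep slightly more than k boxes
# >>> cls_scores = torch.tensor([0.9, 0.8, 0.7, 0.6, 0.5])
# >>> image_thresh, _ = torch.kthvalue(cls_scores, 5 - 3 + 1)
# >>> cls_scores[cls_scores >= image_thresh.item()]
# tensor([0.9000, 0.8000, 0.7000])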
import torch from torch import nn from torch.nn import functional as F from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.modeling.backbone import resnet from maskrcnn_benchmark.modeling.poolers import Pooler from maskrcnn_benchmark.modeling.make_layers import group_norm from maskrcnn_benchmark.modeling.make_layers import make_fc @registry.ROI_BOX_FEATURE_EXTRACTORS.register("ResNet50Conv5ROIFeatureExtractor") class ResNet50Conv5ROIFeatureExtractor(nn.Module): def __init__(self, config, in_channels): super(ResNet50Conv5ROIFeatureExtractor, self).__init__() resolution = config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION scales = config.MODEL.ROI_BOX_HEAD.POOLER_SCALES sampling_ratio = config.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO pooler = Pooler( output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, ) stage = resnet.StageSpec(index=4, block_count=3, return_features=False) head = resnet.ResNetHead( block_module=config.MODEL.RESNETS.TRANS_FUNC, stages=(stage,), num_groups=config.MODEL.RESNETS.NUM_GROUPS, width_per_group=config.MODEL.RESNETS.WIDTH_PER_GROUP, stride_in_1x1=config.MODEL.RESNETS.STRIDE_IN_1X1, stride_init=None, res2_out_channels=config.MODEL.RESNETS.RES2_OUT_CHANNELS, dilation=config.MODEL.RESNETS.RES5_DILATION ) self.pooler = pooler self.head = head self.out_channels = head.out_channels def forward(self, x, proposals): x = self.pooler(x, proposals) x = self.head(x) return x @registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPN2MLPFeatureExtractor") class FPN2MLPFeatureExtractor(nn.Module): """ Heads for FPN for classification """ def __init__(self, cfg, in_channels): super(FPN2MLPFeatureExtractor, self).__init__() resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO pooler = Pooler( output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, ) input_size = in_channels * resolution ** 2 representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN self.pooler = pooler self.fc6 = make_fc(input_size, representation_size, use_gn) self.fc7 = make_fc(representation_size, representation_size, use_gn) self.out_channels = representation_size def forward(self, x, proposals): x = self.pooler(x, proposals) x = x.view(x.size(0), -1) x = F.relu(self.fc6(x)) x = F.relu(self.fc7(x)) return x @registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPNXconv1fcFeatureExtractor") class FPNXconv1fcFeatureExtractor(nn.Module): """ Heads for FPN for classification """ def __init__(self, cfg, in_channels): super(FPNXconv1fcFeatureExtractor, self).__init__() resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO pooler = Pooler( output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, ) self.pooler = pooler use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN conv_head_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM num_stacked_convs = cfg.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS dilation = cfg.MODEL.ROI_BOX_HEAD.DILATION xconvs = [] for ix in range(num_stacked_convs): xconvs.append( nn.Conv2d( in_channels, conv_head_dim, kernel_size=3, stride=1, padding=dilation, dilation=dilation, bias=False if use_gn else True ) ) in_channels = conv_head_dim if use_gn: xconvs.append(group_norm(in_channels)) xconvs.append(nn.ReLU(inplace=True)) self.add_module("xconvs", nn.Sequential(*xconvs)) for modules in 
[self.xconvs,]: for l in modules.modules(): if isinstance(l, nn.Conv2d): torch.nn.init.normal_(l.weight, std=0.01) if not use_gn: torch.nn.init.constant_(l.bias, 0) input_size = conv_head_dim * resolution ** 2 representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM self.fc6 = make_fc(input_size, representation_size, use_gn=False) self.out_channels = representation_size def forward(self, x, proposals): x = self.pooler(x, proposals) x = self.xconvs(x) x = x.view(x.size(0), -1) x = F.relu(self.fc6(x)) return x def make_roi_box_feature_extractor(cfg, in_channels): func = registry.ROI_BOX_FEATURE_EXTRACTORS[ cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR ] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from maskrcnn_benchmark.modeling import registry from torch import nn @registry.ROI_BOX_PREDICTOR.register("FastRCNNPredictor") class FastRCNNPredictor(nn.Module): def __init__(self, config, in_channels): super(FastRCNNPredictor, self).__init__() assert in_channels is not None num_inputs = in_channels num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES self.avgpool = nn.AdaptiveAvgPool2d(1) self.cls_score = nn.Linear(num_inputs, num_classes) num_bbox_reg_classes = 2 if config.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes self.bbox_pred = nn.Linear(num_inputs, num_bbox_reg_classes * 4) nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) nn.init.constant_(self.cls_score.bias, 0) nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001) nn.init.constant_(self.bbox_pred.bias, 0) def forward(self, x): x = self.avgpool(x) x = x.view(x.size(0), -1) cls_logit = self.cls_score(x) bbox_pred = self.bbox_pred(x) return cls_logit, bbox_pred @registry.ROI_BOX_PREDICTOR.register("FPNPredictor") class FPNPredictor(nn.Module): def __init__(self, cfg, in_channels): super(FPNPredictor, self).__init__() num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES representation_size = in_channels self.cls_score = nn.Linear(representation_size, num_classes) num_bbox_reg_classes = 2 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes self.bbox_pred = nn.Linear(representation_size, num_bbox_reg_classes * 4) nn.init.normal_(self.cls_score.weight, std=0.01) nn.init.normal_(self.bbox_pred.weight, std=0.001) for l in [self.cls_score, self.bbox_pred]: nn.init.constant_(l.bias, 0) def forward(self, x): if x.ndimension() == 4: assert list(x.shape[2:]) == [1, 1] x = x.view(x.size(0), -1) scores = self.cls_score(x) bbox_deltas = self.bbox_pred(x) return scores, bbox_deltas def make_roi_box_predictor(cfg, in_channels): func = registry.ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/inference.py ================================================ import torch from torch import nn class KeypointPostProcessor(nn.Module): def __init__(self, keypointer=None): super(KeypointPostProcessor, self).__init__() self.keypointer = keypointer def forward(self, x, boxes): mask_prob = x scores = None if self.keypointer: mask_prob, scores = self.keypointer(x, boxes) assert len(boxes) == 1, "Only non-batched inference supported for 
now" boxes_per_image = [box.bbox.size(0) for box in boxes] mask_prob = mask_prob.split(boxes_per_image, dim=0) scores = scores.split(boxes_per_image, dim=0) results = [] for prob, box, score in zip(mask_prob, boxes, scores): bbox = BoxList(box.bbox, box.size, mode="xyxy") for field in box.fields(): bbox.add_field(field, box.get_field(field)) prob = PersonKeypoints(prob, box.size) prob.add_field("logits", score) bbox.add_field("keypoints", prob) results.append(bbox) return results # TODO remove and use only the Keypointer import numpy as np import cv2 def heatmaps_to_keypoints(maps, rois): """Extract predicted keypoint locations from heatmaps. Output has shape (#rois, 4, #keypoints) with the 4 rows corresponding to (x, y, logit, prob) for each keypoint. """ # This function converts a discrete image coordinate in a HEATMAP_SIZE x # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain # consistency with keypoints_to_heatmap_labels by using the conversion from # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a # continuous coordinate. offset_x = rois[:, 0] offset_y = rois[:, 1] widths = rois[:, 2] - rois[:, 0] heights = rois[:, 3] - rois[:, 1] widths = np.maximum(widths, 1) heights = np.maximum(heights, 1) widths_ceil = np.ceil(widths) heights_ceil = np.ceil(heights) # NCHW to NHWC for use with OpenCV maps = np.transpose(maps, [0, 2, 3, 1]) min_size = 0 # cfg.KRCNN.INFERENCE_MIN_SIZE num_keypoints = maps.shape[3] xy_preds = np.zeros((len(rois), 3, num_keypoints), dtype=np.float32) end_scores = np.zeros((len(rois), num_keypoints), dtype=np.float32) for i in range(len(rois)): if min_size > 0: roi_map_width = int(np.maximum(widths_ceil[i], min_size)) roi_map_height = int(np.maximum(heights_ceil[i], min_size)) else: roi_map_width = widths_ceil[i] roi_map_height = heights_ceil[i] width_correction = widths[i] / roi_map_width height_correction = heights[i] / roi_map_height roi_map = cv2.resize( maps[i], (roi_map_width, roi_map_height), interpolation=cv2.INTER_CUBIC ) # Bring back to CHW roi_map = np.transpose(roi_map, [2, 0, 1]) # roi_map_probs = scores_to_probs(roi_map.copy()) w = roi_map.shape[2] pos = roi_map.reshape(num_keypoints, -1).argmax(axis=1) x_int = pos % w y_int = (pos - x_int) // w # assert (roi_map_probs[k, y_int, x_int] == # roi_map_probs[k, :, :].max()) x = (x_int + 0.5) * width_correction y = (y_int + 0.5) * height_correction xy_preds[i, 0, :] = x + offset_x[i] xy_preds[i, 1, :] = y + offset_y[i] xy_preds[i, 2, :] = 1 end_scores[i, :] = roi_map[np.arange(num_keypoints), y_int, x_int] return np.transpose(xy_preds, [0, 2, 1]), end_scores from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.keypoint import PersonKeypoints class Keypointer(object): """ Projects a set of masks in an image on the locations specified by the bounding boxes """ def __init__(self, padding=0): self.padding = padding def __call__(self, masks, boxes): # TODO do this properly if isinstance(boxes, BoxList): boxes = [boxes] assert len(boxes) == 1 result, scores = heatmaps_to_keypoints( masks.detach().cpu().numpy(), boxes[0].bbox.cpu().numpy() ) return torch.from_numpy(result).to(masks.device), torch.as_tensor(scores, device=masks.device) def make_roi_keypoint_post_processor(cfg): keypointer = Keypointer() keypoint_post_processor = KeypointPostProcessor(keypointer) return keypoint_post_processor ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/keypoint_head.py 
================================================ import torch from .roi_keypoint_feature_extractors import make_roi_keypoint_feature_extractor from .roi_keypoint_predictors import make_roi_keypoint_predictor from .inference import make_roi_keypoint_post_processor from .loss import make_roi_keypoint_loss_evaluator class ROIKeypointHead(torch.nn.Module): def __init__(self, cfg, in_channels): super(ROIKeypointHead, self).__init__() self.cfg = cfg.clone() self.feature_extractor = make_roi_keypoint_feature_extractor(cfg, in_channels) self.predictor = make_roi_keypoint_predictor( cfg, self.feature_extractor.out_channels) self.post_processor = make_roi_keypoint_post_processor(cfg) self.loss_evaluator = make_roi_keypoint_loss_evaluator(cfg) def forward(self, features, proposals, targets=None): """ Arguments: features (list[Tensor]): feature-maps from possibly several levels proposals (list[BoxList]): proposal boxes targets (list[BoxList], optional): the ground-truth targets. Returns: x (Tensor): the result of the feature extractor proposals (list[BoxList]): during training, the original proposals are returned. During testing, the predicted boxlists are returned with the `mask` field set losses (dict[Tensor]): During training, returns the losses for the head. During testing, returns an empty dict. """ if self.training: with torch.no_grad(): proposals = self.loss_evaluator.subsample(proposals, targets) x = self.feature_extractor(features, proposals) kp_logits = self.predictor(x) if not self.training: result = self.post_processor(kp_logits, proposals) return x, result, {} loss_kp = self.loss_evaluator(proposals, kp_logits) return x, proposals, dict(loss_kp=loss_kp) def build_roi_keypoint_head(cfg, in_channels): return ROIKeypointHead(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/loss.py ================================================ import torch from torch.nn import functional as F from maskrcnn_benchmark.modeling.matcher import Matcher from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import ( BalancedPositiveNegativeSampler, ) from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou from maskrcnn_benchmark.modeling.utils import cat from maskrcnn_benchmark.layers import smooth_l1_loss from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist from maskrcnn_benchmark.structures.keypoint import keypoints_to_heat_map def project_keypoints_to_heatmap(keypoints, proposals, discretization_size): proposals = proposals.convert("xyxy") return keypoints_to_heat_map( keypoints.keypoints, proposals.bbox, discretization_size ) def cat_boxlist_with_keypoints(boxlists): assert all(boxlist.has_field("keypoints") for boxlist in boxlists) kp = [boxlist.get_field("keypoints").keypoints for boxlist in boxlists] kp = cat(kp, 0) fields = boxlists[0].get_fields() fields = [field for field in fields if field != "keypoints"] boxlists = [boxlist.copy_with_fields(fields) for boxlist in boxlists] boxlists = cat_boxlist(boxlists) boxlists.add_field("keypoints", kp) return boxlists def _within_box(points, boxes): """Validate which keypoints are contained inside a given box. 
    points: NxKx2
    boxes: Nx4
    output: NxK
    """
    x_within = (points[..., 0] >= boxes[:, 0, None]) & (
        points[..., 0] <= boxes[:, 2, None]
    )
    y_within = (points[..., 1] >= boxes[:, 1, None]) & (
        points[..., 1] <= boxes[:, 3, None]
    )
    return x_within & y_within


class KeypointRCNNLossComputation(object):
    def __init__(self, proposal_matcher, fg_bg_sampler, discretization_size):
        """
        Arguments:
            proposal_matcher (Matcher)
            fg_bg_sampler (BalancedPositiveNegativeSampler)
            discretization_size (int)
        """
        self.proposal_matcher = proposal_matcher
        self.fg_bg_sampler = fg_bg_sampler
        self.discretization_size = discretization_size

    def match_targets_to_proposals(self, proposal, target):
        match_quality_matrix = boxlist_iou(target, proposal)
        matched_idxs = self.proposal_matcher(match_quality_matrix)
        # Keypoint R-CNN needs the "labels" and "keypoints" fields for
        # creating the targets
        target = target.copy_with_fields(["labels", "keypoints"])
        # get the ground-truth target corresponding to each proposal
        # NB: need to clamp the indices because we can have a single
        # GT in the image, and matched_idxs can be -2, which goes
        # out of bounds
        matched_targets = target[matched_idxs.clamp(min=0)]
        matched_targets.add_field("matched_idxs", matched_idxs)
        return matched_targets

    def prepare_targets(self, proposals, targets):
        labels = []
        keypoints = []
        for proposals_per_image, targets_per_image in zip(proposals, targets):
            matched_targets = self.match_targets_to_proposals(
                proposals_per_image, targets_per_image
            )
            matched_idxs = matched_targets.get_field("matched_idxs")

            labels_per_image = matched_targets.get_field("labels")
            labels_per_image = labels_per_image.to(dtype=torch.int64)

            # this can probably be removed, but is left here for clarity
            # and completeness
            # TODO check if this is the right one, as BELOW_THRESHOLD
            neg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD
            labels_per_image[neg_inds] = 0

            keypoints_per_image = matched_targets.get_field("keypoints")
            within_box = _within_box(
                keypoints_per_image.keypoints, matched_targets.bbox
            )
            vis_kp = keypoints_per_image.keypoints[..., 2] > 0
            is_visible = (within_box & vis_kp).sum(1) > 0

            labels_per_image[~is_visible] = -1

            labels.append(labels_per_image)
            keypoints.append(keypoints_per_image)

        return labels, keypoints

    def subsample(self, proposals, targets):
        """
        This method performs the positive/negative sampling and returns
        the sampled proposals.
        Note: this function keeps a state.

        Arguments:
            proposals (list[BoxList])
            targets (list[BoxList])
        """
        labels, keypoints = self.prepare_targets(proposals, targets)
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)

        proposals = list(proposals)
        # add corresponding label and regression_targets information to the bounding boxes
        for labels_per_image, keypoints_per_image, proposals_per_image in zip(
            labels, keypoints, proposals
        ):
            proposals_per_image.add_field("labels", labels_per_image)
            proposals_per_image.add_field("keypoints", keypoints_per_image)

        # distribute the sampled proposals, that were obtained on all feature maps
        # concatenated via the fg_bg_sampler, into individual feature map levels
        for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
            zip(sampled_pos_inds, sampled_neg_inds)
        ):
            img_sampled_inds = torch.nonzero(pos_inds_img).squeeze(1)
            proposals_per_image = proposals[img_idx][img_sampled_inds]
            proposals[img_idx] = proposals_per_image

        self._proposals = proposals
        return proposals

    def __call__(self, proposals, keypoint_logits):
        heatmaps = []
        valid = []
        for proposals_per_image in proposals:
            kp = proposals_per_image.get_field("keypoints")
            heatmaps_per_image, valid_per_image = project_keypoints_to_heatmap(
                kp, proposals_per_image, self.discretization_size
            )
            heatmaps.append(heatmaps_per_image.view(-1))
            valid.append(valid_per_image.view(-1))

        keypoint_targets = cat(heatmaps, dim=0)
        valid = cat(valid, dim=0).to(dtype=torch.uint8)
        valid = torch.nonzero(valid).squeeze(1)

        # torch.mean (in binary_cross_entropy_with_logits) doesn't
        # accept empty tensors, so handle it separately
        if keypoint_targets.numel() == 0 or len(valid) == 0:
            return keypoint_logits.sum() * 0

        N, K, H, W = keypoint_logits.shape
        keypoint_logits = keypoint_logits.view(N * K, H * W)

        keypoint_loss = F.cross_entropy(keypoint_logits[valid], keypoint_targets[valid])
        return keypoint_loss


def make_roi_keypoint_loss_evaluator(cfg):
    matcher = Matcher(
        cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD,
        cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD,
        allow_low_quality_matches=False,
    )
    fg_bg_sampler = BalancedPositiveNegativeSampler(
        cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
    )
    resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION
    loss_evaluator = KeypointRCNNLossComputation(matcher, fg_bg_sampler, resolution)
    return loss_evaluator


================================================
FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_feature_extractors.py
================================================
from torch import nn
from torch.nn import functional as F

from maskrcnn_benchmark.modeling import registry
from maskrcnn_benchmark.modeling.poolers import Pooler

from maskrcnn_benchmark.layers import Conv2d


@registry.ROI_KEYPOINT_FEATURE_EXTRACTORS.register("KeypointRCNNFeatureExtractor")
class KeypointRCNNFeatureExtractor(nn.Module):
    def __init__(self, cfg, in_channels):
        super(KeypointRCNNFeatureExtractor, self).__init__()

        resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION
        scales = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES
        sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO
        pooler = Pooler(
            output_size=(resolution, resolution),
            scales=scales,
            sampling_ratio=sampling_ratio,
        )
        self.pooler = pooler

        input_features = in_channels
        layers = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS
        next_feature = input_features
        self.blocks = []
        for layer_idx, layer_features in enumerate(layers, 1):
            layer_name = "conv_fcn{}".format(layer_idx)
            module = Conv2d(next_feature, layer_features, 3, stride=1, padding=1)
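            # Kaiming-normal init with fan_out below mirrors the MSRAFill
            # initializer of the original Caffe2 implementation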
nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") nn.init.constant_(module.bias, 0) self.add_module(layer_name, module) next_feature = layer_features self.blocks.append(layer_name) self.out_channels = layer_features def forward(self, x, proposals): x = self.pooler(x, proposals) for layer_name in self.blocks: x = F.relu(getattr(self, layer_name)(x)) return x def make_roi_keypoint_feature_extractor(cfg, in_channels): func = registry.ROI_KEYPOINT_FEATURE_EXTRACTORS[ cfg.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR ] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_predictors.py ================================================ from torch import nn from maskrcnn_benchmark import layers from maskrcnn_benchmark.modeling import registry @registry.ROI_KEYPOINT_PREDICTOR.register("KeypointRCNNPredictor") class KeypointRCNNPredictor(nn.Module): def __init__(self, cfg, in_channels): super(KeypointRCNNPredictor, self).__init__() input_features = in_channels num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES deconv_kernel = 4 self.kps_score_lowres = layers.ConvTranspose2d( input_features, num_keypoints, deconv_kernel, stride=2, padding=deconv_kernel // 2 - 1, ) nn.init.kaiming_normal_( self.kps_score_lowres.weight, mode="fan_out", nonlinearity="relu" ) nn.init.constant_(self.kps_score_lowres.bias, 0) self.up_scale = 2 self.out_channels = num_keypoints def forward(self, x): x = self.kps_score_lowres(x) x = layers.interpolate( x, scale_factor=self.up_scale, mode="bilinear", align_corners=False ) return x def make_roi_keypoint_predictor(cfg, in_channels): func = registry.ROI_KEYPOINT_PREDICTOR[cfg.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import numpy as np import torch from torch import nn from maskrcnn_benchmark.layers.misc import interpolate from maskrcnn_benchmark.structures.bounding_box import BoxList # TODO check if want to return a single BoxList or a composite # object class MaskPostProcessor(nn.Module): """ From the results of the CNN, post process the masks by taking the mask corresponding to the class with max probability (which are of fixed size and directly output by the CNN) and return the masks in the mask field of the BoxList. 
    If a masker object is passed, it will additionally
    project the masks in the image according to the locations in boxes.
    """

    def __init__(self, masker=None):
        super(MaskPostProcessor, self).__init__()
        self.masker = masker

    def forward(self, x, boxes):
        """
        Arguments:
            x (Tensor): the mask logits
            boxes (list[BoxList]): bounding boxes that are used as
                reference, one for each image

        Returns:
            results (list[BoxList]): one BoxList for each image, containing
                the extra field mask
        """
        mask_prob = x.sigmoid()

        # select masks corresponding to the predicted classes
        num_masks = x.shape[0]
        labels = [bbox.get_field("labels") for bbox in boxes]
        labels = torch.cat(labels)
        index = torch.arange(num_masks, device=labels.device)
        mask_prob = mask_prob[index, labels][:, None]

        boxes_per_image = [len(box) for box in boxes]
        mask_prob = mask_prob.split(boxes_per_image, dim=0)

        if self.masker:
            mask_prob = self.masker(mask_prob, boxes)

        results = []
        for prob, box in zip(mask_prob, boxes):
            bbox = BoxList(box.bbox, box.size, mode="xyxy")
            for field in box.fields():
                bbox.add_field(field, box.get_field(field))
            bbox.add_field("mask", prob)
            results.append(bbox)

        return results


class MaskPostProcessorCOCOFormat(MaskPostProcessor):
    """
    From the results of the CNN, post process the results
    so that the masks are pasted in the image, and
    additionally convert the results to COCO format.
    """

    def forward(self, x, boxes):
        import pycocotools.mask as mask_util
        import numpy as np

        results = super(MaskPostProcessorCOCOFormat, self).forward(x, boxes)
        for result in results:
            masks = result.get_field("mask").cpu()
            rles = [
                mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
                for mask in masks
            ]
            for rle in rles:
                rle["counts"] = rle["counts"].decode("utf-8")
            result.add_field("mask", rles)
        return results


# the next two functions should be merged inside Masker
# but are kept here for the moment while we need them
# temporarily for paste_mask_in_image
def expand_boxes(boxes, scale):
    w_half = (boxes[:, 2] - boxes[:, 0]) * .5
    h_half = (boxes[:, 3] - boxes[:, 1]) * .5
    x_c = (boxes[:, 2] + boxes[:, 0]) * .5
    y_c = (boxes[:, 3] + boxes[:, 1]) * .5

    w_half *= scale
    h_half *= scale

    boxes_exp = torch.zeros_like(boxes)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 3] = y_c + h_half
    return boxes_exp


def expand_masks(mask, padding):
    N = mask.shape[0]
    M = mask.shape[-1]
    pad2 = 2 * padding
    scale = float(M + pad2) / M
    padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2))
    padded_mask[:, :, padding:-padding, padding:-padding] = mask
    return padded_mask, scale


def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1):
    padded_mask, scale = expand_masks(mask[None], padding=padding)
    mask = padded_mask[0, 0]
    box = expand_boxes(box[None], scale)[0]
    box = box.to(dtype=torch.int32)

    TO_REMOVE = 1
    w = int(box[2] - box[0] + TO_REMOVE)
    h = int(box[3] - box[1] + TO_REMOVE)
    w = max(w, 1)
    h = max(h, 1)

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, -1, -1))

    # Resize mask
    mask = mask.to(torch.float32)
    mask = interpolate(mask, size=(h, w), mode='bilinear', align_corners=False)
    mask = mask[0][0]

    if thresh >= 0:
        mask = mask > thresh
    else:
        # for visualization and debugging, we also
        # allow it to return an unmodified mask
        mask = (mask * 255).to(torch.uint8)

    im_mask = torch.zeros((im_h, im_w), dtype=torch.uint8)
    x_0 = max(box[0], 0)
    x_1 = min(box[2] + 1, im_w)
    y_0 = max(box[1], 0)
    y_1 = min(box[3] + 1, im_h)

    im_mask[y_0:y_1, x_0:x_1] = mask[
        (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
    ]
    return im_mask


class Masker(object):
    """
    Projects a set of masks in an image on the locations
    specified by the bounding boxes
    """

    def __init__(self, threshold=0.5, padding=1):
        self.threshold = threshold
        self.padding = padding

    def forward_single_image(self, masks, boxes):
        boxes = boxes.convert("xyxy")
        im_w, im_h = boxes.size
        res = [
            paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding)
            for mask, box in zip(masks, boxes.bbox)
        ]
        if len(res) > 0:
            res = torch.stack(res, dim=0)[:, None]
        else:
            res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1]))
        return res

    def __call__(self, masks, boxes):
        if isinstance(boxes, BoxList):
            boxes = [boxes]

        # Run some sanity checks
        assert len(boxes) == len(masks), "Masks and boxes should have the same length."

        # TODO: Is this JIT compatible?
        # If not we should make it compatible.
        results = []
        for mask, box in zip(masks, boxes):
            assert mask.shape[0] == len(box), "Number of objects should be the same."
            result = self.forward_single_image(mask, box)
            results.append(result)
        return results


def make_roi_mask_post_processor(cfg):
    if cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS:
        mask_threshold = cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD
        masker = Masker(threshold=mask_threshold, padding=1)
    else:
        masker = None
    mask_post_processor = MaskPostProcessor(masker)
    return mask_post_processor


================================================
FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import torch
from torch.nn import functional as F

from maskrcnn_benchmark.layers import smooth_l1_loss
from maskrcnn_benchmark.modeling.matcher import Matcher
from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou
from maskrcnn_benchmark.modeling.utils import cat


def project_masks_on_boxes(segmentation_masks, proposals, discretization_size):
    """
    Given segmentation masks and the bounding boxes corresponding
    to the location of the masks in the image, this function
    crops and resizes the masks in the position defined by the
    boxes. This prepares the masks for them to be fed to the
    loss computation as the targets.

    Arguments:
        segmentation_masks: an instance of SegmentationMask
        proposals: an instance of BoxList
    """
    masks = []
    M = discretization_size
    device = proposals.bbox.device
    proposals = proposals.convert("xyxy")
    assert segmentation_masks.size == proposals.size, "{}, {}".format(
        segmentation_masks, proposals
    )

    # FIXME: CPU computation bottleneck, this should be parallelized
    proposals = proposals.bbox.to(torch.device("cpu"))
    for segmentation_mask, proposal in zip(segmentation_masks, proposals):
        # crop the masks, resize them to the desired resolution and
        # then convert them to the tensor representation.
        cropped_mask = segmentation_mask.crop(proposal)
        scaled_mask = cropped_mask.resize((M, M))
        mask = scaled_mask.get_mask_tensor()
        masks.append(mask)
    if len(masks) == 0:
        return torch.empty(0, dtype=torch.float32, device=device)
    return torch.stack(masks, dim=0).to(device, dtype=torch.float32)


class MaskRCNNLossComputation(object):
    def __init__(self, proposal_matcher, discretization_size):
        """
        Arguments:
            proposal_matcher (Matcher)
            discretization_size (int)
        """
        self.proposal_matcher = proposal_matcher
        self.discretization_size = discretization_size

    def match_targets_to_proposals(self, proposal, target):
        match_quality_matrix = boxlist_iou(target, proposal)
        matched_idxs = self.proposal_matcher(match_quality_matrix)
        # Mask R-CNN needs the "labels" and "masks" fields for creating
        # the targets
        target = target.copy_with_fields(["labels", "masks"])
        # get the ground-truth target corresponding to each proposal
        # NB: need to clamp the indices because we can have a single
        # GT in the image, and matched_idxs can be -2, which goes
        # out of bounds
        matched_targets = target[matched_idxs.clamp(min=0)]
        matched_targets.add_field("matched_idxs", matched_idxs)
        return matched_targets

    def prepare_targets(self, proposals, targets):
        labels = []
        masks = []
        for proposals_per_image, targets_per_image in zip(proposals, targets):
            matched_targets = self.match_targets_to_proposals(
                proposals_per_image, targets_per_image
            )
            matched_idxs = matched_targets.get_field("matched_idxs")

            labels_per_image = matched_targets.get_field("labels")
            labels_per_image = labels_per_image.to(dtype=torch.int64)

            # this can probably be removed, but is left here for clarity
            # and completeness
            neg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD
            labels_per_image[neg_inds] = 0

            # mask scores are only computed on positive samples
            positive_inds = torch.nonzero(labels_per_image > 0).squeeze(1)

            segmentation_masks = matched_targets.get_field("masks")
            segmentation_masks = segmentation_masks[positive_inds]

            positive_proposals = proposals_per_image[positive_inds]

            masks_per_image = project_masks_on_boxes(
                segmentation_masks, positive_proposals, self.discretization_size
            )

            labels.append(labels_per_image)
            masks.append(masks_per_image)

        return labels, masks

    def __call__(self, proposals, mask_logits, targets):
        """
        Arguments:
            proposals (list[BoxList])
            mask_logits (Tensor)
            targets (list[BoxList])

        Return:
            mask_loss (Tensor): scalar tensor containing the loss
        """
        labels, mask_targets = self.prepare_targets(proposals, targets)

        labels = cat(labels, dim=0)
        mask_targets = cat(mask_targets, dim=0)

        positive_inds = torch.nonzero(labels > 0).squeeze(1)
        labels_pos = labels[positive_inds]

        # torch.mean (in binary_cross_entropy_with_logits) doesn't
        # accept empty tensors, so handle it separately
        if mask_targets.numel() == 0:
            return mask_logits.sum() * 0

        mask_loss = F.binary_cross_entropy_with_logits(
            mask_logits[positive_inds, labels_pos], mask_targets
        )
        return mask_loss


def make_roi_mask_loss_evaluator(cfg):
    matcher = Matcher(
        cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD,
        cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD,
        allow_low_quality_matches=False,
    )

    loss_evaluator = MaskRCNNLossComputation(
        matcher, cfg.MODEL.ROI_MASK_HEAD.RESOLUTION
    )

    return loss_evaluator


================================================
FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/mask_head.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
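# A minimal sketch (toy shapes and labels, not part of the original file) of
# the per-class mask selection in MaskRCNNLossComputation.__call__ above:
# advanced indexing keeps, for every positive proposal, only the mask map of
# its ground-truth class before the binary cross-entropy is computed.
#
# >>> import torch
# >>> mask_logits = torch.randn(4, 81, 28, 28)  # (#proposals, #classes, M, M)
# >>> labels = torch.tensor([3, 0, 17, 5])      # 0 is background
# >>> positive_inds = torch.nonzero(labels > 0).squeeze(1)
# >>> labels_pos = labels[positive_inds]
# >>> mask_logits[positive_inds, labels_pos].shape
# torch.Size([3, 28, 28])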
import torch from torch import nn from maskrcnn_benchmark.structures.bounding_box import BoxList from .roi_mask_feature_extractors import make_roi_mask_feature_extractor from .roi_mask_predictors import make_roi_mask_predictor from .inference import make_roi_mask_post_processor from .loss import make_roi_mask_loss_evaluator def keep_only_positive_boxes(boxes): """ Given a set of BoxList containing the `labels` field, return a set of BoxList for which `labels > 0`. Arguments: boxes (list of BoxList) """ assert isinstance(boxes, (list, tuple)) assert isinstance(boxes[0], BoxList) assert boxes[0].has_field("labels") positive_boxes = [] positive_inds = [] num_boxes = 0 for boxes_per_image in boxes: labels = boxes_per_image.get_field("labels") inds_mask = labels > 0 inds = inds_mask.nonzero().squeeze(1) positive_boxes.append(boxes_per_image[inds]) positive_inds.append(inds_mask) return positive_boxes, positive_inds class ROIMaskHead(torch.nn.Module): def __init__(self, cfg, in_channels): super(ROIMaskHead, self).__init__() self.cfg = cfg.clone() self.feature_extractor = make_roi_mask_feature_extractor(cfg, in_channels) self.predictor = make_roi_mask_predictor( cfg, self.feature_extractor.out_channels) self.post_processor = make_roi_mask_post_processor(cfg) self.loss_evaluator = make_roi_mask_loss_evaluator(cfg) def forward(self, features, proposals, targets=None): """ Arguments: features (list[Tensor]): feature-maps from possibly several levels proposals (list[BoxList]): proposal boxes targets (list[BoxList], optional): the ground-truth targets. Returns: x (Tensor): the result of the feature extractor proposals (list[BoxList]): during training, the original proposals are returned. During testing, the predicted boxlists are returned with the `mask` field set losses (dict[Tensor]): During training, returns the losses for the head. During testing, returns an empty dict. """ if self.training: # during training, only focus on positive boxes all_proposals = proposals proposals, positive_inds = keep_only_positive_boxes(proposals) if self.training and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: x = features x = x[torch.cat(positive_inds, dim=0)] else: x = self.feature_extractor(features, proposals) mask_logits = self.predictor(x) if not self.training: result = self.post_processor(mask_logits, proposals) return x, result, {} loss_mask = self.loss_evaluator(proposals, mask_logits, targets) return x, all_proposals, dict(loss_mask=loss_mask) def build_roi_mask_head(cfg, in_channels): return ROIMaskHead(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
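# A minimal sketch (hypothetical labels, not part of the original file) of the
# filtering that keep_only_positive_boxes in mask_head.py performs above:
# during training only proposals matched to a foreground class (labels > 0)
# are forwarded to the mask branch.
#
# >>> import torch
# >>> labels = torch.tensor([0, 2, 0, 7, 1])
# >>> inds_mask = labels > 0
# >>> inds_mask.nonzero().squeeze(1)
# tensor([1, 3, 4])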
from torch import nn from torch.nn import functional as F from ..box_head.roi_box_feature_extractors import ResNet50Conv5ROIFeatureExtractor from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.modeling.poolers import Pooler from maskrcnn_benchmark.modeling.make_layers import make_conv3x3 registry.ROI_MASK_FEATURE_EXTRACTORS.register( "ResNet50Conv5ROIFeatureExtractor", ResNet50Conv5ROIFeatureExtractor ) @registry.ROI_MASK_FEATURE_EXTRACTORS.register("MaskRCNNFPNFeatureExtractor") class MaskRCNNFPNFeatureExtractor(nn.Module): """ Heads for FPN for classification """ def __init__(self, cfg, in_channels): """ Arguments: num_classes (int): number of output classes input_size (int): number of channels of the input once it's flattened representation_size (int): size of the intermediate representation """ super(MaskRCNNFPNFeatureExtractor, self).__init__() resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION scales = cfg.MODEL.ROI_MASK_HEAD.POOLER_SCALES sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO pooler = Pooler( output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, ) input_size = in_channels self.pooler = pooler use_gn = cfg.MODEL.ROI_MASK_HEAD.USE_GN layers = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS dilation = cfg.MODEL.ROI_MASK_HEAD.DILATION next_feature = input_size self.blocks = [] for layer_idx, layer_features in enumerate(layers, 1): layer_name = "mask_fcn{}".format(layer_idx) module = make_conv3x3( next_feature, layer_features, dilation=dilation, stride=1, use_gn=use_gn ) self.add_module(layer_name, module) next_feature = layer_features self.blocks.append(layer_name) self.out_channels = layer_features def forward(self, x, proposals): x = self.pooler(x, proposals) for layer_name in self.blocks: x = F.relu(getattr(self, layer_name)(x)) return x def make_roi_mask_feature_extractor(cfg, in_channels): func = registry.ROI_MASK_FEATURE_EXTRACTORS[ cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR ] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_predictors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
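# A rough, self-contained equivalent (assuming CONV_LAYERS = (256, 256, 256,
# 256), USE_GN = False and DILATION = 1; the real builder goes through
# make_conv3x3) of the mask_fcn1..mask_fcn4 stack registered by
# MaskRCNNFPNFeatureExtractor above; the 3x3, padding-1 convs keep the pooled
# resolution unchanged.
#
# >>> import torch
# >>> from torch import nn
# >>> convs = nn.Sequential(
# ...     *[nn.Sequential(nn.Conv2d(256, 256, 3, padding=1), nn.ReLU())
# ...       for _ in range(4)]
# ... )
# >>> convs(torch.randn(2, 256, 14, 14)).shape
# torch.Size([2, 256, 14, 14])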
from torch import nn from torch.nn import functional as F from maskrcnn_benchmark.layers import Conv2d from maskrcnn_benchmark.layers import ConvTranspose2d from maskrcnn_benchmark.modeling import registry @registry.ROI_MASK_PREDICTOR.register("MaskRCNNC4Predictor") class MaskRCNNC4Predictor(nn.Module): def __init__(self, cfg, in_channels): super(MaskRCNNC4Predictor, self).__init__() num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES dim_reduced = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS[-1] num_inputs = in_channels self.conv5_mask = ConvTranspose2d(num_inputs, dim_reduced, 2, 2, 0) self.mask_fcn_logits = Conv2d(dim_reduced, num_classes, 1, 1, 0) for name, param in self.named_parameters(): if "bias" in name: nn.init.constant_(param, 0) elif "weight" in name: # Caffe2 implementation uses MSRAFill, which in fact # corresponds to kaiming_normal_ in PyTorch nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") def forward(self, x): x = F.relu(self.conv5_mask(x)) return self.mask_fcn_logits(x) @registry.ROI_MASK_PREDICTOR.register("MaskRCNNConv1x1Predictor") class MaskRCNNConv1x1Predictor(nn.Module): def __init__(self, cfg, in_channels): super(MaskRCNNConv1x1Predictor, self).__init__() num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES num_inputs = in_channels self.mask_fcn_logits = Conv2d(num_inputs, num_classes, 1, 1, 0) for name, param in self.named_parameters(): if "bias" in name: nn.init.constant_(param, 0) elif "weight" in name: # Caffe2 implementation uses MSRAFill, which in fact # corresponds to kaiming_normal_ in PyTorch nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") def forward(self, x): return self.mask_fcn_logits(x) def make_roi_mask_predictor(cfg, in_channels): func = registry.ROI_MASK_PREDICTOR[cfg.MODEL.ROI_MASK_HEAD.PREDICTOR] return func(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/roi_heads/roi_heads.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from .box_head.box_head import build_roi_box_head from .mask_head.mask_head import build_roi_mask_head from .keypoint_head.keypoint_head import build_roi_keypoint_head class CombinedROIHeads(torch.nn.ModuleDict): """ Combines a set of individual heads (for box prediction or masks) into a single head. 
""" def __init__(self, cfg, heads): super(CombinedROIHeads, self).__init__(heads) self.cfg = cfg.clone() if cfg.MODEL.MASK_ON and cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: self.mask.feature_extractor = self.box.feature_extractor if cfg.MODEL.KEYPOINT_ON and cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: self.keypoint.feature_extractor = self.box.feature_extractor def forward(self, features, proposals, targets=None): losses = {} # TODO rename x to roi_box_features, if it doesn't increase memory consumption x, detections, loss_box = self.box(features, proposals, targets) losses.update(loss_box) if self.cfg.MODEL.MASK_ON: mask_features = features # optimization: during training, if we share the feature extractor between # the box and the mask heads, then we can reuse the features already computed if ( self.training and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR ): mask_features = x # During training, self.box() will return the unaltered proposals as "detections" # this makes the API consistent during training and testing x, detections, loss_mask = self.mask(mask_features, detections, targets) losses.update(loss_mask) if self.cfg.MODEL.KEYPOINT_ON: keypoint_features = features # optimization: during training, if we share the feature extractor between # the box and the mask heads, then we can reuse the features already computed if ( self.training and self.cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR ): keypoint_features = x # During training, self.box() will return the unaltered proposals as "detections" # this makes the API consistent during training and testing x, detections, loss_keypoint = self.keypoint(keypoint_features, detections, targets) losses.update(loss_keypoint) return x, detections, losses def build_roi_heads(cfg, in_channels): # individually create the heads, that will be combined together # afterwards roi_heads = [] if cfg.MODEL.RETINANET_ON: return [] if not cfg.MODEL.RPN_ONLY: roi_heads.append(("box", build_roi_box_head(cfg, in_channels))) if cfg.MODEL.MASK_ON: roi_heads.append(("mask", build_roi_mask_head(cfg, in_channels))) if cfg.MODEL.KEYPOINT_ON: roi_heads.append(("keypoint", build_roi_keypoint_head(cfg, in_channels))) # combine individual heads in a single module if roi_heads: roi_heads = CombinedROIHeads(cfg, roi_heads) return roi_heads ================================================ FILE: maskrcnn_benchmark/modeling/rpn/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # from .rpn import build_rpn ================================================ FILE: maskrcnn_benchmark/modeling/rpn/anchor_generator.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
import math import numpy as np import torch from torch import nn from maskrcnn_benchmark.structures.bounding_box import BoxList class BufferList(nn.Module): """ Similar to nn.ParameterList, but for buffers """ def __init__(self, buffers=None): super(BufferList, self).__init__() if buffers is not None: self.extend(buffers) def extend(self, buffers): offset = len(self) for i, buffer in enumerate(buffers): self.register_buffer(str(offset + i), buffer) return self def __len__(self): return len(self._buffers) def __iter__(self): return iter(self._buffers.values()) class AnchorGenerator(nn.Module): """ For a set of image sizes and feature maps, computes a set of anchors """ def __init__( self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0), anchor_strides=(8, 16, 32), straddle_thresh=0, ): super(AnchorGenerator, self).__init__() if len(anchor_strides) == 1: anchor_stride = anchor_strides[0] cell_anchors = [ generate_anchors(anchor_stride, sizes, aspect_ratios).float() ] else: if len(anchor_strides) != len(sizes): raise RuntimeError("FPN should have #anchor_strides == #sizes") cell_anchors = [ generate_anchors( anchor_stride, size if isinstance(size, (tuple, list)) else (size,), aspect_ratios ).float() for anchor_stride, size in zip(anchor_strides, sizes) ] self.strides = anchor_strides self.cell_anchors = BufferList(cell_anchors) self.straddle_thresh = straddle_thresh def num_anchors_per_location(self): return [len(cell_anchors) for cell_anchors in self.cell_anchors] def grid_anchors(self, grid_sizes): anchors = [] for size, stride, base_anchors in zip( grid_sizes, self.strides, self.cell_anchors ): grid_height, grid_width = size device = base_anchors.device shifts_x = torch.arange( 0, grid_width * stride, step=stride, dtype=torch.float32, device=device ) shifts_y = torch.arange( 0, grid_height * stride, step=stride, dtype=torch.float32, device=device ) shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) shift_x = shift_x.reshape(-1) shift_y = shift_y.reshape(-1) shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) anchors.append( (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4) ) return anchors def add_visibility_to(self, boxlist): image_width, image_height = boxlist.size anchors = boxlist.bbox if self.straddle_thresh >= 0: inds_inside = ( (anchors[..., 0] >= -self.straddle_thresh) & (anchors[..., 1] >= -self.straddle_thresh) & (anchors[..., 2] < image_width + self.straddle_thresh) & (anchors[..., 3] < image_height + self.straddle_thresh) ) else: device = anchors.device inds_inside = torch.ones(anchors.shape[0], dtype=torch.uint8, device=device) boxlist.add_field("visibility", inds_inside) def forward(self, image_list, feature_maps): grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] anchors_over_all_feature_maps = self.grid_anchors(grid_sizes) anchors = [] for i, (image_height, image_width) in enumerate(image_list.image_sizes): anchors_in_image = [] for anchors_per_feature_map in anchors_over_all_feature_maps: boxlist = BoxList( anchors_per_feature_map, (image_width, image_height), mode="xyxy" ) self.add_visibility_to(boxlist) anchors_in_image.append(boxlist) anchors.append(anchors_in_image) return anchors def make_anchor_generator(config): anchor_sizes = config.MODEL.RPN.ANCHOR_SIZES aspect_ratios = config.MODEL.RPN.ASPECT_RATIOS anchor_stride = config.MODEL.RPN.ANCHOR_STRIDE straddle_thresh = config.MODEL.RPN.STRADDLE_THRESH if config.MODEL.RPN.USE_FPN: assert len(anchor_stride) == len( anchor_sizes ), "FPN should have 
len(ANCHOR_STRIDE) == len(ANCHOR_SIZES)" else: assert len(anchor_stride) == 1, "Non-FPN should have a single ANCHOR_STRIDE" anchor_generator = AnchorGenerator( anchor_sizes, aspect_ratios, anchor_stride, straddle_thresh ) return anchor_generator def make_anchor_generator_retinanet(config): anchor_sizes = config.MODEL.RETINANET.ANCHOR_SIZES aspect_ratios = config.MODEL.RETINANET.ASPECT_RATIOS anchor_strides = config.MODEL.RETINANET.ANCHOR_STRIDES straddle_thresh = config.MODEL.RETINANET.STRADDLE_THRESH octave = config.MODEL.RETINANET.OCTAVE scales_per_octave = config.MODEL.RETINANET.SCALES_PER_OCTAVE assert len(anchor_strides) == len(anchor_sizes), "Only support FPN now" new_anchor_sizes = [] for size in anchor_sizes: per_layer_anchor_sizes = [] for scale_per_octave in range(scales_per_octave): octave_scale = octave ** (scale_per_octave / float(scales_per_octave)) per_layer_anchor_sizes.append(octave_scale * size) new_anchor_sizes.append(tuple(per_layer_anchor_sizes)) anchor_generator = AnchorGenerator( tuple(new_anchor_sizes), aspect_ratios, anchor_strides, straddle_thresh ) return anchor_generator # Copyright (c) 2017-present, Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ############################################################################## # # Based on: # -------------------------------------------------------- # Faster R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick and Sean Bell # -------------------------------------------------------- # Verify that we compute the same anchors as Shaoqing's matlab implementation: # # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat # >> anchors # # anchors = # # -83 -39 100 56 # -175 -87 192 104 # -359 -183 376 200 # -55 -55 72 72 # -119 -119 136 136 # -247 -247 264 264 # -35 -79 52 96 # -79 -167 96 184 # -167 -343 184 360 # array([[ -83., -39., 100., 56.], # [-175., -87., 192., 104.], # [-359., -183., 376., 200.], # [ -55., -55., 72., 72.], # [-119., -119., 136., 136.], # [-247., -247., 264., 264.], # [ -35., -79., 52., 96.], # [ -79., -167., 96., 184.], # [-167., -343., 184., 360.]]) def generate_anchors( stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2) ): """Generates a matrix of anchor boxes in (x1, y1, x2, y2) format. Anchors are centered on stride / 2, have (approximate) sqrt areas of the specified sizes, and aspect ratios as given. """ return _generate_anchors( stride, np.array(sizes, dtype=np.float) / stride, np.array(aspect_ratios, dtype=np.float), ) def _generate_anchors(base_size, scales, aspect_ratios): """Generate anchor (reference) windows by enumerating aspect ratios X scales wrt a reference (0, 0, base_size - 1, base_size - 1) window. 
""" anchor = np.array([1, 1, base_size, base_size], dtype=np.float) - 1 anchors = _ratio_enum(anchor, aspect_ratios) anchors = np.vstack( [_scale_enum(anchors[i, :], scales) for i in range(anchors.shape[0])] ) return torch.from_numpy(anchors) def _whctrs(anchor): """Return width, height, x center, and y center for an anchor (window).""" w = anchor[2] - anchor[0] + 1 h = anchor[3] - anchor[1] + 1 x_ctr = anchor[0] + 0.5 * (w - 1) y_ctr = anchor[1] + 0.5 * (h - 1) return w, h, x_ctr, y_ctr def _mkanchors(ws, hs, x_ctr, y_ctr): """Given a vector of widths (ws) and heights (hs) around a center (x_ctr, y_ctr), output a set of anchors (windows). """ ws = ws[:, np.newaxis] hs = hs[:, np.newaxis] anchors = np.hstack( ( x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1), ) ) return anchors def _ratio_enum(anchor, ratios): """Enumerate a set of anchors for each aspect ratio wrt an anchor.""" w, h, x_ctr, y_ctr = _whctrs(anchor) size = w * h size_ratios = size / ratios ws = np.round(np.sqrt(size_ratios)) hs = np.round(ws * ratios) anchors = _mkanchors(ws, hs, x_ctr, y_ctr) return anchors def _scale_enum(anchor, scales): """Enumerate a set of anchors for each scale wrt an anchor.""" w, h, x_ctr, y_ctr = _whctrs(anchor) ws = w * scales hs = h * scales anchors = _mkanchors(ws, hs, x_ctr, y_ctr) return anchors ================================================ FILE: maskrcnn_benchmark/modeling/rpn/fcos/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/rpn/fcos/fcos.py ================================================ import math import torch import torch.nn.functional as F from torch import nn from .inference import make_fcos_postprocessor from .loss import make_fcos_loss_evaluator from maskrcnn_benchmark.layers import Scale class FCOSHead(torch.nn.Module): def __init__(self, cfg, in_channels): """ Arguments: in_channels (int): number of channels of the input feature """ super(FCOSHead, self).__init__() # TODO: Implement the sigmoid version first. 
num_classes = cfg.MODEL.FCOS.NUM_CLASSES - 1 cls_tower = [] bbox_tower = [] for i in range(cfg.MODEL.FCOS.NUM_CONVS): cls_tower.append( nn.Conv2d( in_channels, in_channels, kernel_size=3, stride=1, padding=1 ) ) cls_tower.append(nn.GroupNorm(32, in_channels)) cls_tower.append(nn.ReLU()) bbox_tower.append( nn.Conv2d( in_channels, in_channels, kernel_size=3, stride=1, padding=1 ) ) bbox_tower.append(nn.GroupNorm(32, in_channels)) bbox_tower.append(nn.ReLU()) self.add_module('cls_tower', nn.Sequential(*cls_tower)) self.add_module('bbox_tower', nn.Sequential(*bbox_tower)) self.dense_points = cfg.MODEL.FCOS.DENSE_POINTS self.cls_logits = nn.Conv2d( in_channels, num_classes * self.dense_points, kernel_size=3, stride=1, padding=1 ) self.bbox_pred = nn.Conv2d( in_channels, 4 * self.dense_points, kernel_size=3, stride=1, padding=1 ) self.centerness = nn.Conv2d( in_channels, 1 * self.dense_points, kernel_size=3, stride=1, padding=1 ) # initialization for modules in [self.cls_tower, self.bbox_tower, self.cls_logits, self.bbox_pred, self.centerness]: for l in modules.modules(): if isinstance(l, nn.Conv2d): torch.nn.init.normal_(l.weight, std=0.01) torch.nn.init.constant_(l.bias, 0) # initialize the bias for focal loss prior_prob = cfg.MODEL.FCOS.PRIOR_PROB bias_value = -math.log((1 - prior_prob) / prior_prob) torch.nn.init.constant_(self.cls_logits.bias, bias_value) self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(5)]) def forward(self, x): logits = [] bbox_reg = [] centerness = [] for l, feature in enumerate(x): cls_tower = self.cls_tower(feature) logits.append(self.cls_logits(cls_tower)) centerness.append(self.centerness(cls_tower)) bbox_reg.append(torch.exp(self.scales[l]( self.bbox_pred(self.bbox_tower(feature)) ))) return logits, bbox_reg, centerness class FCOSModule(torch.nn.Module): """ Module for FCOS computation. Takes feature maps from the backbone and FCOS outputs and losses. Only Test on FPN now. """ def __init__(self, cfg, in_channels): super(FCOSModule, self).__init__() head = FCOSHead(cfg, in_channels) box_selector_test = make_fcos_postprocessor(cfg) loss_evaluator = make_fcos_loss_evaluator(cfg) self.head = head self.box_selector_test = box_selector_test self.loss_evaluator = loss_evaluator self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES self.dense_points = cfg.MODEL.FCOS.DENSE_POINTS def forward(self, images, features, targets=None): """ Arguments: images (ImageList): images for which we want to compute the predictions features (list[Tensor]): features computed from the images that are used for computing the predictions. Each tensor in the list correspond to different feature levels targets (list[BoxList): ground-truth boxes present in the image (optional) Returns: boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per image. losses (dict[Tensor]): the losses for the model during training. During testing, it is an empty dict. 
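        Note:
            during training the returned boxes are None: unlike the RPN,
            FCOS is a single-stage detector and does not feed proposals to
            a second stage (see _forward_train below).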
""" box_cls, box_regression, centerness = self.head(features) locations = self.compute_locations(features) if self.training: return self._forward_train( locations, box_cls, box_regression, centerness, targets ) else: return self._forward_test( locations, box_cls, box_regression, centerness, images.image_sizes ) def _forward_train(self, locations, box_cls, box_regression, centerness, targets): loss_box_cls, loss_box_reg, loss_centerness = self.loss_evaluator( locations, box_cls, box_regression, centerness, targets ) losses = { "loss_cls": loss_box_cls, "loss_reg": loss_box_reg, "loss_centerness": loss_centerness } return None, losses def _forward_test(self, locations, box_cls, box_regression, centerness, image_sizes): boxes = self.box_selector_test( locations, box_cls, box_regression, centerness, image_sizes ) return boxes, {} def compute_locations(self, features): locations = [] for level, feature in enumerate(features): h, w = feature.size()[-2:] locations_per_level = self.compute_locations_per_level( h, w, self.fpn_strides[level], feature.device ) locations.append(locations_per_level) return locations def compute_locations_per_level(self, h, w, stride, device): shifts_x = torch.arange( 0, w * stride, step=stride, dtype=torch.float32, device=device ) shifts_y = torch.arange( 0, h * stride, step=stride, dtype=torch.float32, device=device ) shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) shift_x = shift_x.reshape(-1) shift_y = shift_y.reshape(-1) locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2 locations = self.get_dense_locations(locations, stride, device) return locations def get_dense_locations(self, locations, stride, device): if self.dense_points <= 1: return locations center = 0 step = stride // 4 l_t = [center - step, center - step] r_t = [center + step, center - step] l_b = [center - step, center + step] r_b = [center + step, center + step] if self.dense_points == 4: points = torch.cuda.FloatTensor([l_t, r_t, l_b, r_b], device=device) elif self.dense_points == 5: points = torch.cuda.FloatTensor([l_t, r_t, [center, center], l_b, r_b], device=device) else: print("dense points only support 1, 4, 5") points.reshape(1, -1, 2) locations = locations.reshape(-1, 1, 2).to(points) dense_locations = points + locations dense_locations = dense_locations.view(-1, 2) return dense_locations def build_fcos(cfg, in_channels): return FCOSModule(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/rpn/fcos/inference.py ================================================ import torch from ..inference import RPNPostProcessor from ..utils import permute_and_flatten from maskrcnn_benchmark.modeling.box_coder import BoxCoder from maskrcnn_benchmark.modeling.utils import cat from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes class FCOSPostProcessor(torch.nn.Module): """ Performs post-processing on the outputs of the RetinaNet boxes. This is only used in the testing. 
""" def __init__(self, pre_nms_thresh, pre_nms_top_n, nms_thresh, fpn_post_nms_top_n, min_size, num_classes, dense_points): """ Arguments: pre_nms_thresh (float) pre_nms_top_n (int) nms_thresh (float) fpn_post_nms_top_n (int) min_size (int) num_classes (int) box_coder (BoxCoder) """ super(FCOSPostProcessor, self).__init__() self.pre_nms_thresh = pre_nms_thresh self.pre_nms_top_n = pre_nms_top_n self.nms_thresh = nms_thresh self.fpn_post_nms_top_n = fpn_post_nms_top_n self.min_size = min_size self.num_classes = num_classes self.dense_points = dense_points def forward_for_single_feature_map( self, locations, box_cls, box_regression, centerness, image_sizes): """ Arguments: anchors: list[BoxList] box_cls: tensor of size N, A * C, H, W box_regression: tensor of size N, A * 4, H, W """ N, C, H, W = box_cls.shape # put in the same format as locations box_cls = box_cls.view(N, C, H, W).permute(0, 2, 3, 1) box_cls = box_cls.reshape(N, -1, self.num_classes - 1).sigmoid() box_regression = box_regression.view(N, self.dense_points * 4, H, W).permute(0, 2, 3, 1) box_regression = box_regression.reshape(N, -1, 4) centerness = centerness.view(N, self.dense_points, H, W).permute(0, 2, 3, 1) centerness = centerness.reshape(N, -1).sigmoid() candidate_inds = box_cls > self.pre_nms_thresh pre_nms_top_n = candidate_inds.view(N, -1).sum(1) pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) # multiply the classification scores with centerness scores box_cls = box_cls * centerness[:, :, None] results = [] for i in range(N): per_box_cls = box_cls[i] per_candidate_inds = candidate_inds[i] per_box_cls = per_box_cls[per_candidate_inds] per_candidate_nonzeros = per_candidate_inds.nonzero() per_box_loc = per_candidate_nonzeros[:, 0] per_class = per_candidate_nonzeros[:, 1] + 1 per_box_regression = box_regression[i] per_box_regression = per_box_regression[per_box_loc] per_locations = locations[per_box_loc] per_pre_nms_top_n = pre_nms_top_n[i] if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): per_box_cls, top_k_indices = \ per_box_cls.topk(per_pre_nms_top_n, sorted=False) per_class = per_class[top_k_indices] per_box_regression = per_box_regression[top_k_indices] per_locations = per_locations[top_k_indices] detections = torch.stack([ per_locations[:, 0] - per_box_regression[:, 0], per_locations[:, 1] - per_box_regression[:, 1], per_locations[:, 0] + per_box_regression[:, 2], per_locations[:, 1] + per_box_regression[:, 3], ], dim=1) h, w = image_sizes[i] boxlist = BoxList(detections, (int(w), int(h)), mode="xyxy") boxlist.add_field("labels", per_class) boxlist.add_field("scores", per_box_cls) boxlist = boxlist.clip_to_image(remove_empty=False) boxlist = remove_small_boxes(boxlist, self.min_size) results.append(boxlist) return results def forward(self, locations, box_cls, box_regression, centerness, image_sizes): """ Arguments: anchors: list[list[BoxList]] box_cls: list[tensor] box_regression: list[tensor] image_sizes: list[(h, w)] Returns: boxlists (list[BoxList]): the post-processed anchors, after applying box decoding and NMS """ sampled_boxes = [] for _, (l, o, b, c) in enumerate(zip(locations, box_cls, box_regression, centerness)): sampled_boxes.append( self.forward_for_single_feature_map( l, o, b, c, image_sizes ) ) boxlists = list(zip(*sampled_boxes)) boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] boxlists = self.select_over_all_levels(boxlists) return boxlists # TODO very similar to filter_results from PostProcessor # but filter_results is per image # TODO Yang: solve this issue in 
the future. No good solution # right now. def select_over_all_levels(self, boxlists): num_images = len(boxlists) results = [] for i in range(num_images): scores = boxlists[i].get_field("scores") labels = boxlists[i].get_field("labels") boxes = boxlists[i].bbox boxlist = boxlists[i] result = [] # skip the background for j in range(1, self.num_classes): inds = (labels == j).nonzero().view(-1) scores_j = scores[inds] boxes_j = boxes[inds, :].view(-1, 4) boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") boxlist_for_class.add_field("scores", scores_j) boxlist_for_class = boxlist_nms( boxlist_for_class, self.nms_thresh, score_field="scores" ) num_labels = len(boxlist_for_class) boxlist_for_class.add_field( "labels", torch.full((num_labels,), j, dtype=torch.int64, device=scores.device) ) result.append(boxlist_for_class) result = cat_boxlist(result) number_of_detections = len(result) # Limit to max_per_image detections **over all classes** if number_of_detections > self.fpn_post_nms_top_n > 0: cls_scores = result.get_field("scores") image_thresh, _ = torch.kthvalue( cls_scores.cpu(), number_of_detections - self.fpn_post_nms_top_n + 1 ) keep = cls_scores >= image_thresh.item() keep = torch.nonzero(keep).squeeze(1) result = result[keep] results.append(result) return results def make_fcos_postprocessor(config): pre_nms_thresh = config.MODEL.FCOS.INFERENCE_TH pre_nms_top_n = config.MODEL.FCOS.PRE_NMS_TOP_N nms_thresh = config.MODEL.FCOS.NMS_TH fpn_post_nms_top_n = config.TEST.DETECTIONS_PER_IMG dense_points = config.MODEL.FCOS.DENSE_POINTS box_selector = FCOSPostProcessor( pre_nms_thresh=pre_nms_thresh, pre_nms_top_n=pre_nms_top_n, nms_thresh=nms_thresh, fpn_post_nms_top_n=fpn_post_nms_top_n, min_size=0, num_classes=config.MODEL.FCOS.NUM_CLASSES, dense_points=dense_points) return box_selector ================================================ FILE: maskrcnn_benchmark/modeling/rpn/fcos/loss.py ================================================ """ This file contains specific functions for computing losses of FCOS file """ import torch from torch.nn import functional as F from torch import nn from ..utils import concat_box_prediction_layers from maskrcnn_benchmark.layers import IOULoss from maskrcnn_benchmark.layers import SigmoidFocalLoss from maskrcnn_benchmark.modeling.matcher import Matcher from maskrcnn_benchmark.modeling.utils import cat from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist INF = 100000000 class FCOSLossComputation(object): """ This class computes the FCOS losses. 
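    For a location (x, y) matched to a ground-truth box (x0, y0, x1, y1),
    the regression target is (l, t, r, b) = (x - x0, y - y0, x1 - x, y1 - y)
    and the centerness target is
    sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b))),
    as computed in compute_targets_for_locations and
    compute_centerness_targets below.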
""" def __init__(self, cfg): self.cls_loss_func = SigmoidFocalLoss( cfg.MODEL.FCOS.LOSS_GAMMA, cfg.MODEL.FCOS.LOSS_ALPHA ) self.center_sample = cfg.MODEL.FCOS.CENTER_SAMPLE self.strides = cfg.MODEL.FCOS.FPN_STRIDES self.radius = cfg.MODEL.FCOS.POS_RADIUS self.loc_loss_type = cfg.MODEL.FCOS.LOC_LOSS_TYPE # we make use of IOU Loss for bounding boxes regression, # but we found that L1 in log scale can yield a similar performance self.box_reg_loss_func = IOULoss(self.loc_loss_type) self.centerness_loss_func = nn.BCEWithLogitsLoss() self.dense_points = cfg.MODEL.FCOS.DENSE_POINTS def get_sample_region(self, gt, strides, num_points_per, gt_xs, gt_ys, radius=1): num_gts = gt.shape[0] K = len(gt_xs) gt = gt[None].expand(K, num_gts, 4) center_x = (gt[..., 0] + gt[..., 2]) / 2 center_y = (gt[..., 1] + gt[..., 3]) / 2 center_gt = gt.new_zeros(gt.shape) # no gt if center_x[..., 0].sum() == 0: return gt_xs.new_zeros(gt_xs.shape, dtype=torch.uint8) beg = 0 for level, n_p in enumerate(num_points_per): end = beg + n_p stride = strides[level] * radius xmin = center_x[beg:end] - stride ymin = center_y[beg:end] - stride xmax = center_x[beg:end] + stride ymax = center_y[beg:end] + stride # limit sample region in gt center_gt[beg:end, :, 0] = torch.where(xmin > gt[beg:end, :, 0], xmin, gt[beg:end, :, 0]) center_gt[beg:end, :, 1] = torch.where(ymin > gt[beg:end, :, 1], ymin, gt[beg:end, :, 1]) center_gt[beg:end, :, 2] = torch.where(xmax > gt[beg:end, :, 2], gt[beg:end, :, 2], xmax) center_gt[beg:end, :, 3] = torch.where(ymax > gt[beg:end, :, 3], gt[beg:end, :, 3], ymax) beg = end left = gt_xs[:, None] - center_gt[..., 0] right = center_gt[..., 2] - gt_xs[:, None] top = gt_ys[:, None] - center_gt[..., 1] bottom = center_gt[..., 3] - gt_ys[:, None] center_bbox = torch.stack((left, top, right, bottom), -1) inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 return inside_gt_bbox_mask def prepare_targets(self, points, targets): object_sizes_of_interest = [ [-1, 64], [64, 128], [128, 256], [256, 512], [512, INF], ] expanded_object_sizes_of_interest = [] for l, points_per_level in enumerate(points): object_sizes_of_interest_per_level = \ points_per_level.new_tensor(object_sizes_of_interest[l]) expanded_object_sizes_of_interest.append( object_sizes_of_interest_per_level[None].expand(len(points_per_level), -1) ) expanded_object_sizes_of_interest = torch.cat(expanded_object_sizes_of_interest, dim=0) num_points_per_level = [len(points_per_level) for points_per_level in points] self.num_points_per_level = num_points_per_level points_all_level = torch.cat(points, dim=0) labels, reg_targets = self.compute_targets_for_locations( points_all_level, targets, expanded_object_sizes_of_interest ) for i in range(len(labels)): labels[i] = torch.split(labels[i], num_points_per_level, dim=0) reg_targets[i] = torch.split(reg_targets[i], num_points_per_level, dim=0) labels_level_first = [] reg_targets_level_first = [] for level in range(len(points)): labels_level_first.append( torch.cat([labels_per_im[level] for labels_per_im in labels], dim=0) ) reg_targets_level_first.append( torch.cat([reg_targets_per_im[level] for reg_targets_per_im in reg_targets], dim=0) ) return labels_level_first, reg_targets_level_first def compute_targets_for_locations(self, locations, targets, object_sizes_of_interest): labels = [] reg_targets = [] xs, ys = locations[:, 0], locations[:, 1] for im_i in range(len(targets)): targets_per_im = targets[im_i] assert targets_per_im.mode == "xyxy" bboxes = targets_per_im.bbox labels_per_im = 
targets_per_im.get_field("labels") area = targets_per_im.area() l = xs[:, None] - bboxes[:, 0][None] t = ys[:, None] - bboxes[:, 1][None] r = bboxes[:, 2][None] - xs[:, None] b = bboxes[:, 3][None] - ys[:, None] reg_targets_per_im = torch.stack([l, t, r, b], dim=2) if self.center_sample: is_in_boxes = self.get_sample_region( bboxes, self.strides, self.num_points_per_level, xs, ys, radius=self.radius) else: is_in_boxes = reg_targets_per_im.min(dim=2)[0] > 0 max_reg_targets_per_im = reg_targets_per_im.max(dim=2)[0] # limit the regression range for each location is_cared_in_the_level = \ (max_reg_targets_per_im >= object_sizes_of_interest[:, [0]]) & \ (max_reg_targets_per_im <= object_sizes_of_interest[:, [1]]) locations_to_gt_area = area[None].repeat(len(locations), 1) locations_to_gt_area[is_in_boxes == 0] = INF locations_to_gt_area[is_cared_in_the_level == 0] = INF # if there are still more than one objects for a location, # we choose the one with minimal area locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1) reg_targets_per_im = reg_targets_per_im[range(len(locations)), locations_to_gt_inds] labels_per_im = labels_per_im[locations_to_gt_inds] labels_per_im[locations_to_min_area == INF] = 0 labels.append(labels_per_im) reg_targets.append(reg_targets_per_im) return labels, reg_targets def compute_centerness_targets(self, reg_targets): left_right = reg_targets[:, [0, 2]] top_bottom = reg_targets[:, [1, 3]] centerness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \ (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]) return torch.sqrt(centerness) def __call__(self, locations, box_cls, box_regression, centerness, targets): """ Arguments: locations (list[BoxList]) box_cls (list[Tensor]) box_regression (list[Tensor]) centerness (list[Tensor]) targets (list[BoxList]) Returns: cls_loss (Tensor) reg_loss (Tensor) centerness_loss (Tensor) """ N = box_cls[0].size(0) num_classes = box_cls[0].size(1) // self.dense_points labels, reg_targets = self.prepare_targets(locations, targets) box_cls_flatten = [] box_regression_flatten = [] centerness_flatten = [] labels_flatten = [] reg_targets_flatten = [] for l in range(len(labels)): box_cls_flatten.append(box_cls[l].permute(0, 2, 3, 1).reshape(-1, num_classes)) box_regression_flatten.append(box_regression[l].permute(0, 2, 3, 1).reshape(-1, 4)) labels_flatten.append(labels[l].reshape(-1)) reg_targets_flatten.append(reg_targets[l].reshape(-1, 4)) centerness_flatten.append(centerness[l].permute(0, 2, 3, 1).reshape(-1)) box_cls_flatten = torch.cat(box_cls_flatten, dim=0) box_regression_flatten = torch.cat(box_regression_flatten, dim=0) centerness_flatten = torch.cat(centerness_flatten, dim=0) labels_flatten = torch.cat(labels_flatten, dim=0) reg_targets_flatten = torch.cat(reg_targets_flatten, dim=0) pos_inds = torch.nonzero(labels_flatten > 0).squeeze(1) cls_loss = self.cls_loss_func( box_cls_flatten, labels_flatten.int() ) / (pos_inds.numel() + N) # add N to avoid dividing by a zero box_regression_flatten = box_regression_flatten[pos_inds] reg_targets_flatten = reg_targets_flatten[pos_inds] centerness_flatten = centerness_flatten[pos_inds] if pos_inds.numel() > 0: centerness_targets = self.compute_centerness_targets(reg_targets_flatten) reg_loss = self.box_reg_loss_func( box_regression_flatten, reg_targets_flatten, centerness_targets, ) centerness_loss = self.centerness_loss_func( centerness_flatten, centerness_targets ) else: reg_loss = box_regression_flatten.sum() centerness_loss = centerness_flatten.sum() return 
cls_loss, reg_loss, centerness_loss def make_fcos_loss_evaluator(cfg): loss_evaluator = FCOSLossComputation(cfg) return loss_evaluator ================================================ FILE: maskrcnn_benchmark/modeling/rpn/inference.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from maskrcnn_benchmark.modeling.box_coder import BoxCoder from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes from ..utils import cat from .utils import permute_and_flatten class RPNPostProcessor(torch.nn.Module): """ Performs post-processing on the outputs of the RPN boxes, before feeding the proposals to the heads """ def __init__( self, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, box_coder=None, fpn_post_nms_top_n=None, ): """ Arguments: pre_nms_top_n (int) post_nms_top_n (int) nms_thresh (float) min_size (int) box_coder (BoxCoder) fpn_post_nms_top_n (int) """ super(RPNPostProcessor, self).__init__() self.pre_nms_top_n = pre_nms_top_n self.post_nms_top_n = post_nms_top_n self.nms_thresh = nms_thresh self.min_size = min_size if box_coder is None: box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) self.box_coder = box_coder if fpn_post_nms_top_n is None: fpn_post_nms_top_n = post_nms_top_n self.fpn_post_nms_top_n = fpn_post_nms_top_n def add_gt_proposals(self, proposals, targets): """ Arguments: proposals: list[BoxList] targets: list[BoxList] """ # Get the device we're operating on device = proposals[0].bbox.device gt_boxes = [target.copy_with_fields([]) for target in targets] # later cat of bbox requires all fields to be present for all bbox # so we need to add a dummy for objectness that's missing for gt_box in gt_boxes: gt_box.add_field("objectness", torch.ones(len(gt_box), device=device)) proposals = [ cat_boxlist((proposal, gt_box)) for proposal, gt_box in zip(proposals, gt_boxes) ] return proposals def forward_for_single_feature_map(self, anchors, objectness, box_regression): """ Arguments: anchors: list[BoxList] objectness: tensor of size N, A, H, W box_regression: tensor of size N, A * 4, H, W """ device = objectness.device N, A, H, W = objectness.shape # put in the same format as anchors objectness = permute_and_flatten(objectness, N, A, 1, H, W).view(N, -1) objectness = objectness.sigmoid() box_regression = permute_and_flatten(box_regression, N, A, 4, H, W) num_anchors = A * H * W pre_nms_top_n = min(self.pre_nms_top_n, num_anchors) objectness, topk_idx = objectness.topk(pre_nms_top_n, dim=1, sorted=True) batch_idx = torch.arange(N, device=device)[:, None] box_regression = box_regression[batch_idx, topk_idx] image_shapes = [box.size for box in anchors] concat_anchors = torch.cat([a.bbox for a in anchors], dim=0) concat_anchors = concat_anchors.reshape(N, -1, 4)[batch_idx, topk_idx] proposals = self.box_coder.decode( box_regression.view(-1, 4), concat_anchors.view(-1, 4) ) proposals = proposals.view(N, -1, 4) result = [] for proposal, score, im_shape in zip(proposals, objectness, image_shapes): boxlist = BoxList(proposal, im_shape, mode="xyxy") boxlist.add_field("objectness", score) boxlist = boxlist.clip_to_image(remove_empty=False) boxlist = remove_small_boxes(boxlist, self.min_size) boxlist = boxlist_nms( boxlist, self.nms_thresh, max_proposals=self.post_nms_top_n, score_field="objectness", ) 
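            # NMS above runs within a single feature level; proposals from
            # all levels are merged and the top-scoring ones re-selected
            # later in select_over_all_levels (when using FPN).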
result.append(boxlist) return result def forward(self, anchors, objectness, box_regression, targets=None): """ Arguments: anchors: list[list[BoxList]] objectness: list[tensor] box_regression: list[tensor] Returns: boxlists (list[BoxList]): the post-processed anchors, after applying box decoding and NMS """ sampled_boxes = [] num_levels = len(objectness) anchors = list(zip(*anchors)) for a, o, b in zip(anchors, objectness, box_regression): sampled_boxes.append(self.forward_for_single_feature_map(a, o, b)) boxlists = list(zip(*sampled_boxes)) boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] if num_levels > 1: boxlists = self.select_over_all_levels(boxlists) # append ground-truth bboxes to proposals if self.training and targets is not None: boxlists = self.add_gt_proposals(boxlists, targets) return boxlists def select_over_all_levels(self, boxlists): num_images = len(boxlists) # different behavior during training and during testing: # during training, post_nms_top_n is over *all* the proposals combined, while # during testing, it is over the proposals for each image # TODO resolve this difference and make it consistent. It should be per image, # and not per batch if self.training: objectness = torch.cat( [boxlist.get_field("objectness") for boxlist in boxlists], dim=0 ) box_sizes = [len(boxlist) for boxlist in boxlists] post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness)) _, inds_sorted = torch.topk(objectness, post_nms_top_n, dim=0, sorted=True) inds_mask = torch.zeros_like(objectness, dtype=torch.uint8) inds_mask[inds_sorted] = 1 inds_mask = inds_mask.split(box_sizes) for i in range(num_images): boxlists[i] = boxlists[i][inds_mask[i]] else: for i in range(num_images): objectness = boxlists[i].get_field("objectness") post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness)) _, inds_sorted = torch.topk( objectness, post_nms_top_n, dim=0, sorted=True ) boxlists[i] = boxlists[i][inds_sorted] return boxlists def make_rpn_postprocessor(config, rpn_box_coder, is_train): fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN if not is_train: fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TRAIN post_nms_top_n = config.MODEL.RPN.POST_NMS_TOP_N_TRAIN if not is_train: pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TEST post_nms_top_n = config.MODEL.RPN.POST_NMS_TOP_N_TEST nms_thresh = config.MODEL.RPN.NMS_THRESH min_size = config.MODEL.RPN.MIN_SIZE box_selector = RPNPostProcessor( pre_nms_top_n=pre_nms_top_n, post_nms_top_n=post_nms_top_n, nms_thresh=nms_thresh, min_size=min_size, box_coder=rpn_box_coder, fpn_post_nms_top_n=fpn_post_nms_top_n, ) return box_selector ================================================ FILE: maskrcnn_benchmark/modeling/rpn/loss.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. """ This file contains specific functions for computing losses on the RPN file """ import torch from torch.nn import functional as F from .utils import concat_box_prediction_layers from ..balanced_positive_negative_sampler import BalancedPositiveNegativeSampler from ..utils import cat from maskrcnn_benchmark.layers import smooth_l1_loss from maskrcnn_benchmark.modeling.matcher import Matcher from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist class RPNLossComputation(object): """ This class computes the RPN loss. 
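    The loss is computed on a fixed-size random subset of anchors per image
    (BATCH_SIZE_PER_IMAGE anchors, a POSITIVE_FRACTION of them positive),
    drawn by the BalancedPositiveNegativeSampler configured below.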
""" def __init__(self, proposal_matcher, fg_bg_sampler, box_coder, generate_labels_func): """ Arguments: proposal_matcher (Matcher) fg_bg_sampler (BalancedPositiveNegativeSampler) box_coder (BoxCoder) """ # self.target_preparator = target_preparator self.proposal_matcher = proposal_matcher self.fg_bg_sampler = fg_bg_sampler self.box_coder = box_coder self.copied_fields = [] self.generate_labels_func = generate_labels_func self.discard_cases = ['not_visibility', 'between_thresholds'] def match_targets_to_anchors(self, anchor, target, copied_fields=[]): match_quality_matrix = boxlist_iou(target, anchor) matched_idxs = self.proposal_matcher(match_quality_matrix) # RPN doesn't need any fields from target # for creating the labels, so clear them all target = target.copy_with_fields(copied_fields) # get the targets corresponding GT for each anchor # NB: need to clamp the indices because we can have a single # GT in the image, and matched_idxs can be -2, which goes # out of bounds matched_targets = target[matched_idxs.clamp(min=0)] matched_targets.add_field("matched_idxs", matched_idxs) return matched_targets def prepare_targets(self, anchors, targets): labels = [] regression_targets = [] for anchors_per_image, targets_per_image in zip(anchors, targets): matched_targets = self.match_targets_to_anchors( anchors_per_image, targets_per_image, self.copied_fields ) matched_idxs = matched_targets.get_field("matched_idxs") labels_per_image = self.generate_labels_func(matched_targets) labels_per_image = labels_per_image.to(dtype=torch.float32) # Background (negative examples) bg_indices = matched_idxs == Matcher.BELOW_LOW_THRESHOLD labels_per_image[bg_indices] = 0 # discard anchors that go out of the boundaries of the image if "not_visibility" in self.discard_cases: labels_per_image[~anchors_per_image.get_field("visibility")] = -1 # discard indices that are between thresholds if "between_thresholds" in self.discard_cases: inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS labels_per_image[inds_to_discard] = -1 # compute regression targets regression_targets_per_image = self.box_coder.encode( matched_targets.bbox, anchors_per_image.bbox ) labels.append(labels_per_image) regression_targets.append(regression_targets_per_image) return labels, regression_targets def __call__(self, anchors, objectness, box_regression, targets): """ Arguments: anchors (list[BoxList]) objectness (list[Tensor]) box_regression (list[Tensor]) targets (list[BoxList]) Returns: objectness_loss (Tensor) box_loss (Tensor """ anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] labels, regression_targets = self.prepare_targets(anchors, targets) sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1) sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1) sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0) objectness, box_regression = \ concat_box_prediction_layers(objectness, box_regression) objectness = objectness.squeeze() labels = torch.cat(labels, dim=0) regression_targets = torch.cat(regression_targets, dim=0) box_loss = smooth_l1_loss( box_regression[sampled_pos_inds], regression_targets[sampled_pos_inds], beta=1.0 / 9, size_average=False, ) / (sampled_inds.numel()) objectness_loss = F.binary_cross_entropy_with_logits( objectness[sampled_inds], labels[sampled_inds] ) return objectness_loss, box_loss # This function should be overwritten in RetinaNet def 
generate_rpn_labels(matched_targets): matched_idxs = matched_targets.get_field("matched_idxs") labels_per_image = matched_idxs >= 0 return labels_per_image def make_rpn_loss_evaluator(cfg, box_coder): matcher = Matcher( cfg.MODEL.RPN.FG_IOU_THRESHOLD, cfg.MODEL.RPN.BG_IOU_THRESHOLD, allow_low_quality_matches=True, ) fg_bg_sampler = BalancedPositiveNegativeSampler( cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE, cfg.MODEL.RPN.POSITIVE_FRACTION ) loss_evaluator = RPNLossComputation( matcher, fg_bg_sampler, box_coder, generate_rpn_labels ) return loss_evaluator ================================================ FILE: maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/modeling/rpn/retinanet/inference.py ================================================ import torch from ..inference import RPNPostProcessor from ..utils import permute_and_flatten from maskrcnn_benchmark.modeling.box_coder import BoxCoder from maskrcnn_benchmark.modeling.utils import cat from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes class RetinaNetPostProcessor(RPNPostProcessor): """ Performs post-processing on the outputs of the RetinaNet boxes. This is only used in the testing. """ def __init__( self, pre_nms_thresh, pre_nms_top_n, nms_thresh, fpn_post_nms_top_n, min_size, num_classes, box_coder=None, ): """ Arguments: pre_nms_thresh (float) pre_nms_top_n (int) nms_thresh (float) fpn_post_nms_top_n (int) min_size (int) num_classes (int) box_coder (BoxCoder) """ super(RetinaNetPostProcessor, self).__init__( pre_nms_thresh, 0, nms_thresh, min_size ) self.pre_nms_thresh = pre_nms_thresh self.pre_nms_top_n = pre_nms_top_n self.nms_thresh = nms_thresh self.fpn_post_nms_top_n = fpn_post_nms_top_n self.min_size = min_size self.num_classes = num_classes if box_coder is None: box_coder = BoxCoder(weights=(10., 10., 5., 5.)) self.box_coder = box_coder def add_gt_proposals(self, proposals, targets): """ This function is not used in RetinaNet """ pass def forward_for_single_feature_map( self, anchors, box_cls, box_regression): """ Arguments: anchors: list[BoxList] box_cls: tensor of size N, A * C, H, W box_regression: tensor of size N, A * 4, H, W """ device = box_cls.device N, _, H, W = box_cls.shape A = box_regression.size(1) // 4 C = box_cls.size(1) // A # put in the same format as anchors box_cls = permute_and_flatten(box_cls, N, A, C, H, W) box_cls = box_cls.sigmoid() box_regression = permute_and_flatten(box_regression, N, A, 4, H, W) box_regression = box_regression.reshape(N, -1, 4) num_anchors = A * H * W candidate_inds = box_cls > self.pre_nms_thresh pre_nms_top_n = candidate_inds.view(N, -1).sum(1) pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) results = [] for per_box_cls, per_box_regression, per_pre_nms_top_n, \ per_candidate_inds, per_anchors in zip( box_cls, box_regression, pre_nms_top_n, candidate_inds, anchors): # Sort and select TopN # TODO most of this can be made out of the loop for # all images. # TODO:Yang: Not easy to do. Because the numbers of detections are # different in each image. Therefore, this part needs to be done # per image. 
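            # per_candidate_inds is a (H * W * A, C) boolean mask, so the
            # masked select below yields one score per surviving
            # (location, class) pair.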
per_box_cls = per_box_cls[per_candidate_inds] per_box_cls, top_k_indices = \ per_box_cls.topk(per_pre_nms_top_n, sorted=False) per_candidate_nonzeros = \ per_candidate_inds.nonzero()[top_k_indices, :] per_box_loc = per_candidate_nonzeros[:, 0] per_class = per_candidate_nonzeros[:, 1] per_class += 1 detections = self.box_coder.decode( per_box_regression[per_box_loc, :].view(-1, 4), per_anchors.bbox[per_box_loc, :].view(-1, 4) ) boxlist = BoxList(detections, per_anchors.size, mode="xyxy") boxlist.add_field("labels", per_class) boxlist.add_field("scores", per_box_cls) boxlist = boxlist.clip_to_image(remove_empty=False) boxlist = remove_small_boxes(boxlist, self.min_size) results.append(boxlist) return results # TODO very similar to filter_results from PostProcessor # but filter_results is per image # TODO Yang: solve this issue in the future. No good solution # right now. def select_over_all_levels(self, boxlists): num_images = len(boxlists) results = [] for i in range(num_images): scores = boxlists[i].get_field("scores") labels = boxlists[i].get_field("labels") boxes = boxlists[i].bbox boxlist = boxlists[i] result = [] # skip the background for j in range(1, self.num_classes): inds = (labels == j).nonzero().view(-1) scores_j = scores[inds] boxes_j = boxes[inds, :].view(-1, 4) boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") boxlist_for_class.add_field("scores", scores_j) boxlist_for_class = boxlist_nms( boxlist_for_class, self.nms_thresh, score_field="scores" ) num_labels = len(boxlist_for_class) boxlist_for_class.add_field( "labels", torch.full((num_labels,), j, dtype=torch.int64, device=scores.device) ) result.append(boxlist_for_class) result = cat_boxlist(result) number_of_detections = len(result) # Limit to max_per_image detections **over all classes** if number_of_detections > self.fpn_post_nms_top_n > 0: cls_scores = result.get_field("scores") image_thresh, _ = torch.kthvalue( cls_scores.cpu(), number_of_detections - self.fpn_post_nms_top_n + 1 ) keep = cls_scores >= image_thresh.item() keep = torch.nonzero(keep).squeeze(1) result = result[keep] results.append(result) return results def make_retinanet_postprocessor(config, rpn_box_coder, is_train): pre_nms_thresh = config.MODEL.RETINANET.INFERENCE_TH pre_nms_top_n = config.MODEL.RETINANET.PRE_NMS_TOP_N nms_thresh = config.MODEL.RETINANET.NMS_TH fpn_post_nms_top_n = config.TEST.DETECTIONS_PER_IMG min_size = 0 box_selector = RetinaNetPostProcessor( pre_nms_thresh=pre_nms_thresh, pre_nms_top_n=pre_nms_top_n, nms_thresh=nms_thresh, fpn_post_nms_top_n=fpn_post_nms_top_n, min_size=min_size, num_classes=config.MODEL.RETINANET.NUM_CLASSES, box_coder=rpn_box_coder, ) return box_selector ================================================ FILE: maskrcnn_benchmark/modeling/rpn/retinanet/loss.py ================================================ """ This file contains specific functions for computing losses on the RetinaNet file """ import torch from torch.nn import functional as F from ..utils import concat_box_prediction_layers from maskrcnn_benchmark.layers import smooth_l1_loss from maskrcnn_benchmark.layers import SigmoidFocalLoss from maskrcnn_benchmark.modeling.matcher import Matcher from maskrcnn_benchmark.modeling.utils import cat from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist from maskrcnn_benchmark.modeling.rpn.loss import RPNLossComputation class RetinaNetLossComputation(RPNLossComputation): """ This class computes the RetinaNet loss. 
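    Unlike the RPN loss there is no positive/negative sampling here: the
    sigmoid focal loss is computed over all anchors and normalized by the
    number of positives (plus the number of images, to avoid dividing by
    zero when a batch has no positive anchor).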
""" def __init__(self, proposal_matcher, box_coder, generate_labels_func, sigmoid_focal_loss, bbox_reg_beta=0.11, regress_norm=1.0): """ Arguments: proposal_matcher (Matcher) box_coder (BoxCoder) """ self.proposal_matcher = proposal_matcher self.box_coder = box_coder self.box_cls_loss_func = sigmoid_focal_loss self.bbox_reg_beta = bbox_reg_beta self.copied_fields = ['labels'] self.generate_labels_func = generate_labels_func self.discard_cases = ['between_thresholds'] self.regress_norm = regress_norm def __call__(self, anchors, box_cls, box_regression, targets): """ Arguments: anchors (list[BoxList]) box_cls (list[Tensor]) box_regression (list[Tensor]) targets (list[BoxList]) Returns: retinanet_cls_loss (Tensor) retinanet_regression_loss (Tensor """ anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] labels, regression_targets = self.prepare_targets(anchors, targets) N = len(labels) box_cls, box_regression = \ concat_box_prediction_layers(box_cls, box_regression) labels = torch.cat(labels, dim=0) regression_targets = torch.cat(regression_targets, dim=0) pos_inds = torch.nonzero(labels > 0).squeeze(1) retinanet_regression_loss = smooth_l1_loss( box_regression[pos_inds], regression_targets[pos_inds], beta=self.bbox_reg_beta, size_average=False, ) / (max(1, pos_inds.numel() * self.regress_norm)) labels = labels.int() retinanet_cls_loss = self.box_cls_loss_func( box_cls, labels ) / (pos_inds.numel() + N) return retinanet_cls_loss, retinanet_regression_loss def generate_retinanet_labels(matched_targets): labels_per_image = matched_targets.get_field("labels") return labels_per_image def make_retinanet_loss_evaluator(cfg, box_coder): matcher = Matcher( cfg.MODEL.RETINANET.FG_IOU_THRESHOLD, cfg.MODEL.RETINANET.BG_IOU_THRESHOLD, allow_low_quality_matches=True, ) sigmoid_focal_loss = SigmoidFocalLoss( cfg.MODEL.RETINANET.LOSS_GAMMA, cfg.MODEL.RETINANET.LOSS_ALPHA ) loss_evaluator = RetinaNetLossComputation( matcher, box_coder, generate_retinanet_labels, sigmoid_focal_loss, bbox_reg_beta = cfg.MODEL.RETINANET.BBOX_REG_BETA, regress_norm = cfg.MODEL.RETINANET.BBOX_REG_WEIGHT, ) return loss_evaluator ================================================ FILE: maskrcnn_benchmark/modeling/rpn/retinanet/retinanet.py ================================================ import math import torch import torch.nn.functional as F from torch import nn from .inference import make_retinanet_postprocessor from .loss import make_retinanet_loss_evaluator from ..anchor_generator import make_anchor_generator_retinanet from maskrcnn_benchmark.modeling.box_coder import BoxCoder class RetinaNetHead(torch.nn.Module): """ Adds a RetinNet head with classification and regression heads """ def __init__(self, cfg, in_channels): """ Arguments: in_channels (int): number of channels of the input feature num_anchors (int): number of anchors to be predicted """ super(RetinaNetHead, self).__init__() # TODO: Implement the sigmoid version first. 
num_classes = cfg.MODEL.RETINANET.NUM_CLASSES - 1 num_anchors = len(cfg.MODEL.RETINANET.ASPECT_RATIOS) \ * cfg.MODEL.RETINANET.SCALES_PER_OCTAVE cls_tower = [] bbox_tower = [] for i in range(cfg.MODEL.RETINANET.NUM_CONVS): cls_tower.append( nn.Conv2d( in_channels, in_channels, kernel_size=3, stride=1, padding=1 ) ) cls_tower.append(nn.ReLU()) bbox_tower.append( nn.Conv2d( in_channels, in_channels, kernel_size=3, stride=1, padding=1 ) ) bbox_tower.append(nn.ReLU()) self.add_module('cls_tower', nn.Sequential(*cls_tower)) self.add_module('bbox_tower', nn.Sequential(*bbox_tower)) self.cls_logits = nn.Conv2d( in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1 ) self.bbox_pred = nn.Conv2d( in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1 ) # Initialization for modules in [self.cls_tower, self.bbox_tower, self.cls_logits, self.bbox_pred]: for l in modules.modules(): if isinstance(l, nn.Conv2d): torch.nn.init.normal_(l.weight, std=0.01) torch.nn.init.constant_(l.bias, 0) # retinanet_bias_init prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB bias_value = -math.log((1 - prior_prob) / prior_prob) torch.nn.init.constant_(self.cls_logits.bias, bias_value) def forward(self, x): logits = [] bbox_reg = [] for feature in x: logits.append(self.cls_logits(self.cls_tower(feature))) bbox_reg.append(self.bbox_pred(self.bbox_tower(feature))) return logits, bbox_reg class RetinaNetModule(torch.nn.Module): """ Module for RetinaNet computation. Takes feature maps from the backbone and RetinaNet outputs and losses. Only Test on FPN now. """ def __init__(self, cfg, in_channels): super(RetinaNetModule, self).__init__() self.cfg = cfg.clone() anchor_generator = make_anchor_generator_retinanet(cfg) head = RetinaNetHead(cfg, in_channels) box_coder = BoxCoder(weights=(10., 10., 5., 5.)) box_selector_test = make_retinanet_postprocessor(cfg, box_coder, is_train=False) loss_evaluator = make_retinanet_loss_evaluator(cfg, box_coder) self.anchor_generator = anchor_generator self.head = head self.box_selector_test = box_selector_test self.loss_evaluator = loss_evaluator def forward(self, images, features, targets=None): """ Arguments: images (ImageList): images for which we want to compute the predictions features (list[Tensor]): features computed from the images that are used for computing the predictions. Each tensor in the list correspond to different feature levels targets (list[BoxList): ground-truth boxes present in the image (optional) Returns: boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per image. losses (dict[Tensor]): the losses for the model during training. During testing, it is an empty dict. 
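        Note:
            during training the returned boxes are the input anchors, not
            decoded detections; boxes are only decoded and NMS-ed in
            _forward_test.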
""" box_cls, box_regression = self.head(features) anchors = self.anchor_generator(images, features) if self.training: return self._forward_train(anchors, box_cls, box_regression, targets) else: return self._forward_test(anchors, box_cls, box_regression) def _forward_train(self, anchors, box_cls, box_regression, targets): loss_box_cls, loss_box_reg = self.loss_evaluator( anchors, box_cls, box_regression, targets ) losses = { "loss_retina_cls": loss_box_cls, "loss_retina_reg": loss_box_reg, } return anchors, losses def _forward_test(self, anchors, box_cls, box_regression): boxes = self.box_selector_test(anchors, box_cls, box_regression) return boxes, {} def build_retinanet(cfg, in_channels): return RetinaNetModule(cfg, in_channels) ================================================ FILE: maskrcnn_benchmark/modeling/rpn/rpn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch import torch.nn.functional as F from torch import nn from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.modeling.box_coder import BoxCoder from maskrcnn_benchmark.modeling.rpn.retinanet.retinanet import build_retinanet from maskrcnn_benchmark.modeling.rpn.fcos.fcos import build_fcos from .loss import make_rpn_loss_evaluator from .anchor_generator import make_anchor_generator from .inference import make_rpn_postprocessor class RPNHeadConvRegressor(nn.Module): """ A simple RPN Head for classification and bbox regression """ def __init__(self, cfg, in_channels, num_anchors): """ Arguments: cfg : config in_channels (int): number of channels of the input feature num_anchors (int): number of anchors to be predicted """ super(RPNHeadConvRegressor, self).__init__() self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1) self.bbox_pred = nn.Conv2d( in_channels, num_anchors * 4, kernel_size=1, stride=1 ) for l in [self.cls_logits, self.bbox_pred]: torch.nn.init.normal_(l.weight, std=0.01) torch.nn.init.constant_(l.bias, 0) def forward(self, x): assert isinstance(x, (list, tuple)) logits = [self.cls_logits(y) for y in x] bbox_reg = [self.bbox_pred(y) for y in x] return logits, bbox_reg class RPNHeadFeatureSingleConv(nn.Module): """ Adds a simple RPN Head with one conv to extract the feature """ def __init__(self, cfg, in_channels): """ Arguments: cfg : config in_channels (int): number of channels of the input feature """ super(RPNHeadFeatureSingleConv, self).__init__() self.conv = nn.Conv2d( in_channels, in_channels, kernel_size=3, stride=1, padding=1 ) for l in [self.conv]: torch.nn.init.normal_(l.weight, std=0.01) torch.nn.init.constant_(l.bias, 0) self.out_channels = in_channels def forward(self, x): assert isinstance(x, (list, tuple)) x = [F.relu(self.conv(z)) for z in x] return x @registry.RPN_HEADS.register("SingleConvRPNHead") class RPNHead(nn.Module): """ Adds a simple RPN Head with classification and regression heads """ def __init__(self, cfg, in_channels, num_anchors): """ Arguments: cfg : config in_channels (int): number of channels of the input feature num_anchors (int): number of anchors to be predicted """ super(RPNHead, self).__init__() self.conv = nn.Conv2d( in_channels, in_channels, kernel_size=3, stride=1, padding=1 ) self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1) self.bbox_pred = nn.Conv2d( in_channels, num_anchors * 4, kernel_size=1, stride=1 ) for l in [self.conv, self.cls_logits, self.bbox_pred]: torch.nn.init.normal_(l.weight, std=0.01) 
torch.nn.init.constant_(l.bias, 0) def forward(self, x): logits = [] bbox_reg = [] for feature in x: t = F.relu(self.conv(feature)) logits.append(self.cls_logits(t)) bbox_reg.append(self.bbox_pred(t)) return logits, bbox_reg class RPNModule(torch.nn.Module): """ Module for RPN computation. Takes feature maps from the backbone and RPN proposals and losses. Works for both FPN and non-FPN. """ def __init__(self, cfg, in_channels): super(RPNModule, self).__init__() self.cfg = cfg.clone() anchor_generator = make_anchor_generator(cfg) rpn_head = registry.RPN_HEADS[cfg.MODEL.RPN.RPN_HEAD] head = rpn_head( cfg, in_channels, anchor_generator.num_anchors_per_location()[0] ) rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) box_selector_train = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=True) box_selector_test = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=False) loss_evaluator = make_rpn_loss_evaluator(cfg, rpn_box_coder) self.anchor_generator = anchor_generator self.head = head self.box_selector_train = box_selector_train self.box_selector_test = box_selector_test self.loss_evaluator = loss_evaluator def forward(self, images, features, targets=None): """ Arguments: images (ImageList): images for which we want to compute the predictions features (list[Tensor]): features computed from the images that are used for computing the predictions. Each tensor in the list correspond to different feature levels targets (list[BoxList): ground-truth boxes present in the image (optional) Returns: boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per image. losses (dict[Tensor]): the losses for the model during training. During testing, it is an empty dict. """ objectness, rpn_box_regression = self.head(features) anchors = self.anchor_generator(images, features) if self.training: return self._forward_train(anchors, objectness, rpn_box_regression, targets) else: return self._forward_test(anchors, objectness, rpn_box_regression) def _forward_train(self, anchors, objectness, rpn_box_regression, targets): if self.cfg.MODEL.RPN_ONLY: # When training an RPN-only model, the loss is determined by the # predicted objectness and rpn_box_regression values and there is # no need to transform the anchors into predicted boxes; this is an # optimization that avoids the unnecessary transformation. boxes = anchors else: # For end-to-end models, anchors must be transformed into boxes and # sampled into a training batch. with torch.no_grad(): boxes = self.box_selector_train( anchors, objectness, rpn_box_regression, targets ) loss_objectness, loss_rpn_box_reg = self.loss_evaluator( anchors, objectness, rpn_box_regression, targets ) losses = { "loss_objectness": loss_objectness, "loss_rpn_box_reg": loss_rpn_box_reg, } return boxes, losses def _forward_test(self, anchors, objectness, rpn_box_regression): boxes = self.box_selector_test(anchors, objectness, rpn_box_regression) if self.cfg.MODEL.RPN_ONLY: # For end-to-end models, the RPN proposals are an intermediate state # and don't bother to sort them in decreasing score order. For RPN-only # models, the proposals are the final output and we return them in # high-to-low confidence order. inds = [ box.get_field("objectness").sort(descending=True)[1] for box in boxes ] boxes = [box[ind] for box, ind in zip(boxes, inds)] return boxes, {} def build_rpn(cfg, in_channels): """ This gives the gist of it. 
Not super important because it doesn't change much
    """
    if cfg.MODEL.FCOS_ON:
        return build_fcos(cfg, in_channels)
    if cfg.MODEL.RETINANET_ON:
        return build_retinanet(cfg, in_channels)
    return RPNModule(cfg, in_channels)


================================================
FILE: maskrcnn_benchmark/modeling/rpn/utils.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
"""
Utility functions manipulating the prediction layers
"""

from ..utils import cat

import torch


def permute_and_flatten(layer, N, A, C, H, W):
    layer = layer.view(N, -1, C, H, W)
    layer = layer.permute(0, 3, 4, 1, 2)
    layer = layer.reshape(N, -1, C)
    return layer


def concat_box_prediction_layers(box_cls, box_regression):
    box_cls_flattened = []
    box_regression_flattened = []
    # for each feature level, permute the outputs to make them be in the
    # same format as the labels. Note that the labels are computed for
    # all feature levels concatenated, so we keep the same representation
    # for the objectness and the box_regression
    for box_cls_per_level, box_regression_per_level in zip(
        box_cls, box_regression
    ):
        N, AxC, H, W = box_cls_per_level.shape
        Ax4 = box_regression_per_level.shape[1]
        A = Ax4 // 4
        C = AxC // A
        box_cls_per_level = permute_and_flatten(
            box_cls_per_level, N, A, C, H, W
        )
        box_cls_flattened.append(box_cls_per_level)

        box_regression_per_level = permute_and_flatten(
            box_regression_per_level, N, A, 4, H, W
        )
        box_regression_flattened.append(box_regression_per_level)
    # concatenate on the first dimension (representing the feature levels), to
    # take into account the way the labels were generated (with all feature maps
    # being concatenated as well)
    box_cls = cat(box_cls_flattened, dim=1).reshape(-1, C)
    box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4)
    return box_cls, box_regression


================================================
FILE: maskrcnn_benchmark/modeling/utils.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
"""
Miscellaneous utility functions
"""

import torch


def cat(tensors, dim=0):
    """
    Efficient version of torch.cat that avoids a copy if there is
    only a single element in a list
    """
    assert isinstance(tensors, (list, tuple))
    if len(tensors) == 1:
        return tensors[0]
    return torch.cat(tensors, dim)


================================================
FILE: maskrcnn_benchmark/solver/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from .build import make_optimizer
from .build import make_lr_scheduler
from .lr_scheduler import WarmupMultiStepLR


================================================
FILE: maskrcnn_benchmark/solver/build.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
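# A worked example of the schedule built below (a sketch assuming the
# common defaults WARMUP_FACTOR = 1/3, WARMUP_ITERS = 500, GAMMA = 0.1;
# see WarmupMultiStepLR.get_lr in lr_scheduler.py):
#   iter 0:      factor = 1/3                          -> lr = BASE_LR / 3
#   iter 250:    alpha = 0.5, factor = (1/3)*0.5 + 0.5 -> lr = 2/3 * BASE_LR
#   iter >= 500: factor = 1                            -> lr = BASE_LR,
#   thereafter multiplied by GAMMA at every milestone in SOLVER.STEPS.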
import torch from .lr_scheduler import WarmupMultiStepLR def make_optimizer(cfg, model): params = [] for key, value in model.named_parameters(): if not value.requires_grad: continue lr = cfg.SOLVER.BASE_LR weight_decay = cfg.SOLVER.WEIGHT_DECAY if "bias" in key: lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] optimizer = torch.optim.SGD(params, lr, momentum=cfg.SOLVER.MOMENTUM) return optimizer def make_lr_scheduler(cfg, optimizer): return WarmupMultiStepLR( optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA, warmup_factor=cfg.SOLVER.WARMUP_FACTOR, warmup_iters=cfg.SOLVER.WARMUP_ITERS, warmup_method=cfg.SOLVER.WARMUP_METHOD, ) ================================================ FILE: maskrcnn_benchmark/solver/lr_scheduler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from bisect import bisect_right import torch # FIXME ideally this would be achieved with a CombinedLRScheduler, # separating MultiStepLR with WarmupLR # but the current LRScheduler design doesn't allow it class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): def __init__( self, optimizer, milestones, gamma=0.1, warmup_factor=1.0 / 3, warmup_iters=500, warmup_method="linear", last_epoch=-1, ): if not list(milestones) == sorted(milestones): raise ValueError( "Milestones should be a list of" " increasing integers. Got {}", milestones, ) if warmup_method not in ("constant", "linear"): raise ValueError( "Only 'constant' or 'linear' warmup_method accepted" "got {}".format(warmup_method) ) self.milestones = milestones self.gamma = gamma self.warmup_factor = warmup_factor self.warmup_iters = warmup_iters self.warmup_method = warmup_method super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) def get_lr(self): warmup_factor = 1 if self.last_epoch < self.warmup_iters: if self.warmup_method == "constant": warmup_factor = self.warmup_factor elif self.warmup_method == "linear": alpha = float(self.last_epoch) / self.warmup_iters warmup_factor = self.warmup_factor * (1 - alpha) + alpha return [ base_lr * warmup_factor * self.gamma ** bisect_right(self.milestones, self.last_epoch) for base_lr in self.base_lrs ] ================================================ FILE: maskrcnn_benchmark/structures/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/structures/bounding_box.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch # transpose FLIP_LEFT_RIGHT = 0 FLIP_TOP_BOTTOM = 1 class BoxList(object): """ This class represents a set of bounding boxes. The bounding boxes are represented as a Nx4 Tensor. In order to uniquely determine the bounding boxes with respect to an image, we also store the corresponding image dimensions. They can contain extra information that is specific to each bounding box, such as labels. 
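    Example (a minimal sketch; image_size is (width, height)):

        >>> boxes = BoxList(torch.tensor([[0., 0., 9., 9.]]), (800, 600), mode="xyxy")
        >>> boxes.add_field("labels", torch.tensor([1]))
        >>> boxes.area()
        tensor([100.])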
""" def __init__(self, bbox, image_size, mode="xyxy"): device = bbox.device if isinstance(bbox, torch.Tensor) else torch.device("cpu") bbox = torch.as_tensor(bbox, dtype=torch.float32, device=device) if bbox.ndimension() != 2: raise ValueError( "bbox should have 2 dimensions, got {}".format(bbox.ndimension()) ) if bbox.size(-1) != 4: raise ValueError( "last dimension of bbox should have a " "size of 4, got {}".format(bbox.size(-1)) ) if mode not in ("xyxy", "xywh"): raise ValueError("mode should be 'xyxy' or 'xywh'") self.bbox = bbox self.size = image_size # (image_width, image_height) self.mode = mode self.extra_fields = {} def add_field(self, field, field_data): self.extra_fields[field] = field_data def get_field(self, field): return self.extra_fields[field] def has_field(self, field): return field in self.extra_fields def fields(self): return list(self.extra_fields.keys()) def _copy_extra_fields(self, bbox): for k, v in bbox.extra_fields.items(): self.extra_fields[k] = v def convert(self, mode): if mode not in ("xyxy", "xywh"): raise ValueError("mode should be 'xyxy' or 'xywh'") if mode == self.mode: return self # we only have two modes, so don't need to check # self.mode xmin, ymin, xmax, ymax = self._split_into_xyxy() if mode == "xyxy": bbox = torch.cat((xmin, ymin, xmax, ymax), dim=-1) bbox = BoxList(bbox, self.size, mode=mode) else: TO_REMOVE = 1 bbox = torch.cat( (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE), dim=-1 ) bbox = BoxList(bbox, self.size, mode=mode) bbox._copy_extra_fields(self) return bbox def _split_into_xyxy(self): if self.mode == "xyxy": xmin, ymin, xmax, ymax = self.bbox.split(1, dim=-1) return xmin, ymin, xmax, ymax elif self.mode == "xywh": TO_REMOVE = 1 xmin, ymin, w, h = self.bbox.split(1, dim=-1) return ( xmin, ymin, xmin + (w - TO_REMOVE).clamp(min=0), ymin + (h - TO_REMOVE).clamp(min=0), ) else: raise RuntimeError("Should not be here") def resize(self, size, *args, **kwargs): """ Returns a resized copy of this bounding box :param size: The requested size in pixels, as a 2-tuple: (width, height). """ ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) if ratios[0] == ratios[1]: ratio = ratios[0] scaled_box = self.bbox * ratio bbox = BoxList(scaled_box, size, mode=self.mode) # bbox._copy_extra_fields(self) for k, v in self.extra_fields.items(): if not isinstance(v, torch.Tensor): v = v.resize(size, *args, **kwargs) bbox.add_field(k, v) return bbox ratio_width, ratio_height = ratios xmin, ymin, xmax, ymax = self._split_into_xyxy() scaled_xmin = xmin * ratio_width scaled_xmax = xmax * ratio_width scaled_ymin = ymin * ratio_height scaled_ymax = ymax * ratio_height scaled_box = torch.cat( (scaled_xmin, scaled_ymin, scaled_xmax, scaled_ymax), dim=-1 ) bbox = BoxList(scaled_box, size, mode="xyxy") # bbox._copy_extra_fields(self) for k, v in self.extra_fields.items(): if not isinstance(v, torch.Tensor): v = v.resize(size, *args, **kwargs) bbox.add_field(k, v) return bbox.convert(self.mode) def transpose(self, method): """ Transpose bounding box (flip or rotate in 90 degree steps) :param method: One of :py:attr:`PIL.Image.FLIP_LEFT_RIGHT`, :py:attr:`PIL.Image.FLIP_TOP_BOTTOM`, :py:attr:`PIL.Image.ROTATE_90`, :py:attr:`PIL.Image.ROTATE_180`, :py:attr:`PIL.Image.ROTATE_270`, :py:attr:`PIL.Image.TRANSPOSE` or :py:attr:`PIL.Image.TRANSVERSE`. 
""" if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): raise NotImplementedError( "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" ) image_width, image_height = self.size xmin, ymin, xmax, ymax = self._split_into_xyxy() if method == FLIP_LEFT_RIGHT: TO_REMOVE = 1 transposed_xmin = image_width - xmax - TO_REMOVE transposed_xmax = image_width - xmin - TO_REMOVE transposed_ymin = ymin transposed_ymax = ymax elif method == FLIP_TOP_BOTTOM: transposed_xmin = xmin transposed_xmax = xmax transposed_ymin = image_height - ymax transposed_ymax = image_height - ymin transposed_boxes = torch.cat( (transposed_xmin, transposed_ymin, transposed_xmax, transposed_ymax), dim=-1 ) bbox = BoxList(transposed_boxes, self.size, mode="xyxy") # bbox._copy_extra_fields(self) for k, v in self.extra_fields.items(): if not isinstance(v, torch.Tensor): v = v.transpose(method) bbox.add_field(k, v) return bbox.convert(self.mode) def crop(self, box): """ Cropss a rectangular region from this bounding box. The box is a 4-tuple defining the left, upper, right, and lower pixel coordinate. """ xmin, ymin, xmax, ymax = self._split_into_xyxy() w, h = box[2] - box[0], box[3] - box[1] cropped_xmin = (xmin - box[0]).clamp(min=0, max=w) cropped_ymin = (ymin - box[1]).clamp(min=0, max=h) cropped_xmax = (xmax - box[0]).clamp(min=0, max=w) cropped_ymax = (ymax - box[1]).clamp(min=0, max=h) # TODO should I filter empty boxes here? if False: is_empty = (cropped_xmin == cropped_xmax) | (cropped_ymin == cropped_ymax) cropped_box = torch.cat( (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1 ) bbox = BoxList(cropped_box, (w, h), mode="xyxy") # bbox._copy_extra_fields(self) for k, v in self.extra_fields.items(): if not isinstance(v, torch.Tensor): v = v.crop(box) bbox.add_field(k, v) return bbox.convert(self.mode) # Tensor-like methods def to(self, device): bbox = BoxList(self.bbox.to(device), self.size, self.mode) for k, v in self.extra_fields.items(): if hasattr(v, "to"): v = v.to(device) bbox.add_field(k, v) return bbox def __getitem__(self, item): bbox = BoxList(self.bbox[item], self.size, self.mode) for k, v in self.extra_fields.items(): bbox.add_field(k, v[item]) return bbox def __len__(self): return self.bbox.shape[0] def clip_to_image(self, remove_empty=True): TO_REMOVE = 1 self.bbox[:, 0].clamp_(min=0, max=self.size[0] - TO_REMOVE) self.bbox[:, 1].clamp_(min=0, max=self.size[1] - TO_REMOVE) self.bbox[:, 2].clamp_(min=0, max=self.size[0] - TO_REMOVE) self.bbox[:, 3].clamp_(min=0, max=self.size[1] - TO_REMOVE) if remove_empty: box = self.bbox keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0]) return self[keep] return self def area(self): box = self.bbox if self.mode == "xyxy": TO_REMOVE = 1 area = (box[:, 2] - box[:, 0] + TO_REMOVE) * (box[:, 3] - box[:, 1] + TO_REMOVE) elif self.mode == "xywh": area = box[:, 2] * box[:, 3] else: raise RuntimeError("Should not be here") return area def copy_with_fields(self, fields, skip_missing=False): bbox = BoxList(self.bbox, self.size, self.mode) if not isinstance(fields, (list, tuple)): fields = [fields] for field in fields: if self.has_field(field): bbox.add_field(field, self.get_field(field)) elif not skip_missing: raise KeyError("Field '{}' not found in {}".format(field, self)) return bbox def __repr__(self): s = self.__class__.__name__ + "(" s += "num_boxes={}, ".format(len(self)) s += "image_width={}, ".format(self.size[0]) s += "image_height={}, ".format(self.size[1]) s += "mode={})".format(self.mode) return s if __name__ == "__main__": bbox = BoxList([[0, 0, 
10, 10], [0, 0, 5, 5]], (10, 10)) s_bbox = bbox.resize((5, 5)) print(s_bbox) print(s_bbox.bbox) t_bbox = bbox.transpose(0) print(t_bbox) print(t_bbox.bbox) ================================================ FILE: maskrcnn_benchmark/structures/boxlist_ops.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from .bounding_box import BoxList from maskrcnn_benchmark.layers import nms as _box_nms def boxlist_nms(boxlist, nms_thresh, max_proposals=-1, score_field="scores"): """ Performs non-maximum suppression on a boxlist, with scores specified in a boxlist field via score_field. Arguments: boxlist(BoxList) nms_thresh (float) max_proposals (int): if > 0, then only the top max_proposals are kept after non-maximum suppression score_field (str) """ if nms_thresh <= 0: return boxlist mode = boxlist.mode boxlist = boxlist.convert("xyxy") boxes = boxlist.bbox score = boxlist.get_field(score_field) keep = _box_nms(boxes, score, nms_thresh) if max_proposals > 0: keep = keep[: max_proposals] boxlist = boxlist[keep] return boxlist.convert(mode) def remove_small_boxes(boxlist, min_size): """ Only keep boxes with both sides >= min_size Arguments: boxlist (Boxlist) min_size (int) """ # TODO maybe add an API for querying the ws / hs xywh_boxes = boxlist.convert("xywh").bbox _, _, ws, hs = xywh_boxes.unbind(dim=1) keep = ( (ws >= min_size) & (hs >= min_size) ).nonzero().squeeze(1) return boxlist[keep] # implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py # with slight modifications def boxlist_iou(boxlist1, boxlist2): """Compute the intersection over union of two sets of boxes. The box order must be (xmin, ymin, xmax, ymax). Arguments: boxlist1: (BoxList) bounding boxes, sized [N,4]. boxlist2: (BoxList) bounding boxes, sized [M,4]. Returns: (tensor) iou, sized [N,M].
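Example (added illustration; `targets` and `proposals` are hypothetical
BoxLists defined on the same image):

    iou_matrix = boxlist_iou(targets, proposals)  # shape [len(targets), len(proposals)]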
Reference: https://github.com/chainer/chainercv/blob/master/chainercv/utils/bbox/bbox_iou.py """ if boxlist1.size != boxlist2.size: raise RuntimeError( "boxlists should have same image size, got {}, {}".format(boxlist1, boxlist2)) N = len(boxlist1) M = len(boxlist2) area1 = boxlist1.area() area2 = boxlist2.area() box1, box2 = boxlist1.bbox, boxlist2.bbox lt = torch.max(box1[:, None, :2], box2[:, :2]) # [N,M,2] rb = torch.min(box1[:, None, 2:], box2[:, 2:]) # [N,M,2] TO_REMOVE = 1 wh = (rb - lt + TO_REMOVE).clamp(min=0) # [N,M,2] inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] iou = inter / (area1[:, None] + area2 - inter) return iou # TODO redundant, remove def _cat(tensors, dim=0): """ Efficient version of torch.cat that avoids a copy if there is only a single element in a list """ assert isinstance(tensors, (list, tuple)) if len(tensors) == 1: return tensors[0] return torch.cat(tensors, dim) def cat_boxlist(bboxes): """ Concatenates a list of BoxList (having the same image size) into a single BoxList Arguments: bboxes (list[BoxList]) """ assert isinstance(bboxes, (list, tuple)) assert all(isinstance(bbox, BoxList) for bbox in bboxes) size = bboxes[0].size assert all(bbox.size == size for bbox in bboxes) mode = bboxes[0].mode assert all(bbox.mode == mode for bbox in bboxes) fields = set(bboxes[0].fields()) assert all(set(bbox.fields()) == fields for bbox in bboxes) cat_boxes = BoxList(_cat([bbox.bbox for bbox in bboxes], dim=0), size, mode) for field in fields: data = _cat([bbox.get_field(field) for bbox in bboxes], dim=0) cat_boxes.add_field(field, data) return cat_boxes ================================================ FILE: maskrcnn_benchmark/structures/image_list.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from __future__ import division import torch class ImageList(object): """ Structure that holds a list of images (of possibly varying sizes) as a single tensor. This works by padding the images to the same size, and storing in a field the original sizes of each image """ def __init__(self, tensors, image_sizes): """ Arguments: tensors (tensor) image_sizes (list[tuple[int, int]]) """ self.tensors = tensors self.image_sizes = image_sizes def to(self, *args, **kwargs): cast_tensor = self.tensors.to(*args, **kwargs) return ImageList(cast_tensor, self.image_sizes) def to_image_list(tensors, size_divisible=0): """ tensors can be an ImageList, a torch.Tensor or an iterable of Tensors. It can't be a numpy array. 
When tensors is an iterable of Tensors, it pads the Tensors with zeros so that they have the same shape """ if isinstance(tensors, torch.Tensor) and size_divisible > 0: tensors = [tensors] if isinstance(tensors, ImageList): return tensors elif isinstance(tensors, torch.Tensor): # single tensor shape can be inferred if tensors.dim() == 3: tensors = tensors[None] assert tensors.dim() == 4 image_sizes = [tensor.shape[-2:] for tensor in tensors] return ImageList(tensors, image_sizes) elif isinstance(tensors, (tuple, list)): max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) # TODO Ideally, just remove this and let the model handle arbitrary # input sizes if size_divisible > 0: import math stride = size_divisible max_size = list(max_size) max_size[1] = int(math.ceil(max_size[1] / stride) * stride) max_size[2] = int(math.ceil(max_size[2] / stride) * stride) max_size = tuple(max_size) batch_shape = (len(tensors),) + max_size batched_imgs = tensors[0].new(*batch_shape).zero_() for img, pad_img in zip(tensors, batched_imgs): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) image_sizes = [im.shape[-2:] for im in tensors] return ImageList(batched_imgs, image_sizes) else: raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) ================================================ FILE: maskrcnn_benchmark/structures/keypoint.py ================================================ import torch # transpose FLIP_LEFT_RIGHT = 0 FLIP_TOP_BOTTOM = 1 class Keypoints(object): def __init__(self, keypoints, size, mode=None): # FIXME remove check once we have better integration with device # in my version this would consistently return a CPU tensor device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device('cpu') keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device) num_keypoints = keypoints.shape[0] if num_keypoints: keypoints = keypoints.view(num_keypoints, -1, 3) # TODO should I split them?
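# Added note: after the view above, `keypoints` has shape
# (num_instances, num_keypoints, 3), i.e. one (x, y, visibility) triple per
# keypoint in the COCO convention (visibility == 0 means "not labeled").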
# self.visibility = keypoints[..., 2] self.keypoints = keypoints# [..., :2] self.size = size self.mode = mode self.extra_fields = {} def crop(self, box): raise NotImplementedError() def resize(self, size, *args, **kwargs): ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) ratio_w, ratio_h = ratios resized_data = self.keypoints.clone() resized_data[..., 0] *= ratio_w resized_data[..., 1] *= ratio_h keypoints = type(self)(resized_data, size, self.mode) for k, v in self.extra_fields.items(): keypoints.add_field(k, v) return keypoints def transpose(self, method): if method not in (FLIP_LEFT_RIGHT,): raise NotImplementedError( "Only FLIP_LEFT_RIGHT implemented") flip_inds = type(self).FLIP_INDS flipped_data = self.keypoints[:, flip_inds] width = self.size[0] TO_REMOVE = 1 # Flip x coordinates flipped_data[..., 0] = width - flipped_data[..., 0] - TO_REMOVE # Maintain COCO convention that if visibility == 0, then x, y = 0 inds = flipped_data[..., 2] == 0 flipped_data[inds] = 0 keypoints = type(self)(flipped_data, self.size, self.mode) for k, v in self.extra_fields.items(): keypoints.add_field(k, v) return keypoints def to(self, *args, **kwargs): keypoints = type(self)(self.keypoints.to(*args, **kwargs), self.size, self.mode) for k, v in self.extra_fields.items(): if hasattr(v, "to"): v = v.to(*args, **kwargs) keypoints.add_field(k, v) return keypoints def __getitem__(self, item): keypoints = type(self)(self.keypoints[item], self.size, self.mode) for k, v in self.extra_fields.items(): keypoints.add_field(k, v[item]) return keypoints def add_field(self, field, field_data): self.extra_fields[field] = field_data def get_field(self, field): return self.extra_fields[field] def __repr__(self): s = self.__class__.__name__ + '(' s += 'num_instances={}, '.format(len(self.keypoints)) s += 'image_width={}, '.format(self.size[0]) s += 'image_height={})'.format(self.size[1]) return s def _create_flip_indices(names, flip_map): full_flip_map = flip_map.copy() full_flip_map.update({v: k for k, v in flip_map.items()}) flipped_names = [i if i not in full_flip_map else full_flip_map[i] for i in names] flip_indices = [names.index(i) for i in flipped_names] return torch.tensor(flip_indices) class PersonKeypoints(Keypoints): NAMES = [ 'nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear', 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', 'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', 'right_knee', 'left_ankle', 'right_ankle' ] FLIP_MAP = { 'left_eye': 'right_eye', 'left_ear': 'right_ear', 'left_shoulder': 'right_shoulder', 'left_elbow': 'right_elbow', 'left_wrist': 'right_wrist', 'left_hip': 'right_hip', 'left_knee': 'right_knee', 'left_ankle': 'right_ankle' } # TODO this doesn't look great PersonKeypoints.FLIP_INDS = _create_flip_indices(PersonKeypoints.NAMES, PersonKeypoints.FLIP_MAP) def kp_connections(keypoints): kp_lines = [ [keypoints.index('left_eye'), keypoints.index('right_eye')], [keypoints.index('left_eye'), keypoints.index('nose')], [keypoints.index('right_eye'), keypoints.index('nose')], [keypoints.index('right_eye'), keypoints.index('right_ear')], [keypoints.index('left_eye'), keypoints.index('left_ear')], [keypoints.index('right_shoulder'), keypoints.index('right_elbow')], [keypoints.index('right_elbow'), keypoints.index('right_wrist')], [keypoints.index('left_shoulder'), keypoints.index('left_elbow')], [keypoints.index('left_elbow'), keypoints.index('left_wrist')], [keypoints.index('right_hip'), keypoints.index('right_knee')], 
[keypoints.index('right_knee'), keypoints.index('right_ankle')], [keypoints.index('left_hip'), keypoints.index('left_knee')], [keypoints.index('left_knee'), keypoints.index('left_ankle')], [keypoints.index('right_shoulder'), keypoints.index('left_shoulder')], [keypoints.index('right_hip'), keypoints.index('left_hip')], ] return kp_lines PersonKeypoints.CONNECTIONS = kp_connections(PersonKeypoints.NAMES) # TODO make this nicer, this is a direct translation from C2 (but removing the inner loop) def keypoints_to_heat_map(keypoints, rois, heatmap_size): if rois.numel() == 0: return rois.new().long(), rois.new().long() offset_x = rois[:, 0] offset_y = rois[:, 1] scale_x = heatmap_size / (rois[:, 2] - rois[:, 0]) scale_y = heatmap_size / (rois[:, 3] - rois[:, 1]) offset_x = offset_x[:, None] offset_y = offset_y[:, None] scale_x = scale_x[:, None] scale_y = scale_y[:, None] x = keypoints[..., 0] y = keypoints[..., 1] x_boundary_inds = x == rois[:, 2][:, None] y_boundary_inds = y == rois[:, 3][:, None] x = (x - offset_x) * scale_x x = x.floor().long() y = (y - offset_y) * scale_y y = y.floor().long() x[x_boundary_inds] = heatmap_size - 1 y[y_boundary_inds] = heatmap_size - 1 valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size) vis = keypoints[..., 2] > 0 valid = (valid_loc & vis).long() lin_ind = y * heatmap_size + x heatmaps = lin_ind * valid return heatmaps, valid ================================================ FILE: maskrcnn_benchmark/structures/segmentation_mask.py ================================================ import cv2 import torch import numpy as np from maskrcnn_benchmark.layers.misc import interpolate import pycocotools.mask as mask_utils # transpose FLIP_LEFT_RIGHT = 0 FLIP_TOP_BOTTOM = 1 """ ABSTRACT Segmentations come in either: 1) Binary masks 2) Polygons Binary masks can be represented in a contiguous array and operations can be carried out more efficiently, therefore BinaryMaskList handles them together. Polygons are handled separately for each instance, by PolygonInstance and instances are handled by PolygonList. SegmentationList is supposed to represent both, therefore it wraps the functions of BinaryMaskList and PolygonList to make it transparent. """ class BinaryMaskList(object): """ This class handles binary masks for all objects in the image """ def __init__(self, masks, size): """ Arguments: masks: Either torch.tensor of [num_instances, H, W] or list of torch.tensors of [H, W] with num_instances elems, or RLE (Run Length Encoding) - interpreted as list of dicts, or BinaryMaskList. size: absolute image size, width first After initialization, a hard copy will be made, to leave the initializing source data intact. 
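Example (an added sketch, not part of the original docstring; three empty
masks on a 640x480 image):

    masks = BinaryMaskList(torch.zeros(3, 480, 640, dtype=torch.uint8), size=(640, 480))
    flipped = masks.transpose(FLIP_LEFT_RIGHT)
    half = masks.resize((320, 240))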
""" if isinstance(masks, torch.Tensor): # The raw data representation is passed as argument masks = masks.clone() elif isinstance(masks, (list, tuple)): if isinstance(masks[0], torch.Tensor): masks = torch.stack(masks, dim=2).clone() elif isinstance(masks[0], dict) and "count" in masks[0]: # RLE interpretation masks = mask_utils else: RuntimeError( "Type of `masks[0]` could not be interpreted: %s" % type(masks) ) elif isinstance(masks, BinaryMaskList): # just hard copy the BinaryMaskList instance's underlying data masks = masks.masks.clone() else: RuntimeError( "Type of `masks` argument could not be interpreted:%s" % type(masks) ) if len(masks.shape) == 2: # if only a single instance mask is passed masks = masks[None] assert len(masks.shape) == 3 assert masks.shape[1] == size[1], "%s != %s" % (masks.shape[1], size[1]) assert masks.shape[2] == size[0], "%s != %s" % (masks.shape[2], size[0]) self.masks = masks self.size = tuple(size) def transpose(self, method): dim = 1 if method == FLIP_TOP_BOTTOM else 2 flipped_masks = self.masks.flip(dim) return BinaryMaskList(flipped_masks, self.size) def crop(self, box): assert isinstance(box, (list, tuple, torch.Tensor)), str(type(box)) # box is assumed to be xyxy current_width, current_height = self.size xmin, ymin, xmax, ymax = [round(float(b)) for b in box] assert xmin <= xmax and ymin <= ymax, str(box) xmin = min(max(xmin, 0), current_width - 1) ymin = min(max(ymin, 0), current_height - 1) xmax = min(max(xmax, 0), current_width) ymax = min(max(ymax, 0), current_height) xmax = max(xmax, xmin + 1) ymax = max(ymax, ymin + 1) width, height = xmax - xmin, ymax - ymin cropped_masks = self.masks[:, ymin:ymax, xmin:xmax] cropped_size = width, height return BinaryMaskList(cropped_masks, cropped_size) def resize(self, size): try: iter(size) except TypeError: assert isinstance(size, (int, float)) size = size, size width, height = map(int, size) assert width > 0 assert height > 0 # Height comes first here! resized_masks = torch.nn.functional.interpolate( input=self.masks[None].float(), size=(height, width), mode="bilinear", align_corners=False, )[0].type_as(self.masks) resized_size = width, height return BinaryMaskList(resized_masks, resized_size) def convert_to_polygon(self): contours = self._findContours() return PolygonList(contours, self.size) def to(self, *args, **kwargs): return self def _findContours(self): contours = [] masks = self.masks.detach().numpy() for mask in masks: mask = cv2.UMat(mask) contour, hierarchy = cv2.findContours( mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_L1 ) reshaped_contour = [] for entity in contour: assert len(entity.shape) == 3 assert entity.shape[1] == 1, "Hierarchical contours are not allowed" reshaped_contour.append(entity.reshape(-1).tolist()) contours.append(reshaped_contour) return contours def __len__(self): return len(self.masks) def __getitem__(self, index): # Probably it can cause some overhead # but preserves consistency masks = self.masks[index].clone() return BinaryMaskList(masks, self.size) def __iter__(self): return iter(self.masks) def __repr__(self): s = self.__class__.__name__ + "(" s += "num_instances={}, ".format(len(self.masks)) s += "image_width={}, ".format(self.size[0]) s += "image_height={})".format(self.size[1]) return s class PolygonInstance(object): """ This class holds a set of polygons that represents a single instance of an object mask. The object can be represented as a set of polygons """ def __init__(self, polygons, size): """ Arguments: a list of lists of numbers. 
The first level refers to all the polygons that compose the object, and the second level to the polygon coordinates. """ if isinstance(polygons, (list, tuple)): valid_polygons = [] for p in polygons: p = torch.as_tensor(p, dtype=torch.float32) if len(p) >= 6: # 3 * 2 coordinates valid_polygons.append(p) polygons = valid_polygons elif isinstance(polygons, PolygonInstance): polygons = [p.clone() for p in polygons.polygons] else: raise RuntimeError( "Type of argument `polygons` is not allowed:%s" % (type(polygons)) ) """ This crashes the training way too many times... for p in polygons: assert p[::2].min() >= 0 assert p[::2].max() < size[0] assert p[1::2].min() >= 0 assert p[1::2].max() < size[1] """ self.polygons = polygons self.size = tuple(size) def transpose(self, method): if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): raise NotImplementedError( "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" ) flipped_polygons = [] width, height = self.size if method == FLIP_LEFT_RIGHT: dim = width idx = 0 elif method == FLIP_TOP_BOTTOM: dim = height idx = 1 for poly in self.polygons: p = poly.clone() TO_REMOVE = 1 p[idx::2] = dim - poly[idx::2] - TO_REMOVE flipped_polygons.append(p) return PolygonInstance(flipped_polygons, size=self.size) def crop(self, box): assert isinstance(box, (list, tuple, torch.Tensor)), str(type(box)) # box is assumed to be xyxy current_width, current_height = self.size xmin, ymin, xmax, ymax = map(float, box) assert xmin <= xmax and ymin <= ymax, str(box) xmin = min(max(xmin, 0), current_width - 1) ymin = min(max(ymin, 0), current_height - 1) xmax = min(max(xmax, 0), current_width) ymax = min(max(ymax, 0), current_height) xmax = max(xmax, xmin + 1) ymax = max(ymax, ymin + 1) w, h = xmax - xmin, ymax - ymin cropped_polygons = [] for poly in self.polygons: p = poly.clone() p[0::2] = p[0::2] - xmin # .clamp(min=0, max=w) p[1::2] = p[1::2] - ymin # .clamp(min=0, max=h) cropped_polygons.append(p) return PolygonInstance(cropped_polygons, size=(w, h)) def resize(self, size): try: iter(size) except TypeError: assert isinstance(size, (int, float)) size = size, size ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) if ratios[0] == ratios[1]: ratio = ratios[0] scaled_polys = [p * ratio for p in self.polygons] return PolygonInstance(scaled_polys, size) ratio_w, ratio_h = ratios scaled_polygons = [] for poly in self.polygons: p = poly.clone() p[0::2] *= ratio_w p[1::2] *= ratio_h scaled_polygons.append(p) return PolygonInstance(scaled_polygons, size=size) def convert_to_binarymask(self): width, height = self.size # formatting for COCO PythonAPI polygons = [p.numpy() for p in self.polygons] rles = mask_utils.frPyObjects(polygons, height, width) rle = mask_utils.merge(rles) mask = mask_utils.decode(rle) mask = torch.from_numpy(mask) return mask def __len__(self): return len(self.polygons) def __repr__(self): s = self.__class__.__name__ + "(" s += "num_groups={}, ".format(len(self.polygons)) s += "image_width={}, ".format(self.size[0]) s += "image_height={})".format(self.size[1]) return s class PolygonList(object): """ This class handles PolygonInstances for all objects in the image """ def __init__(self, polygons, size): """ Arguments: polygons: a list of list of lists of numbers. The first level of the list corresponds to individual instances, the second level to all the polygons that compose the object, and the third level to the polygon coordinates. OR a list of PolygonInstances.
OR a PolygonList size: absolute image size """ if isinstance(polygons, (list, tuple)): if len(polygons) == 0: polygons = [[[]]] if isinstance(polygons[0], (list, tuple)): assert isinstance(polygons[0][0], (list, tuple)), str( type(polygons[0][0]) ) else: assert isinstance(polygons[0], PolygonInstance), str(type(polygons[0])) elif isinstance(polygons, PolygonList): size = polygons.size polygons = polygons.polygons else: raise RuntimeError( "Type of argument `polygons` is not allowed:%s" % (type(polygons)) ) assert isinstance(size, (list, tuple)), str(type(size)) self.polygons = [] for p in polygons: p = PolygonInstance(p, size) if len(p) > 0: self.polygons.append(p) self.size = tuple(size) def transpose(self, method): if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): raise NotImplementedError( "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" ) flipped_polygons = [] for polygon in self.polygons: flipped_polygons.append(polygon.transpose(method)) return PolygonList(flipped_polygons, size=self.size) def crop(self, box): w, h = box[2] - box[0], box[3] - box[1] cropped_polygons = [] for polygon in self.polygons: cropped_polygons.append(polygon.crop(box)) cropped_size = w, h return PolygonList(cropped_polygons, cropped_size) def resize(self, size): resized_polygons = [] for polygon in self.polygons: resized_polygons.append(polygon.resize(size)) resized_size = size return PolygonList(resized_polygons, resized_size) def to(self, *args, **kwargs): return self def convert_to_binarymask(self): if len(self) > 0: masks = torch.stack([p.convert_to_binarymask() for p in self.polygons]) else: size = self.size masks = torch.empty([0, size[1], size[0]], dtype=torch.uint8) return BinaryMaskList(masks, size=self.size) def __len__(self): return len(self.polygons) def __getitem__(self, item): if isinstance(item, int): selected_polygons = [self.polygons[item]] elif isinstance(item, slice): selected_polygons = self.polygons[item] else: # advanced indexing on a single dimension selected_polygons = [] if isinstance(item, torch.Tensor) and item.dtype == torch.uint8: item = item.nonzero() item = item.squeeze(1) if item.numel() > 0 else item item = item.tolist() for i in item: selected_polygons.append(self.polygons[i]) return PolygonList(selected_polygons, size=self.size) def __iter__(self): return iter(self.polygons) def __repr__(self): s = self.__class__.__name__ + "(" s += "num_instances={}, ".format(len(self.polygons)) s += "image_width={}, ".format(self.size[0]) s += "image_height={})".format(self.size[1]) return s class SegmentationMask(object): """ This class stores the segmentations for all objects in the image. It wraps BinaryMaskList and PolygonList conveniently. """ def __init__(self, instances, size, mode="poly"): """ Arguments: instances: two types (1) polygon (2) binary mask size: (width, height) mode: 'poly', 'mask'.
if mode is 'mask', convert mask of any format to binary mask """ assert isinstance(size, (list, tuple)) assert len(size) == 2 if isinstance(size[0], torch.Tensor): assert isinstance(size[1], torch.Tensor) size = size[0].item(), size[1].item() assert isinstance(size[0], (int, float)) assert isinstance(size[1], (int, float)) if mode == "poly": self.instances = PolygonList(instances, size) elif mode == "mask": self.instances = BinaryMaskList(instances, size) else: raise NotImplementedError("Unknown mode: %s" % str(mode)) self.mode = mode self.size = tuple(size) def transpose(self, method): flipped_instances = self.instances.transpose(method) return SegmentationMask(flipped_instances, self.size, self.mode) def crop(self, box): cropped_instances = self.instances.crop(box) cropped_size = cropped_instances.size return SegmentationMask(cropped_instances, cropped_size, self.mode) def resize(self, size, *args, **kwargs): resized_instances = self.instances.resize(size) resized_size = size return SegmentationMask(resized_instances, resized_size, self.mode) def to(self, *args, **kwargs): return self def convert(self, mode): if mode == self.mode: return self if mode == "poly": converted_instances = self.instances.convert_to_polygon() elif mode == "mask": converted_instances = self.instances.convert_to_binarymask() else: raise NotImplementedError("Unknown mode: %s" % str(mode)) return SegmentationMask(converted_instances, self.size, mode) def get_mask_tensor(self): instances = self.instances if self.mode == "poly": instances = instances.convert_to_binarymask() # If there is only 1 instance return instances.masks.squeeze(0) def __len__(self): return len(self.instances) def __getitem__(self, item): selected_instances = self.instances.__getitem__(item) return SegmentationMask(selected_instances, self.size, self.mode) def __iter__(self): self.iter_idx = 0 return self def __next__(self): if self.iter_idx < self.__len__(): next_segmentation = self.__getitem__(self.iter_idx) self.iter_idx += 1 return next_segmentation raise StopIteration def __repr__(self): s = self.__class__.__name__ + "(" s += "num_instances={}, ".format(len(self.instances)) s += "image_width={}, ".format(self.size[0]) s += "image_height={}, ".format(self.size[1]) s += "mode={})".format(self.mode) return s ================================================ FILE: maskrcnn_benchmark/utils/README.md ================================================ # Utility functions This folder contains utility functions that are not used in the core library, but are useful for building models or training code using the config system. ================================================ FILE: maskrcnn_benchmark/utils/__init__.py ================================================ ================================================ FILE: maskrcnn_benchmark/utils/c2_model_loading.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
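# Added note (illustrative, derived from the rename rules below): a
# Detectron/Caffe2 blob name such as "res2_0_branch2a_w" ends up as the
# torchvision-style parameter name "layer1.0.conv1.weight" after the
# renaming passes in this file.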
import logging import pickle from collections import OrderedDict import torch from maskrcnn_benchmark.utils.model_serialization import load_state_dict from maskrcnn_benchmark.utils.registry import Registry def _rename_basic_resnet_weights(layer_keys): layer_keys = [k.replace("_", ".") for k in layer_keys] layer_keys = [k.replace(".w", ".weight") for k in layer_keys] layer_keys = [k.replace(".bn", "_bn") for k in layer_keys] layer_keys = [k.replace(".b", ".bias") for k in layer_keys] layer_keys = [k.replace("_bn.s", "_bn.scale") for k in layer_keys] layer_keys = [k.replace(".biasranch", ".branch") for k in layer_keys] layer_keys = [k.replace("bbox.pred", "bbox_pred") for k in layer_keys] layer_keys = [k.replace("cls.score", "cls_score") for k in layer_keys] layer_keys = [k.replace("res.conv1_", "conv1_") for k in layer_keys] # RPN / Faster RCNN layer_keys = [k.replace(".biasbox", ".bbox") for k in layer_keys] layer_keys = [k.replace("conv.rpn", "rpn.conv") for k in layer_keys] layer_keys = [k.replace("rpn.bbox.pred", "rpn.bbox_pred") for k in layer_keys] layer_keys = [k.replace("rpn.cls.logits", "rpn.cls_logits") for k in layer_keys] # Affine-Channel -> BatchNorm renaming layer_keys = [k.replace("_bn.scale", "_bn.weight") for k in layer_keys] # Make torchvision-compatible layer_keys = [k.replace("conv1_bn.", "bn1.") for k in layer_keys] layer_keys = [k.replace("res2.", "layer1.") for k in layer_keys] layer_keys = [k.replace("res3.", "layer2.") for k in layer_keys] layer_keys = [k.replace("res4.", "layer3.") for k in layer_keys] layer_keys = [k.replace("res5.", "layer4.") for k in layer_keys] layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys] layer_keys = [k.replace(".branch2a_bn.", ".bn1.") for k in layer_keys] layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys] layer_keys = [k.replace(".branch2b_bn.", ".bn2.") for k in layer_keys] layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys] layer_keys = [k.replace(".branch2c_bn.", ".bn3.") for k in layer_keys] layer_keys = [k.replace(".branch1.", ".downsample.0.") for k in layer_keys] layer_keys = [k.replace(".branch1_bn.", ".downsample.1.") for k in layer_keys] # GroupNorm layer_keys = [k.replace("conv1.gn.s", "bn1.weight") for k in layer_keys] layer_keys = [k.replace("conv1.gn.bias", "bn1.bias") for k in layer_keys] layer_keys = [k.replace("conv2.gn.s", "bn2.weight") for k in layer_keys] layer_keys = [k.replace("conv2.gn.bias", "bn2.bias") for k in layer_keys] layer_keys = [k.replace("conv3.gn.s", "bn3.weight") for k in layer_keys] layer_keys = [k.replace("conv3.gn.bias", "bn3.bias") for k in layer_keys] layer_keys = [k.replace("downsample.0.gn.s", "downsample.1.weight") \ for k in layer_keys] layer_keys = [k.replace("downsample.0.gn.bias", "downsample.1.bias") \ for k in layer_keys] return layer_keys def _rename_fpn_weights(layer_keys, stage_names): for mapped_idx, stage_name in enumerate(stage_names, 1): suffix = "" if mapped_idx < 4: suffix = ".lateral" layer_keys = [ k.replace("fpn.inner.layer{}.sum{}".format(stage_name, suffix), "fpn_inner{}".format(mapped_idx)) for k in layer_keys ] layer_keys = [k.replace("fpn.layer{}.sum".format(stage_name), "fpn_layer{}".format(mapped_idx)) for k in layer_keys] layer_keys = [k.replace("rpn.conv.fpn2", "rpn.conv") for k in layer_keys] layer_keys = [k.replace("rpn.bbox_pred.fpn2", "rpn.bbox_pred") for k in layer_keys] layer_keys = [ k.replace("rpn.cls_logits.fpn2", "rpn.cls_logits") for k in layer_keys ] return layer_keys def
_rename_weights_for_resnet(weights, stage_names): original_keys = sorted(weights.keys()) layer_keys = sorted(weights.keys()) # for X-101, rename output to fc1000 to avoid conflicts afterwards layer_keys = [k if k != "pred_b" else "fc1000_b" for k in layer_keys] layer_keys = [k if k != "pred_w" else "fc1000_w" for k in layer_keys] # performs basic renaming: _ -> . , etc layer_keys = _rename_basic_resnet_weights(layer_keys) # FPN layer_keys = _rename_fpn_weights(layer_keys, stage_names) # Mask R-CNN layer_keys = [k.replace("mask.fcn.logits", "mask_fcn_logits") for k in layer_keys] layer_keys = [k.replace(".[mask].fcn", "mask_fcn") for k in layer_keys] layer_keys = [k.replace("conv5.mask", "conv5_mask") for k in layer_keys] # Keypoint R-CNN layer_keys = [k.replace("kps.score.lowres", "kps_score_lowres") for k in layer_keys] layer_keys = [k.replace("kps.score", "kps_score") for k in layer_keys] layer_keys = [k.replace("conv.fcn", "conv_fcn") for k in layer_keys] # Rename for our RPN structure layer_keys = [k.replace("rpn.", "rpn.head.") for k in layer_keys] key_map = {k: v for k, v in zip(original_keys, layer_keys)} logger = logging.getLogger(__name__) logger.info("Remapping C2 weights") max_c2_key_size = max([len(k) for k in original_keys if "_momentum" not in k]) new_weights = OrderedDict() for k in original_keys: v = weights[k] if "_momentum" in k: continue # if 'fc1000' in k: # continue w = torch.from_numpy(v) # if "bn" in k: # w = w.view(1, -1, 1, 1) logger.info("C2 name: {: <{}} mapped name: {}".format(k, max_c2_key_size, key_map[k])) new_weights[key_map[k]] = w return new_weights def _load_c2_pickled_weights(file_path): with open(file_path, "rb") as f: if torch._six.PY3: data = pickle.load(f, encoding="latin1") else: data = pickle.load(f) if "blobs" in data: weights = data["blobs"] else: weights = data return weights _C2_STAGE_NAMES = { "R-50": ["1.2", "2.3", "3.5", "4.2"], "R-101": ["1.2", "2.3", "3.22", "4.2"], "R-152": ["1.2", "2.7", "3.35", "4.2"], } C2_FORMAT_LOADER = Registry() @C2_FORMAT_LOADER.register("R-50-C4") @C2_FORMAT_LOADER.register("R-50-C5") @C2_FORMAT_LOADER.register("R-101-C4") @C2_FORMAT_LOADER.register("R-101-C5") @C2_FORMAT_LOADER.register("R-50-FPN") @C2_FORMAT_LOADER.register("R-50-FPN-RETINANET") @C2_FORMAT_LOADER.register("R-101-FPN") @C2_FORMAT_LOADER.register("R-101-FPN-RETINANET") @C2_FORMAT_LOADER.register("R-152-FPN") def load_resnet_c2_format(cfg, f): state_dict = _load_c2_pickled_weights(f) conv_body = cfg.MODEL.BACKBONE.CONV_BODY arch = conv_body.replace("-C4", "").replace("-C5", "").replace("-FPN", "") arch = arch.replace("-RETINANET", "") stages = _C2_STAGE_NAMES[arch] state_dict = _rename_weights_for_resnet(state_dict, stages) return dict(model=state_dict) def load_c2_format(cfg, f): return C2_FORMAT_LOADER[cfg.MODEL.BACKBONE.CONV_BODY](cfg, f) ================================================ FILE: maskrcnn_benchmark/utils/checkpoint.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
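# Added usage sketch (not part of the original file; assumes a built model,
# optimizer and scheduler, and a config `cfg`):
#
#     checkpointer = DetectronCheckpointer(
#         cfg, model, optimizer, scheduler, save_dir="output", save_to_disk=True
#     )
#     extra_data = checkpointer.load(cfg.MODEL.WEIGHT)  # resumes from last_checkpoint if one exists
#     checkpointer.save("model_0001000", iteration=1000)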
import logging import os import torch from maskrcnn_benchmark.utils.model_serialization import load_state_dict from maskrcnn_benchmark.utils.c2_model_loading import load_c2_format from maskrcnn_benchmark.utils.imports import import_file from maskrcnn_benchmark.utils.model_zoo import cache_url class Checkpointer(object): def __init__( self, model, optimizer=None, scheduler=None, save_dir="", save_to_disk=None, logger=None, ): self.model = model self.optimizer = optimizer self.scheduler = scheduler self.save_dir = save_dir self.save_to_disk = save_to_disk if logger is None: logger = logging.getLogger(__name__) self.logger = logger def save(self, name, **kwargs): if not self.save_dir: return if not self.save_to_disk: return data = {} data["model"] = self.model.state_dict() if self.optimizer is not None: data["optimizer"] = self.optimizer.state_dict() if self.scheduler is not None: data["scheduler"] = self.scheduler.state_dict() data.update(kwargs) save_file = os.path.join(self.save_dir, "{}.pth".format(name)) self.logger.info("Saving checkpoint to {}".format(save_file)) torch.save(data, save_file) self.tag_last_checkpoint(save_file) def load(self, f=None): if self.has_checkpoint(): # override argument with existing checkpoint f = self.get_checkpoint_file() if not f: # no checkpoint could be found self.logger.info("No checkpoint found. Initializing model from scratch") return {} self.logger.info("Loading checkpoint from {}".format(f)) checkpoint = self._load_file(f) self._load_model(checkpoint) if "optimizer" in checkpoint and self.optimizer: self.logger.info("Loading optimizer from {}".format(f)) self.optimizer.load_state_dict(checkpoint.pop("optimizer")) if "scheduler" in checkpoint and self.scheduler: self.logger.info("Loading scheduler from {}".format(f)) self.scheduler.load_state_dict(checkpoint.pop("scheduler")) # return any further checkpoint data return checkpoint def has_checkpoint(self): save_file = os.path.join(self.save_dir, "last_checkpoint") return os.path.exists(save_file) def get_checkpoint_file(self): save_file = os.path.join(self.save_dir, "last_checkpoint") try: with open(save_file, "r") as f: last_saved = f.read() last_saved = last_saved.strip() except IOError: # if file doesn't exist, maybe because it has just been # deleted by a separate process last_saved = "" return last_saved def tag_last_checkpoint(self, last_filename): save_file = os.path.join(self.save_dir, "last_checkpoint") with open(save_file, "w") as f: f.write(last_filename) def _load_file(self, f): return torch.load(f, map_location=torch.device("cpu")) def _load_model(self, checkpoint): load_state_dict(self.model, checkpoint.pop("model")) class DetectronCheckpointer(Checkpointer): def __init__( self, cfg, model, optimizer=None, scheduler=None, save_dir="", save_to_disk=None, logger=None, ): super(DetectronCheckpointer, self).__init__( model, optimizer, scheduler, save_dir, save_to_disk, logger ) self.cfg = cfg.clone() def _load_file(self, f): # catalog lookup if f.startswith("catalog://"): paths_catalog = import_file( "maskrcnn_benchmark.config.paths_catalog", self.cfg.PATHS_CATALOG, True ) catalog_f = paths_catalog.ModelCatalog.get(f[len("catalog://") :]) self.logger.info("{} points to {}".format(f, catalog_f)) f = catalog_f # download url files if f.startswith("http"): # if the file is a url path, download it and cache it cached_f = cache_url(f) self.logger.info("url {} cached in {}".format(f, cached_f)) f = cached_f # convert Caffe2 checkpoint from pkl if f.endswith(".pkl"): return load_c2_format(self.cfg, 
f) # load native detectron.pytorch checkpoint loaded = super(DetectronCheckpointer, self)._load_file(f) if "model" not in loaded: loaded = dict(model=loaded) return loaded ================================================ FILE: maskrcnn_benchmark/utils/collect_env.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import PIL from torch.utils.collect_env import get_pretty_env_info def get_pil_version(): return "\n Pillow ({})".format(PIL.__version__) def collect_env_info(): env_str = get_pretty_env_info() env_str += get_pil_version() return env_str ================================================ FILE: maskrcnn_benchmark/utils/comm.py ================================================ """ This file contains primitives for multi-gpu communication. This is useful when doing distributed training. """ import pickle import time import torch import torch.distributed as dist def get_world_size(): if not dist.is_available(): return 1 if not dist.is_initialized(): return 1 return dist.get_world_size() def get_rank(): if not dist.is_available(): return 0 if not dist.is_initialized(): return 0 return dist.get_rank() def is_main_process(): return get_rank() == 0 def synchronize(): """ Helper function to synchronize (barrier) among all processes when using distributed training """ if not dist.is_available(): return if not dist.is_initialized(): return world_size = dist.get_world_size() if world_size == 1: return dist.barrier() def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.IntTensor([tensor.numel()]).to("cuda") size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) if local_size != max_size: padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list def reduce_dict(input_dict, average=True): """ Args: input_dict (dict): all the values will be reduced average (bool): whether to do average or sum Reduce the values in the dictionary from all processes so that process with rank 0 has the averaged results. Returns a dict with the same fields as input_dict, after reduction. 
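Example (added illustration): if every process computes
loss_dict = {"loss_cls": ..., "loss_box": ...}, then after
reduce_dict(loss_dict) the values held by rank 0 are the losses
averaged over all processes.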
""" world_size = get_world_size() if world_size < 2: return input_dict with torch.no_grad(): names = [] values = [] # sort the keys so that they are consistent across processes for k in sorted(input_dict.keys()): names.append(k) values.append(input_dict[k]) values = torch.stack(values, dim=0) dist.reduce(values, dst=0) if dist.get_rank() == 0 and average: # only main process gets accumulated, so only divide by # world_size in this case values /= world_size reduced_dict = {k: v for k, v in zip(names, values)} return reduced_dict def is_pytorch_1_1_0_or_later(): return [int(_) for _ in torch.__version__.split(".")[:3]] >= [1, 1, 0] ================================================ FILE: maskrcnn_benchmark/utils/cv2_util.py ================================================ """ Module for cv2 utility functions and maintaining version compatibility between 3.x and 4.x """ import cv2 def findContours(*args, **kwargs): """ Wraps cv2.findContours to maintain compatiblity between versions 3 and 4 Returns: contours, hierarchy """ if cv2.__version__.startswith('4'): contours, hierarchy = cv2.findContours(*args, **kwargs) elif cv2.__version__.startswith('3'): _, contours, hierarchy = cv2.findContours(*args, **kwargs) else: raise AssertionError( 'cv2 must be either version 3 or 4 to call this method') return contours, hierarchy ================================================ FILE: maskrcnn_benchmark/utils/env.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import os from maskrcnn_benchmark.utils.imports import import_file def setup_environment(): """Perform environment setup work. The default setup is a no-op, but this function allows the user to specify a Python source file that performs custom setup work that may be necessary to their computing environment. """ custom_module_path = os.environ.get("TORCH_DETECTRON_ENV_MODULE") if custom_module_path: setup_custom_environment(custom_module_path) else: # The default setup is a no-op pass def setup_custom_environment(custom_module_path): """Load custom environment setup from a Python source file and run the setup function. """ module = import_file("maskrcnn_benchmark.utils.env.custom_module", custom_module_path) assert hasattr(module, "setup_environment") and callable( module.setup_environment ), ( "Custom environment module defined in {} does not have the " "required callable attribute 'setup_environment'." ).format( custom_module_path ) module.setup_environment() # Force environment setup when this module is imported setup_environment() ================================================ FILE: maskrcnn_benchmark/utils/imports.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
import torch if torch._six.PY3: import importlib import importlib.util import sys # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa def import_file(module_name, file_path, make_importable=False): spec = importlib.util.spec_from_file_location(module_name, file_path) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) if make_importable: sys.modules[module_name] = module return module else: import imp def import_file(module_name, file_path, make_importable=None): module = imp.load_source(module_name, file_path) return module ================================================ FILE: maskrcnn_benchmark/utils/logger.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import logging import os import sys def setup_logger(name, save_dir, distributed_rank, filename="log.txt"): logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) # don't log results for the non-master process if distributed_rank > 0: return logger ch = logging.StreamHandler(stream=sys.stdout) ch.setLevel(logging.DEBUG) formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") ch.setFormatter(formatter) logger.addHandler(ch) if save_dir: fh = logging.FileHandler(os.path.join(save_dir, filename)) fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) logger.addHandler(fh) return logger ================================================ FILE: maskrcnn_benchmark/utils/metric_logger.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from collections import defaultdict from collections import deque import torch class SmoothedValue(object): """Track a series of values and provide access to smoothed values over a window or the global series average. """ def __init__(self, window_size=20): self.deque = deque(maxlen=window_size) self.series = [] self.total = 0.0 self.count = 0 def update(self, value): self.deque.append(value) self.series.append(value) self.count += 1 self.total += value @property def median(self): d = torch.tensor(list(self.deque)) return d.median().item() @property def avg(self): d = torch.tensor(list(self.deque)) return d.mean().item() @property def global_avg(self): return self.total / self.count class MetricLogger(object): def __init__(self, delimiter="\t"): self.meters = defaultdict(SmoothedValue) self.delimiter = delimiter def update(self, **kwargs): for k, v in kwargs.items(): if isinstance(v, torch.Tensor): v = v.item() assert isinstance(v, (float, int)) self.meters[k].update(v) def __getattr__(self, attr): if attr in self.meters: return self.meters[attr] if attr in self.__dict__: return self.__dict__[attr] raise AttributeError("'{}' object has no attribute '{}'".format( type(self).__name__, attr)) def __str__(self): loss_str = [] for name, meter in self.meters.items(): loss_str.append( "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) ) return self.delimiter.join(loss_str) ================================================ FILE: maskrcnn_benchmark/utils/miscellaneous.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
import errno import os def mkdir(path): try: os.makedirs(path) except OSError as e: if e.errno != errno.EEXIST: raise ================================================ FILE: maskrcnn_benchmark/utils/model_serialization.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from collections import OrderedDict import logging import torch from maskrcnn_benchmark.utils.imports import import_file def align_and_update_state_dicts(model_state_dict, loaded_state_dict): """ Strategy: suppose that the models that we will create will have prefixes appended to each of their keys, for example due to an extra level of nesting that the original pre-trained weights from ImageNet won't contain. For example, model.state_dict() might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains res2.conv1.weight. We thus want to match both parameters together. For that, for each model weight we look among all the loaded keys for one that is a suffix of the current weight name, and use it if that's the case. If multiple matches exist, take the one whose corresponding name is longest. For example, for the same model as before, the pretrained weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, we want to match backbone[0].body.conv1.weight to conv1.weight, and backbone[0].body.res2.conv1.weight to res2.conv1.weight. """ current_keys = sorted(list(model_state_dict.keys())) loaded_keys = sorted(list(loaded_state_dict.keys())) # get a matrix of string matches, where each (i, j) entry corresponds to the size of the # loaded_key string, if it matches match_matrix = [ len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys ] match_matrix = torch.as_tensor(match_matrix).view( len(current_keys), len(loaded_keys) ) max_match_size, idxs = match_matrix.max(1) # remove indices that correspond to no-match idxs[max_match_size == 0] = -1 # used for logging max_size = max([len(key) for key in current_keys]) if current_keys else 1 max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1 log_str_template = "{: <{}} loaded from {: <{}} of shape {}" logger = logging.getLogger(__name__) for idx_new, idx_old in enumerate(idxs.tolist()): if idx_old == -1: continue key = current_keys[idx_new] key_old = loaded_keys[idx_old] model_state_dict[key] = loaded_state_dict[key_old] logger.info( log_str_template.format( key, max_size, key_old, max_size_loaded, tuple(loaded_state_dict[key_old].shape), ) ) def strip_prefix_if_present(state_dict, prefix): keys = sorted(state_dict.keys()) if not all(key.startswith(prefix) for key in keys): return state_dict stripped_state_dict = OrderedDict() for key, value in state_dict.items(): stripped_state_dict[key.replace(prefix, "")] = value return stripped_state_dict def load_state_dict(model, loaded_state_dict): model_state_dict = model.state_dict() # if the state_dict comes from a model that was wrapped in a # DataParallel or DistributedDataParallel during serialization, # remove the "module" prefix before performing the matching loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.") align_and_update_state_dicts(model_state_dict, loaded_state_dict) # use strict loading model.load_state_dict(model_state_dict) ================================================ FILE: maskrcnn_benchmark/utils/model_zoo.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates.
All Rights Reserved. import os import sys try: from torch.utils.model_zoo import _download_url_to_file from torch.utils.model_zoo import urlparse from torch.utils.model_zoo import HASH_REGEX except ImportError: from torch.hub import _download_url_to_file from torch.hub import urlparse from torch.hub import HASH_REGEX from maskrcnn_benchmark.utils.comm import is_main_process from maskrcnn_benchmark.utils.comm import synchronize # very similar to https://github.com/pytorch/pytorch/blob/master/torch/utils/model_zoo.py # but with a few improvements and modifications def cache_url(url, model_dir=None, progress=True): r"""Loads the Torch serialized object at the given URL. If the object is already present in `model_dir`, it's deserialized and returned. The filename part of the URL should follow the naming convention ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more digits of the SHA256 hash of the contents of the file. The hash is used to ensure unique names and to verify the contents of the file. The default value of `model_dir` is ``$TORCH_HOME/models`` where ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be overridden with the ``$TORCH_MODEL_ZOO`` environment variable. Args: url (string): URL of the object to download model_dir (string, optional): directory in which to save the object progress (bool, optional): whether or not to display a progress bar to stderr Example: >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') """ if model_dir is None: torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch')) model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models')) if not os.path.exists(model_dir): os.makedirs(model_dir) parts = urlparse(url) if parts.fragment != "": filename = parts.fragment else: filename = os.path.basename(parts.path) if filename == "model_final.pkl": # workaround as pre-trained Caffe2 models from Detectron have all the same filename # so make the full path the filename by replacing / with _ filename = parts.path.replace("/", "_") cached_file = os.path.join(model_dir, filename) if not os.path.exists(cached_file) and is_main_process(): sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) hash_prefix = HASH_REGEX.search(filename) if hash_prefix is not None: hash_prefix = hash_prefix.group(1) # workaround: Caffe2 models don't have a hash, but follow the R-50 convention, # which matches the hash PyTorch uses. So we skip the hash matching # if the hash_prefix is less than 6 characters if len(hash_prefix) < 6: hash_prefix = None _download_url_to_file(url, cached_file, hash_prefix, progress=progress) synchronize() return cached_file ================================================ FILE: maskrcnn_benchmark/utils/registry.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. def _register_generic(module_dict, module_name, module): assert module_name not in module_dict module_dict[module_name] = module class Registry(dict): ''' A helper class for managing registered modules; it extends a dictionary and provides a register function. E.g. creating a registry: some_registry = Registry({"default": default_module}) There are two ways of registering new modules: 1): the normal way is just calling the register function: def foo(): ...
some_registry.register("foo_module", foo) 2): used as a decorator when declaring the module: @some_registry.register("foo_module") @some_registry.register("foo_module_nickname") def foo(): ... Access of a module is just like using a dictionary, e.g.: f = some_registry["foo_module"] ''' def __init__(self, *args, **kwargs): super(Registry, self).__init__(*args, **kwargs) def register(self, module_name, module=None): # used as function call if module is not None: _register_generic(self, module_name, module) return # used as decorator def register_fn(fn): _register_generic(self, module_name, fn) return fn return register_fn ================================================ FILE: maskrcnn_benchmark/utils/timer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import time import datetime class Timer(object): def __init__(self): self.reset() @property def average_time(self): return self.total_time / self.calls if self.calls > 0 else 0.0 def tic(self): # using time.time instead of time.clock because time.clock # does not normalize for multithreading self.start_time = time.time() def toc(self, average=True): self.add(time.time() - self.start_time) if average: return self.average_time else: return self.diff def add(self, time_diff): self.diff = time_diff self.total_time += self.diff self.calls += 1 def reset(self): self.total_time = 0.0 self.calls = 0 self.start_time = 0.0 self.diff = 0.0 def avg_time_str(self): time_str = str(datetime.timedelta(seconds=self.average_time)) return time_str def get_time_str(time_diff): time_str = str(datetime.timedelta(seconds=time_diff)) return time_str ================================================ FILE: requirements.txt ================================================ ninja yacs cython matplotlib tqdm ================================================ FILE: setup.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
================================================
FILE: requirements.txt
================================================
ninja
yacs
cython
matplotlib
tqdm

================================================
FILE: setup.py
================================================
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

import glob
import os

import torch
from setuptools import find_packages
from setuptools import setup
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension

requirements = ["torch", "torchvision"]


def get_extensions():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "maskrcnn_benchmark", "csrc")

    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))

    sources = main_file + source_cpu
    extension = CppExtension

    extra_compile_args = {"cxx": []}
    define_macros = []

    if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1":
        extension = CUDAExtension
        sources += source_cuda
        define_macros += [("WITH_CUDA", None)]
        extra_compile_args["nvcc"] = [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]

    sources = [os.path.join(extensions_dir, s) for s in sources]

    include_dirs = [extensions_dir]

    ext_modules = [
        extension(
            "maskrcnn_benchmark._C",
            sources,
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]

    return ext_modules


setup(
    name="maskrcnn_benchmark",
    version="0.1",
    author="fmassa",
    url="https://github.com/facebookresearch/maskrcnn-benchmark",
    description="object detection in pytorch",
    packages=find_packages(exclude=("configs", "tests",)),
    # install_requires=requirements,
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
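As the condition in get_extensions() shows, the CUDA sources are compiled whenever a GPU toolchain is visible, and setting FORCE_CUDA=1 forces the CUDA build otherwise. After building the extension in-place (typically `python setup.py build develop`; see INSTALL.md), a quick smoke test can confirm the compiled module is importable. This is a sketch, not part of the repository:

    import torch  # torch must be imported before the compiled extension
    from maskrcnn_benchmark import _C  # built as "maskrcnn_benchmark._C" above

    # list the custom ops the build exposed (e.g. nms)
    print([name for name in dir(_C) if not name.startswith("_")])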
================================================
FILE: tests/checkpoint.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

from collections import OrderedDict
import os
from tempfile import TemporaryDirectory
import unittest

import torch
from torch import nn

from maskrcnn_benchmark.utils.model_serialization import load_state_dict
from maskrcnn_benchmark.utils.checkpoint import Checkpointer


class TestCheckpointer(unittest.TestCase):
    def create_model(self):
        return nn.Sequential(nn.Linear(2, 3), nn.Linear(3, 1))

    def create_complex_model(self):
        m = nn.Module()
        m.block1 = nn.Module()
        m.block1.layer1 = nn.Linear(2, 3)
        m.layer2 = nn.Linear(3, 2)
        m.res = nn.Module()
        m.res.layer2 = nn.Linear(3, 2)

        state_dict = OrderedDict()
        state_dict["layer1.weight"] = torch.rand(3, 2)
        state_dict["layer1.bias"] = torch.rand(3)
        state_dict["layer2.weight"] = torch.rand(2, 3)
        state_dict["layer2.bias"] = torch.rand(2)
        state_dict["res.layer2.weight"] = torch.rand(2, 3)
        state_dict["res.layer2.bias"] = torch.rand(2)
        return m, state_dict

    def test_from_last_checkpoint_model(self):
        # test that loading works even if they differ by a prefix
        for trained_model, fresh_model in [
            (self.create_model(), self.create_model()),
            (nn.DataParallel(self.create_model()), self.create_model()),
            (self.create_model(), nn.DataParallel(self.create_model())),
            (
                nn.DataParallel(self.create_model()),
                nn.DataParallel(self.create_model()),
            ),
        ]:
            with TemporaryDirectory() as f:
                checkpointer = Checkpointer(
                    trained_model, save_dir=f, save_to_disk=True
                )
                checkpointer.save("checkpoint_file")

                # in the same folder
                fresh_checkpointer = Checkpointer(fresh_model, save_dir=f)
                self.assertTrue(fresh_checkpointer.has_checkpoint())
                self.assertEqual(
                    fresh_checkpointer.get_checkpoint_file(),
                    os.path.join(f, "checkpoint_file.pth"),
                )
                _ = fresh_checkpointer.load()

            for trained_p, loaded_p in zip(
                trained_model.parameters(), fresh_model.parameters()
            ):
                # different tensor references
                self.assertFalse(id(trained_p) == id(loaded_p))
                # same content
                self.assertTrue(trained_p.equal(loaded_p))

    def test_from_name_file_model(self):
        # test that loading works even if they differ by a prefix
        for trained_model, fresh_model in [
            (self.create_model(), self.create_model()),
            (nn.DataParallel(self.create_model()), self.create_model()),
            (self.create_model(), nn.DataParallel(self.create_model())),
            (
                nn.DataParallel(self.create_model()),
                nn.DataParallel(self.create_model()),
            ),
        ]:
            with TemporaryDirectory() as f:
                checkpointer = Checkpointer(
                    trained_model, save_dir=f, save_to_disk=True
                )
                checkpointer.save("checkpoint_file")

                # on different folders
                with TemporaryDirectory() as g:
                    fresh_checkpointer = Checkpointer(fresh_model, save_dir=g)
                    self.assertFalse(fresh_checkpointer.has_checkpoint())
                    self.assertEqual(fresh_checkpointer.get_checkpoint_file(), "")
                    _ = fresh_checkpointer.load(os.path.join(f, "checkpoint_file.pth"))

            for trained_p, loaded_p in zip(
                trained_model.parameters(), fresh_model.parameters()
            ):
                # different tensor references
                self.assertFalse(id(trained_p) == id(loaded_p))
                # same content
                self.assertTrue(trained_p.equal(loaded_p))

    def test_complex_model_loaded(self):
        for add_data_parallel in [False, True]:
            model, state_dict = self.create_complex_model()
            if add_data_parallel:
                model = nn.DataParallel(model)

            load_state_dict(model, state_dict)
            for loaded, stored in zip(model.state_dict().values(), state_dict.values()):
                # different tensor references
                self.assertFalse(id(loaded) == id(stored))
                # same content
                self.assertTrue(loaded.equal(stored))


if __name__ == "__main__":
    unittest.main()
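The tests above double as documentation for the Checkpointer API. A minimal save/resume sketch distilled from them (not from the repository; the directory and the toy model are illustrative):

    from torch import nn
    from maskrcnn_benchmark.utils.checkpoint import Checkpointer

    model = nn.Linear(2, 3)
    checkpointer = Checkpointer(model, save_dir="/tmp/ckpts", save_to_disk=True)
    checkpointer.save("model_0001000")  # writes /tmp/ckpts/model_0001000.pth

    # later, possibly in a fresh process: resume from the newest checkpoint if any
    resumer = Checkpointer(nn.Linear(2, 3), save_dir="/tmp/ckpts")
    if resumer.has_checkpoint():
        extra = resumer.load()  # restores the weights in place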
================================================
FILE: tests/env_tests/env.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

import os


def get_config_root_path():
    ''' Path to configs for unit tests '''
    # cur_file_dir is root/tests/env_tests
    cur_file_dir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
    ret = os.path.dirname(os.path.dirname(cur_file_dir))
    ret = os.path.join(ret, "configs")
    return ret


================================================
FILE: tests/test_backbones.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

import unittest
import copy

import torch
# import modules to register backbones
from maskrcnn_benchmark.modeling.backbone import build_backbone  # NoQA
from maskrcnn_benchmark.modeling import registry
from maskrcnn_benchmark.config import cfg as g_cfg

from utils import load_config


# overwrite configs if specified, otherwise the default config is used
BACKBONE_CFGS = {
    "R-50-FPN": "e2e_faster_rcnn_R_50_FPN_1x.yaml",
    "R-101-FPN": "e2e_faster_rcnn_R_101_FPN_1x.yaml",
    "R-152-FPN": "e2e_faster_rcnn_R_101_FPN_1x.yaml",
    "R-50-FPN-RETINANET": "retinanet/retinanet_R-50-FPN_1x.yaml",
    "R-101-FPN-RETINANET": "retinanet/retinanet_R-101-FPN_1x.yaml",
}


class TestBackbones(unittest.TestCase):
    def test_build_backbones(self):
        ''' Make sure backbones run '''

        self.assertGreater(len(registry.BACKBONES), 0)

        for name, backbone_builder in registry.BACKBONES.items():
            print('Testing {}...'.format(name))
            if name in BACKBONE_CFGS:
                cfg = load_config(BACKBONE_CFGS[name])
            else:
                # use the default config if a config file is not specified
                cfg = copy.deepcopy(g_cfg)
            backbone = backbone_builder(cfg)

            # make sure the backbone has `out_channels`
            self.assertIsNotNone(
                getattr(backbone, 'out_channels', None),
                'Need to provide out_channels for backbone {}'.format(name)
            )

            N, C_in, H, W = 2, 3, 224, 256
            input = torch.rand([N, C_in, H, W], dtype=torch.float32)
            out = backbone(input)
            for cur_out in out:
                self.assertEqual(
                    cur_out.shape[:2],
                    torch.Size([N, backbone.out_channels])
                )


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/test_box_coder.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import unittest import numpy as np import torch from maskrcnn_benchmark.modeling.box_coder import BoxCoder class TestBoxCoder(unittest.TestCase): def test_box_decoder(self): """ Match unit test UtilsBoxesTest.TestBboxTransformRandom in caffe2/operators/generate_proposals_op_util_boxes_test.cc """ box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) bbox = torch.from_numpy( np.array( [ 175.62031555, 20.91103172, 253.352005, 155.0145874, 169.24636841, 4.85241556, 228.8605957, 105.02092743, 181.77426147, 199.82876587, 192.88427734, 214.0255127, 174.36262512, 186.75761414, 296.19091797, 231.27906799, 22.73153877, 92.02596283, 135.5695343, 208.80291748, ] ) .astype(np.float32) .reshape(-1, 4) ) deltas = torch.from_numpy( np.array( [ 0.47861834, 0.13992102, 0.14961673, 0.71495209, 0.29915856, -0.35664671, 0.89018666, 0.70815367, -0.03852064, 0.44466892, 0.49492538, 0.71409376, 0.28052918, 0.02184832, 0.65289006, 1.05060139, -0.38172557, -0.08533806, -0.60335309, 0.79052375, ] ) .astype(np.float32) .reshape(-1, 4) ) gt_bbox = ( np.array( [ 206.949539, -30.715202, 297.387665, 244.448486, 143.871216, -83.342888, 290.502289, 121.053398, 177.430283, 198.666245, 196.295273, 228.703079, 152.251892, 145.431564, 387.215454, 274.594238, 5.062420, 11.040955, 66.328903, 269.686218, ] ) .astype(np.float32) .reshape(-1, 4) ) results = box_coder.decode(deltas, bbox) np.testing.assert_allclose(results.detach().numpy(), gt_bbox, atol=1e-4) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_configs.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import glob import os import utils class TestConfigs(unittest.TestCase): def test_configs_load(self): ''' Make sure configs are loadable ''' cfg_root_path = utils.get_config_root_path() files = glob.glob( os.path.join(cfg_root_path, "./**/*.yaml"), recursive=True) self.assertGreater(len(files), 0) for fn in files: print('Loading {}...'.format(fn)) utils.load_config_from_file(fn) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_data_samplers.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
import itertools import random import unittest from torch.utils.data.sampler import BatchSampler from torch.utils.data.sampler import Sampler from torch.utils.data.sampler import SequentialSampler from torch.utils.data.sampler import RandomSampler from maskrcnn_benchmark.data.samplers import GroupedBatchSampler from maskrcnn_benchmark.data.samplers import IterationBasedBatchSampler class SubsetSampler(Sampler): def __init__(self, indices): self.indices = indices def __iter__(self): return iter(self.indices) def __len__(self): return len(self.indices) class TestGroupedBatchSampler(unittest.TestCase): def test_respect_order_simple(self): drop_uneven = False dataset = [i for i in range(40)] group_ids = [i // 10 for i in dataset] sampler = SequentialSampler(dataset) for batch_size in [1, 3, 5, 6]: batch_sampler = GroupedBatchSampler( sampler, group_ids, batch_size, drop_uneven ) result = list(batch_sampler) merged_result = list(itertools.chain.from_iterable(result)) self.assertEqual(merged_result, dataset) def test_respect_order(self): drop_uneven = False dataset = [i for i in range(10)] group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] sampler = SequentialSampler(dataset) expected = [ [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]], [[0, 1, 3], [2, 4, 5], [6, 9], [7, 8]], [[0, 1, 3, 6], [2, 4, 5, 7], [8], [9]], ] for idx, batch_size in enumerate([1, 3, 4]): batch_sampler = GroupedBatchSampler( sampler, group_ids, batch_size, drop_uneven ) result = list(batch_sampler) self.assertEqual(result, expected[idx]) def test_respect_order_drop_uneven(self): batch_size = 3 drop_uneven = True dataset = [i for i in range(10)] group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] sampler = SequentialSampler(dataset) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) result = list(batch_sampler) expected = [[0, 1, 3], [2, 4, 5]] self.assertEqual(result, expected) def test_subset_sampler(self): batch_size = 3 drop_uneven = False dataset = [i for i in range(10)] group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] sampler = SubsetSampler([0, 3, 5, 6, 7, 8]) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) result = list(batch_sampler) expected = [[0, 3, 6], [5, 7, 8]] self.assertEqual(result, expected) def test_permute_subset_sampler(self): batch_size = 3 drop_uneven = False dataset = [i for i in range(10)] group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] sampler = SubsetSampler([5, 0, 6, 1, 3, 8]) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) result = list(batch_sampler) expected = [[5, 8], [0, 6, 1], [3]] self.assertEqual(result, expected) def test_permute_subset_sampler_drop_uneven(self): batch_size = 3 drop_uneven = True dataset = [i for i in range(10)] group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] sampler = SubsetSampler([5, 0, 6, 1, 3, 8]) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) result = list(batch_sampler) expected = [[0, 6, 1]] self.assertEqual(result, expected) def test_len(self): batch_size = 3 drop_uneven = True dataset = [i for i in range(10)] group_ids = [random.randint(0, 1) for _ in dataset] sampler = RandomSampler(dataset) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) result = list(batch_sampler) self.assertEqual(len(result), len(batch_sampler)) self.assertEqual(len(result), len(batch_sampler)) batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) batch_sampler_len = len(batch_sampler) result = list(batch_sampler) 
self.assertEqual(len(result), batch_sampler_len) self.assertEqual(len(result), len(batch_sampler)) class TestIterationBasedBatchSampler(unittest.TestCase): def test_number_of_iters_and_elements(self): for batch_size in [2, 3, 4]: for num_iterations in [4, 10, 20]: for drop_last in [False, True]: dataset = [i for i in range(10)] sampler = SequentialSampler(dataset) batch_sampler = BatchSampler( sampler, batch_size, drop_last=drop_last ) iter_sampler = IterationBasedBatchSampler( batch_sampler, num_iterations ) assert len(iter_sampler) == num_iterations for i, batch in enumerate(iter_sampler): start = (i % len(batch_sampler)) * batch_size end = min(start + batch_size, len(dataset)) expected = [x for x in range(start, end)] self.assertEqual(batch, expected) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_detectors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import glob import os import copy import torch from maskrcnn_benchmark.modeling.detector import build_detection_model from maskrcnn_benchmark.structures.image_list import to_image_list import utils CONFIG_FILES = [ # bbox "e2e_faster_rcnn_R_50_C4_1x.yaml", "e2e_faster_rcnn_R_50_FPN_1x.yaml", "e2e_faster_rcnn_fbnet.yaml", # mask "e2e_mask_rcnn_R_50_C4_1x.yaml", "e2e_mask_rcnn_R_50_FPN_1x.yaml", "e2e_mask_rcnn_fbnet.yaml", # keypoints # TODO: fail to run for random model due to empty head input # "e2e_keypoint_rcnn_R_50_FPN_1x.yaml", # gn "gn_baselines/e2e_faster_rcnn_R_50_FPN_1x_gn.yaml", # TODO: fail to run for random model due to empty head input # "gn_baselines/e2e_mask_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml", # retinanet "retinanet/retinanet_R-50-FPN_1x.yaml", # rpn only "rpn_R_50_C4_1x.yaml", "rpn_R_50_FPN_1x.yaml", ] EXCLUDED_FOLDERS = [ "caffe2", "quick_schedules", "pascal_voc", "cityscapes", ] TEST_CUDA = torch.cuda.is_available() def get_config_files(file_list, exclude_folders): cfg_root_path = utils.get_config_root_path() if file_list is not None: files = [os.path.join(cfg_root_path, x) for x in file_list] else: files = glob.glob( os.path.join(cfg_root_path, "./**/*.yaml"), recursive=True) def _contains(path, exclude_dirs): return any(x in path for x in exclude_dirs) if exclude_folders is not None: files = [x for x in files if not _contains(x, exclude_folders)] return files def create_model(cfg, device): cfg = copy.deepcopy(cfg) cfg.freeze() model = build_detection_model(cfg) model = model.to(device) return model def create_random_input(cfg, device): ret = [] for x in cfg.INPUT.MIN_SIZE_TRAIN: ret.append(torch.rand(3, x, int(x * 1.2))) ret = to_image_list(ret, cfg.DATALOADER.SIZE_DIVISIBILITY) ret = ret.to(device) return ret def _test_build_detectors(self, device): ''' Make sure models build ''' cfg_files = get_config_files(None, EXCLUDED_FOLDERS) self.assertGreater(len(cfg_files), 0) for cfg_file in cfg_files: with self.subTest(cfg_file=cfg_file): print('Testing {}...'.format(cfg_file)) cfg = utils.load_config_from_file(cfg_file) create_model(cfg, device) def _test_run_selected_detectors(self, cfg_files, device): ''' Make sure models build and run ''' self.assertGreater(len(cfg_files), 0) for cfg_file in cfg_files: with self.subTest(cfg_file=cfg_file): print('Testing {}...'.format(cfg_file)) cfg = utils.load_config_from_file(cfg_file) cfg.MODEL.RPN.POST_NMS_TOP_N_TEST = 10 cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 10 model = create_model(cfg, device) inputs = 
create_random_input(cfg, device) model.eval() output = model(inputs) self.assertEqual(len(output), len(inputs.image_sizes)) class TestDetectors(unittest.TestCase): def test_build_detectors(self): ''' Make sure models build ''' _test_build_detectors(self, "cpu") @unittest.skipIf(not TEST_CUDA, "no CUDA detected") def test_build_detectors_cuda(self): ''' Make sure models build on gpu''' _test_build_detectors(self, "cuda") def test_run_selected_detectors(self): ''' Make sure models build and run ''' # run on selected models cfg_files = get_config_files(CONFIG_FILES, None) # cfg_files = get_config_files(None, EXCLUDED_FOLDERS) _test_run_selected_detectors(self, cfg_files, "cpu") @unittest.skipIf(not TEST_CUDA, "no CUDA detected") def test_run_selected_detectors_cuda(self): ''' Make sure models build and run on cuda ''' # run on selected models cfg_files = get_config_files(CONFIG_FILES, None) # cfg_files = get_config_files(None, EXCLUDED_FOLDERS) _test_run_selected_detectors(self, cfg_files, "cuda") if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_fbnet.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import numpy as np import torch import maskrcnn_benchmark.modeling.backbone.fbnet_builder as fbnet_builder TEST_CUDA = torch.cuda.is_available() def _test_primitive(self, device, op_name, op_func, N, C_in, C_out, expand, stride): op = op_func(C_in, C_out, expand, stride).to(device) input = torch.rand([N, C_in, 7, 7], dtype=torch.float32).to(device) output = op(input) self.assertEqual( output.shape[:2], torch.Size([N, C_out]), 'Primitive {} failed for shape {}.'.format(op_name, input.shape) ) class TestFBNetBuilder(unittest.TestCase): def test_identity(self): id_op = fbnet_builder.Identity(20, 20, 1) input = torch.rand([10, 20, 7, 7], dtype=torch.float32) output = id_op(input) np.testing.assert_array_equal(np.array(input), np.array(output)) id_op = fbnet_builder.Identity(20, 40, 2) input = torch.rand([10, 20, 7, 7], dtype=torch.float32) output = id_op(input) np.testing.assert_array_equal(output.shape, [10, 40, 4, 4]) def test_primitives(self): ''' Make sures the primitives runs ''' for op_name, op_func in fbnet_builder.PRIMITIVES.items(): print('Testing {}'.format(op_name)) _test_primitive( self, "cpu", op_name, op_func, N=20, C_in=16, C_out=32, expand=4, stride=1 ) @unittest.skipIf(not TEST_CUDA, "no CUDA detected") def test_primitives_cuda(self): ''' Make sures the primitives runs on cuda ''' for op_name, op_func in fbnet_builder.PRIMITIVES.items(): print('Testing {}'.format(op_name)) _test_primitive( self, "cuda", op_name, op_func, N=20, C_in=16, C_out=32, expand=4, stride=1 ) def test_primitives_empty_batch(self): ''' Make sures the primitives runs ''' for op_name, op_func in fbnet_builder.PRIMITIVES.items(): print('Testing {}'.format(op_name)) # test empty batch size _test_primitive( self, "cpu", op_name, op_func, N=0, C_in=16, C_out=32, expand=4, stride=1 ) @unittest.skipIf(not TEST_CUDA, "no CUDA detected") def test_primitives_cuda_empty_batch(self): ''' Make sures the primitives runs ''' for op_name, op_func in fbnet_builder.PRIMITIVES.items(): print('Testing {}'.format(op_name)) # test empty batch size _test_primitive( self, "cuda", op_name, op_func, N=0, C_in=16, C_out=32, expand=4, stride=1 ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_feature_extractors.py 
================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import copy import torch # import modules to to register feature extractors from maskrcnn_benchmark.modeling.backbone import build_backbone # NoQA from maskrcnn_benchmark.modeling.roi_heads.roi_heads import build_roi_heads # NoQA from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.structures.bounding_box import BoxList from maskrcnn_benchmark.config import cfg as g_cfg from utils import load_config # overwrite configs if specified, otherwise default config is used FEATURE_EXTRACTORS_CFGS = { } # overwrite configs if specified, otherwise default config is used FEATURE_EXTRACTORS_INPUT_CHANNELS = { # in_channels was not used, load through config "ResNet50Conv5ROIFeatureExtractor": 1024, } def _test_feature_extractors( self, extractors, overwrite_cfgs, overwrite_in_channels ): ''' Make sure roi box feature extractors run ''' self.assertGreater(len(extractors), 0) in_channels_default = 64 for name, builder in extractors.items(): print('Testing {}...'.format(name)) if name in overwrite_cfgs: cfg = load_config(overwrite_cfgs[name]) else: # Use default config if config file is not specified cfg = copy.deepcopy(g_cfg) in_channels = overwrite_in_channels.get( name, in_channels_default) fe = builder(cfg, in_channels) self.assertIsNotNone( getattr(fe, 'out_channels', None), 'Need to provide out_channels for feature extractor {}'.format(name) ) N, C_in, H, W = 2, in_channels, 24, 32 input = torch.rand([N, C_in, H, W], dtype=torch.float32) bboxes = [[1, 1, 10, 10], [5, 5, 8, 8], [2, 2, 3, 4]] img_size = [384, 512] box_list = BoxList(bboxes, img_size, "xyxy") out = fe([input], [box_list] * N) self.assertEqual( out.shape[:2], torch.Size([N * len(bboxes), fe.out_channels]) ) class TestFeatureExtractors(unittest.TestCase): def test_roi_box_feature_extractors(self): ''' Make sure roi box feature extractors run ''' _test_feature_extractors( self, registry.ROI_BOX_FEATURE_EXTRACTORS, FEATURE_EXTRACTORS_CFGS, FEATURE_EXTRACTORS_INPUT_CHANNELS, ) def test_roi_keypoints_feature_extractors(self): ''' Make sure roi keypoints feature extractors run ''' _test_feature_extractors( self, registry.ROI_KEYPOINT_FEATURE_EXTRACTORS, FEATURE_EXTRACTORS_CFGS, FEATURE_EXTRACTORS_INPUT_CHANNELS, ) def test_roi_mask_feature_extractors(self): ''' Make sure roi mask feature extractors run ''' _test_feature_extractors( self, registry.ROI_MASK_FEATURE_EXTRACTORS, FEATURE_EXTRACTORS_CFGS, FEATURE_EXTRACTORS_INPUT_CHANNELS, ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_metric_logger.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest from maskrcnn_benchmark.utils.metric_logger import MetricLogger class TestMetricLogger(unittest.TestCase): def test_update(self): meter = MetricLogger() for i in range(10): meter.update(metric=float(i)) m = meter.meters["metric"] self.assertEqual(m.count, 10) self.assertEqual(m.total, 45) self.assertEqual(m.median, 4) self.assertEqual(m.avg, 4.5) def test_no_attr(self): meter = MetricLogger() _ = meter.meters _ = meter.delimiter def broken(): _ = meter.not_existent self.assertRaises(AttributeError, broken) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_nms.py ================================================ # Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved. import unittest import numpy as np import torch from maskrcnn_benchmark.layers import nms as box_nms class TestNMS(unittest.TestCase): def test_nms_cpu(self): """ Match unit test UtilsNMSTest.TestNMS in caffe2/operators/generate_proposals_op_util_nms_test.cc """ inputs = ( np.array( [ 10, 10, 50, 60, 0.5, 11, 12, 48, 60, 0.7, 8, 9, 40, 50, 0.6, 100, 100, 150, 140, 0.9, 99, 110, 155, 139, 0.8, ] ) .astype(np.float32) .reshape(-1, 5) ) boxes = torch.from_numpy(inputs[:, :4]) scores = torch.from_numpy(inputs[:, 4]) test_thresh = [0.1, 0.3, 0.5, 0.8, 0.9] gt_indices = [[1, 3], [1, 3], [1, 3], [1, 2, 3, 4], [0, 1, 2, 3, 4]] for thresh, gt_index in zip(test_thresh, gt_indices): keep_indices = box_nms(boxes, scores, thresh) keep_indices = np.sort(keep_indices) np.testing.assert_array_equal(keep_indices, np.array(gt_index)) def test_nms1_cpu(self): """ Match unit test UtilsNMSTest.TestNMS1 in caffe2/operators/generate_proposals_op_util_nms_test.cc """ boxes = torch.from_numpy( np.array( [ [350.9821, 161.8200, 369.9685, 205.2372], [250.5236, 154.2844, 274.1773, 204.9810], [471.4920, 160.4118, 496.0094, 213.4244], [352.0421, 164.5933, 366.4458, 205.9624], [166.0765, 169.7707, 183.0102, 232.6606], [252.3000, 183.1449, 269.6541, 210.6747], [469.7862, 162.0192, 482.1673, 187.0053], [168.4862, 174.2567, 181.7437, 232.9379], [470.3290, 162.3442, 496.4272, 214.6296], [251.0450, 155.5911, 272.2693, 203.3675], [252.0326, 154.7950, 273.7404, 195.3671], [351.7479, 161.9567, 370.6432, 204.3047], [496.3306, 161.7157, 515.0573, 210.7200], [471.0749, 162.6143, 485.3374, 207.3448], [250.9745, 160.7633, 264.1924, 206.8350], [470.4792, 169.0351, 487.1934, 220.2984], [474.4227, 161.9546, 513.1018, 215.5193], [251.9428, 184.1950, 262.6937, 207.6416], [252.6623, 175.0252, 269.8806, 213.7584], [260.9884, 157.0351, 288.3554, 206.6027], [251.3629, 164.5101, 263.2179, 202.4203], [471.8361, 190.8142, 485.6812, 220.8586], [248.6243, 156.9628, 264.3355, 199.2767], [495.1643, 158.0483, 512.6261, 184.4192], [376.8718, 168.0144, 387.3584, 201.3210], [122.9191, 160.7433, 172.5612, 231.3837], [350.3857, 175.8806, 366.2500, 205.4329], [115.2958, 162.7822, 161.9776, 229.6147], [168.4375, 177.4041, 180.8028, 232.4551], [169.7939, 184.4330, 181.4767, 232.1220], [347.7536, 175.9356, 355.8637, 197.5586], [495.5434, 164.6059, 516.4031, 207.7053], [172.1216, 194.6033, 183.1217, 235.2653], [264.2654, 181.5540, 288.4626, 214.0170], [111.7971, 183.7748, 137.3745, 225.9724], [253.4919, 186.3945, 280.8694, 210.0731], [165.5334, 169.7344, 185.9159, 232.8514], [348.3662, 184.5187, 354.9081, 201.4038], [164.6562, 162.5724, 186.3108, 233.5010], [113.2999, 186.8410, 135.8841, 219.7642], [117.0282, 179.8009, 142.5375, 221.0736], [462.1312, 161.1004, 495.3576, 217.2208], [462.5800, 159.9310, 501.2937, 224.1655], [503.5242, 170.0733, 518.3792, 209.0113], [250.3658, 195.5925, 260.6523, 212.4679], [108.8287, 163.6994, 146.3642, 229.7261], [256.7617, 187.3123, 288.8407, 211.2013], [161.2781, 167.4801, 186.3751, 232.7133], [115.3760, 177.5859, 163.3512, 236.9660], [248.9077, 188.0919, 264.8579, 207.9718], [108.1349, 160.7851, 143.6370, 229.6243], [465.0900, 156.7555, 490.3561, 213.5704], [107.5338, 173.4323, 141.0704, 235.2910], ] ).astype(np.float32) ) scores = torch.from_numpy( np.array( [ 0.1919, 0.3293, 0.0860, 0.1600, 0.1885, 0.4297, 0.0974, 0.2711, 0.1483, 0.1173, 0.1034, 0.2915, 0.1993, 0.0677, 0.3217, 0.0966, 0.0526, 0.5675, 0.3130, 0.1592, 0.1353, 0.0634, 0.1557, 0.1512, 0.0699, 0.0545, 0.2692, 0.1143, 
0.0572, 0.1990, 0.0558, 0.1500, 0.2214, 0.1878, 0.2501, 0.1343, 0.0809, 0.1266, 0.0743, 0.0896, 0.0781, 0.0983, 0.0557, 0.0623, 0.5808, 0.3090, 0.1050, 0.0524, 0.0513, 0.4501, 0.4167, 0.0623, 0.1749, ] ).astype(np.float32) ) gt_indices = np.array( [ 1, 6, 7, 8, 11, 12, 13, 14, 17, 18, 19, 21, 23, 24, 25, 26, 30, 32, 33, 34, 35, 37, 43, 44, 47, 50, ] ) keep_indices = box_nms(boxes, scores, 0.5) keep_indices = np.sort(keep_indices) np.testing.assert_array_equal(keep_indices, gt_indices) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_predictors.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import copy import torch # import modules to to register predictors from maskrcnn_benchmark.modeling.backbone import build_backbone # NoQA from maskrcnn_benchmark.modeling.roi_heads.roi_heads import build_roi_heads # NoQA from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.config import cfg as g_cfg from utils import load_config # overwrite configs if specified, otherwise default config is used PREDICTOR_CFGS = { } # overwrite configs if specified, otherwise default config is used PREDICTOR_INPUT_CHANNELS = { } def _test_predictors( self, predictors, overwrite_cfgs, overwrite_in_channels, hwsize, ): ''' Make sure predictors run ''' self.assertGreater(len(predictors), 0) in_channels_default = 64 for name, builder in predictors.items(): print('Testing {}...'.format(name)) if name in overwrite_cfgs: cfg = load_config(overwrite_cfgs[name]) else: # Use default config if config file is not specified cfg = copy.deepcopy(g_cfg) in_channels = overwrite_in_channels.get( name, in_channels_default) fe = builder(cfg, in_channels) N, C_in, H, W = 2, in_channels, hwsize, hwsize input = torch.rand([N, C_in, H, W], dtype=torch.float32) out = fe(input) yield input, out, cfg class TestPredictors(unittest.TestCase): def test_roi_box_predictors(self): ''' Make sure roi box predictors run ''' for cur_in, cur_out, cur_cfg in _test_predictors( self, registry.ROI_BOX_PREDICTOR, PREDICTOR_CFGS, PREDICTOR_INPUT_CHANNELS, hwsize=1, ): self.assertEqual(len(cur_out), 2) scores, bbox_deltas = cur_out[0], cur_out[1] self.assertEqual( scores.shape[1], cur_cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES) self.assertEqual(scores.shape[0], cur_in.shape[0]) self.assertEqual(scores.shape[0], bbox_deltas.shape[0]) self.assertEqual(scores.shape[1] * 4, bbox_deltas.shape[1]) def test_roi_keypoints_predictors(self): ''' Make sure roi keypoint predictors run ''' for cur_in, cur_out, cur_cfg in _test_predictors( self, registry.ROI_KEYPOINT_PREDICTOR, PREDICTOR_CFGS, PREDICTOR_INPUT_CHANNELS, hwsize=14, ): self.assertEqual(cur_out.shape[0], cur_in.shape[0]) self.assertEqual( cur_out.shape[1], cur_cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES) def test_roi_mask_predictors(self): ''' Make sure roi mask predictors run ''' for cur_in, cur_out, cur_cfg in _test_predictors( self, registry.ROI_MASK_PREDICTOR, PREDICTOR_CFGS, PREDICTOR_INPUT_CHANNELS, hwsize=14, ): self.assertEqual(cur_out.shape[0], cur_in.shape[0]) self.assertEqual( cur_out.shape[1], cur_cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_rpn_heads.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
import unittest import copy import torch # import modules to to register rpn heads from maskrcnn_benchmark.modeling.backbone import build_backbone # NoQA from maskrcnn_benchmark.modeling.rpn.rpn import build_rpn # NoQA from maskrcnn_benchmark.modeling import registry from maskrcnn_benchmark.config import cfg as g_cfg from utils import load_config # overwrite configs if specified, otherwise default config is used RPN_CFGS = { } class TestRPNHeads(unittest.TestCase): def test_build_rpn_heads(self): ''' Make sure rpn heads run ''' self.assertGreater(len(registry.RPN_HEADS), 0) in_channels = 64 num_anchors = 10 for name, builder in registry.RPN_HEADS.items(): print('Testing {}...'.format(name)) if name in RPN_CFGS: cfg = load_config(RPN_CFGS[name]) else: # Use default config if config file is not specified cfg = copy.deepcopy(g_cfg) rpn = builder(cfg, in_channels, num_anchors) N, C_in, H, W = 2, in_channels, 24, 32 input = torch.rand([N, C_in, H, W], dtype=torch.float32) LAYERS = 3 out = rpn([input] * LAYERS) self.assertEqual(len(out), 2) logits, bbox_reg = out for idx in range(LAYERS): self.assertEqual( logits[idx].shape, torch.Size([ input.shape[0], num_anchors, input.shape[2], input.shape[3], ]) ) self.assertEqual( bbox_reg[idx].shape, torch.Size([ logits[idx].shape[0], num_anchors * 4, logits[idx].shape[2], logits[idx].shape[3], ]), ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_segmentation_mask.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import unittest import torch from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask class TestSegmentationMask(unittest.TestCase): def __init__(self, method_name='runTest'): super(TestSegmentationMask, self).__init__(method_name) poly = [[[423.0, 306.5, 406.5, 277.0, 400.0, 271.5, 389.5, 277.0, 387.5, 292.0, 384.5, 295.0, 374.5, 220.0, 378.5, 210.0, 391.0, 200.5, 404.0, 199.5, 414.0, 203.5, 425.5, 221.0, 438.5, 297.0, 423.0, 306.5], [100, 100, 200, 100, 200, 200, 100, 200], ]] width = 640 height = 480 size = width, height self.P = SegmentationMask(poly, size, 'poly') self.M = SegmentationMask(poly, size, 'poly').convert('mask') def L1(self, A, B): diff = A.get_mask_tensor() - B.get_mask_tensor() diff = torch.sum(torch.abs(diff.float())).item() return diff def test_convert(self): M_hat = self.M.convert('poly').convert('mask') P_hat = self.P.convert('mask').convert('poly') diff_mask = self.L1(self.M, M_hat) diff_poly = self.L1(self.P, P_hat) self.assertTrue(diff_mask == diff_poly) self.assertTrue(diff_mask <= 8169.) self.assertTrue(diff_poly <= 8169.) def test_crop(self): box = [400, 250, 500, 300] # xyxy diff = self.L1(self.M.crop(box), self.P.crop(box)) self.assertTrue(diff <= 1.) def test_resize(self): new_size = 50, 25 M_hat = self.M.resize(new_size) P_hat = self.P.resize(new_size) diff = self.L1(M_hat, P_hat) self.assertTrue(self.M.size == self.P.size) self.assertTrue(M_hat.size == P_hat.size) self.assertTrue(self.M.size != M_hat.size) self.assertTrue(diff <= 255.) def test_transpose(self): FLIP_LEFT_RIGHT = 0 FLIP_TOP_BOTTOM = 1 diff_hor = self.L1(self.M.transpose(FLIP_LEFT_RIGHT), self.P.transpose(FLIP_LEFT_RIGHT)) diff_ver = self.L1(self.M.transpose(FLIP_TOP_BOTTOM), self.P.transpose(FLIP_TOP_BOTTOM)) self.assertTrue(diff_hor <= 53250.) self.assertTrue(diff_ver <= 42494.) 
if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/utils.py
================================================
from __future__ import absolute_import, division, print_function, unicode_literals

# Set up custom environment before nearly anything else is imported
# NOTE: this should be the first import (do not reorder)
from maskrcnn_benchmark.utils.env import setup_environment  # noqa F401 isort:skip

import env_tests.env as env_tests

import os
import copy

from maskrcnn_benchmark.config import cfg as g_cfg


def get_config_root_path():
    return env_tests.get_config_root_path()


def load_config(rel_path):
    ''' Load config from file path specified as path relative to config_root '''
    cfg_path = os.path.join(env_tests.get_config_root_path(), rel_path)
    return load_config_from_file(cfg_path)


def load_config_from_file(file_path):
    ''' Load config from file path specified as absolute path '''
    ret = copy.deepcopy(g_cfg)
    ret.merge_from_file(file_path)
    return ret


================================================
FILE: tools/cityscapes/convert_cityscapes_to_coco.py
================================================
#!/usr/bin/env python
# Copyright (c) 2017-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
# This file is copied from https://github.com/facebookresearch/Detectron/tree/master/tools

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import h5py
import json
import os
import scipy.misc
import sys

import cityscapesscripts.evaluation.instances2dict_with_polygons as cs


def parse_args():
    parser = argparse.ArgumentParser(description='Convert dataset')
    parser.add_argument(
        '--dataset', help="cocostuff, cityscapes", default=None, type=str)
    parser.add_argument(
        '--outdir', help="output dir for json files", default=None, type=str)
    parser.add_argument(
        '--datadir', help="data dir for annotations to be converted",
        default=None, type=str)
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    return parser.parse_args()


def poly_to_box(poly):
    """Convert a polygon into a tight bounding box."""
    x0 = min(min(p[::2]) for p in poly)
    x1 = max(max(p[::2]) for p in poly)
    y0 = min(min(p[1::2]) for p in poly)
    y1 = max(max(p[1::2]) for p in poly)
    box_from_poly = [x0, y0, x1, y1]
    return box_from_poly


def xyxy_to_xywh(xyxy_box):
    xmin, ymin, xmax, ymax = xyxy_box
    TO_REMOVE = 1
    xywh_box = (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE)
    return xywh_box


def convert_coco_stuff_mat(data_dir, out_dir):
    """Convert to png and save json with path.
This currently only contains the segmentation labels for objects+stuff in cocostuff - if we need to combine with other labels from original COCO that will be a TODO.""" sets = ['train', 'val'] categories = [] json_name = 'coco_stuff_%s.json' ann_dict = {} for data_set in sets: file_list = os.path.join(data_dir, '%s.txt') images = [] with open(file_list % data_set) as f: for img_id, img_name in enumerate(f): img_name = img_name.replace('coco', 'COCO').strip('\n') image = {} mat_file = os.path.join( data_dir, 'annotations/%s.mat' % img_name) data = h5py.File(mat_file, 'r') labelMap = data.get('S') if len(categories) == 0: labelNames = data.get('names') for idx, n in enumerate(labelNames): categories.append( {"id": idx, "name": ''.join(chr(i) for i in data[ n[0]])}) ann_dict['categories'] = categories scipy.misc.imsave( os.path.join(data_dir, img_name + '.png'), labelMap) image['width'] = labelMap.shape[0] image['height'] = labelMap.shape[1] image['file_name'] = img_name image['seg_file_name'] = img_name image['id'] = img_id images.append(image) ann_dict['images'] = images print("Num images: %s" % len(images)) with open(os.path.join(out_dir, json_name % data_set), 'wb') as outfile: outfile.write(json.dumps(ann_dict)) # for Cityscapes def getLabelID(self, instID): if (instID < 1000): return instID else: return int(instID / 1000) def convert_cityscapes_instance_only( data_dir, out_dir): """Convert from cityscapes format to COCO instance seg format - polygons""" sets = [ 'gtFine_val', 'gtFine_train', 'gtFine_test', # 'gtCoarse_train', # 'gtCoarse_val', # 'gtCoarse_train_extra' ] ann_dirs = [ 'gtFine_trainvaltest/gtFine/val', 'gtFine_trainvaltest/gtFine/train', 'gtFine_trainvaltest/gtFine/test', # 'gtCoarse/train', # 'gtCoarse/train_extra', # 'gtCoarse/val' ] json_name = 'instancesonly_filtered_%s.json' ends_in = '%s_polygons.json' img_id = 0 ann_id = 0 cat_id = 1 category_dict = {} category_instancesonly = [ 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle', ] for data_set, ann_dir in zip(sets, ann_dirs): print('Starting %s' % data_set) ann_dict = {} images = [] annotations = [] ann_dir = os.path.join(data_dir, ann_dir) for root, _, files in os.walk(ann_dir): for filename in files: if filename.endswith(ends_in % data_set.split('_')[0]): if len(images) % 50 == 0: print("Processed %s images, %s annotations" % ( len(images), len(annotations))) json_ann = json.load(open(os.path.join(root, filename))) image = {} image['id'] = img_id img_id += 1 image['width'] = json_ann['imgWidth'] image['height'] = json_ann['imgHeight'] image['file_name'] = filename[:-len( ends_in % data_set.split('_')[0])] + 'leftImg8bit.png' image['seg_file_name'] = filename[:-len( ends_in % data_set.split('_')[0])] + \ '%s_instanceIds.png' % data_set.split('_')[0] images.append(image) fullname = os.path.join(root, image['seg_file_name']) objects = cs.instances2dict_with_polygons( [fullname], verbose=False)[fullname] for object_cls in objects: if object_cls not in category_instancesonly: continue # skip non-instance categories for obj in objects[object_cls]: if obj['contours'] == []: print('Warning: empty contours.') continue # skip non-instance categories len_p = [len(p) for p in obj['contours']] if min(len_p) <= 4: print('Warning: invalid contours.') continue # skip non-instance categories ann = {} ann['id'] = ann_id ann_id += 1 ann['image_id'] = image['id'] ann['segmentation'] = obj['contours'] if object_cls not in category_dict: category_dict[object_cls] = cat_id cat_id += 1 ann['category_id'] = 
category_dict[object_cls] ann['iscrowd'] = 0 ann['area'] = obj['pixelCount'] xyxy_box = poly_to_box(ann['segmentation']) xywh_box = xyxy_to_xywh(xyxy_box) ann['bbox'] = xywh_box annotations.append(ann) ann_dict['images'] = images categories = [{"id": category_dict[name], "name": name} for name in category_dict] ann_dict['categories'] = categories ann_dict['annotations'] = annotations print("Num categories: %s" % len(categories)) print("Num images: %s" % len(images)) print("Num annotations: %s" % len(annotations)) with open(os.path.join(out_dir, json_name % data_set), 'w') as outfile: outfile.write(json.dumps(ann_dict)) if __name__ == '__main__': args = parse_args() if args.dataset == "cityscapes_instance_only": convert_cityscapes_instance_only(args.datadir, args.outdir) elif args.dataset == "cocostuff": convert_coco_stuff_mat(args.datadir, args.outdir) else: print("Dataset not supported: %s" % args.dataset) ================================================ FILE: tools/cityscapes/instances2dict_with_polygons.py ================================================ #!/usr/bin/python # # Convert instances from png files to a dictionary # This files is created according to https://github.com/facebookresearch/Detectron/issues/111 from __future__ import print_function, absolute_import, division import os, sys sys.path.append( os.path.normpath( os.path.join( os.path.dirname( __file__ ) , '..' , 'helpers' ) ) ) from csHelpers import * # Cityscapes imports from cityscapesscripts.evaluation.instance import * from cityscapesscripts.helpers.csHelpers import * import cv2 from maskrcnn_benchmark.utils import cv2_util def instances2dict_with_polygons(imageFileList, verbose=False): imgCount = 0 instanceDict = {} if not isinstance(imageFileList, list): imageFileList = [imageFileList] if verbose: print("Processing {} images...".format(len(imageFileList))) for imageFileName in imageFileList: # Load image img = Image.open(imageFileName) # Image as numpy array imgNp = np.array(img) # Initialize label categories instances = {} for label in labels: instances[label.name] = [] # Loop through all instance ids in instance image for instanceId in np.unique(imgNp): if instanceId < 1000: continue instanceObj = Instance(imgNp, instanceId) instanceObj_dict = instanceObj.toDict() #instances[id2label[instanceObj.labelID].name].append(instanceObj.toDict()) if id2label[instanceObj.labelID].hasInstances: mask = (imgNp == instanceId).astype(np.uint8) contour, hier = cv2_util.findContours( mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) polygons = [c.reshape(-1).tolist() for c in contour] instanceObj_dict['contours'] = polygons instances[id2label[instanceObj.labelID].name].append(instanceObj_dict) imgKey = os.path.abspath(imageFileName) instanceDict[imgKey] = instances imgCount += 1 if verbose: print("\rImages Processed: {}".format(imgCount), end=' ') sys.stdout.flush() if verbose: print("") return instanceDict def main(argv): fileList = [] if (len(argv) > 2): for arg in argv: if ("png" in arg): fileList.append(arg) instances2dict_with_polygons(fileList, True) if __name__ == "__main__": main(sys.argv[1:]) ================================================ FILE: tools/remove_solver_states.py ================================================ # Set up custom environment before nearly anything else is imported # NOTE: this should be the first import (no not reorder) from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip import argparse import os import torch def main(): parser = 
argparse.ArgumentParser(description="Remove the solver states stored in a trained model") parser.add_argument( "model", default="models/FCOS_R_50_FPN_1x.pth", help="path to the input model file", ) args = parser.parse_args() model = torch.load(args.model) del model["optimizer"] del model["scheduler"] filename_wo_ext, ext = os.path.splitext(args.model) output_file = filename_wo_ext + "_wo_solver_states" + ext torch.save(model, output_file) print("Done. The model without solver states is saved to {}".format(output_file)) if __name__ == "__main__": main() ================================================ FILE: tools/test_net.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # Set up custom environment before nearly anything else is imported # NOTE: this should be the first import (no not reorder) from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip import argparse import os import torch from maskrcnn_benchmark.config import cfg from maskrcnn_benchmark.data import make_data_loader from maskrcnn_benchmark.engine.inference import inference from maskrcnn_benchmark.modeling.detector import build_detection_model from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer from maskrcnn_benchmark.utils.collect_env import collect_env_info from maskrcnn_benchmark.utils.comm import synchronize, get_rank from maskrcnn_benchmark.utils.logger import setup_logger from maskrcnn_benchmark.utils.miscellaneous import mkdir def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group( backend="nccl", init_method="env://" ) synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints",) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.FCOS_ON or 
cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize() if __name__ == "__main__": main() ================================================ FILE: tools/train_net.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. r""" Basic training script for PyTorch """ # Set up custom environment before nearly anything else is imported # NOTE: this should be the first import (no not reorder) from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip import argparse import os import torch from maskrcnn_benchmark.config import cfg from maskrcnn_benchmark.data import make_data_loader from maskrcnn_benchmark.solver import make_lr_scheduler from maskrcnn_benchmark.solver import make_optimizer from maskrcnn_benchmark.engine.inference import inference from maskrcnn_benchmark.engine.trainer import do_train from maskrcnn_benchmark.modeling.detector import build_detection_model from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer from maskrcnn_benchmark.utils.collect_env import collect_env_info from maskrcnn_benchmark.utils.comm import synchronize, \ get_rank, is_pytorch_1_1_0_or_later from maskrcnn_benchmark.utils.imports import import_file from maskrcnn_benchmark.utils.logger import setup_logger from maskrcnn_benchmark.utils.miscellaneous import mkdir def train(cfg, local_rank, distributed): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) if cfg.MODEL.USE_SYNCBN: assert is_pytorch_1_1_0_or_later(), \ "SyncBatchNorm is only available in pytorch >= 1.1.0" model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer( cfg, model, optimizer, scheduler, output_dir, save_to_disk ) extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) arguments.update(extra_checkpoint_data) data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD do_train( model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, ) return model def run_test(cfg, model, distributed): if distributed: model = model.module torch.cuda.empty_cache() # TODO check if it helps iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints",) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, 
box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize() def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Training") parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group( backend="nccl", init_method="env://" ) synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() output_dir = cfg.OUTPUT_DIR if output_dir: mkdir(output_dir) logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) model = train(cfg, args.local_rank, args.distributed) if not args.skip_test: run_test(cfg, model, args.distributed) if __name__ == "__main__": main()
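Both tools/train_net.py and tools/test_net.py funnel the command-line `opts` through yacs: merge_from_file applies the YAML config, then merge_from_list applies CLI-style overrides. A minimal sketch of the same config flow outside the scripts (the config path and override values are illustrative, not from the repository):

    from maskrcnn_benchmark.config import cfg

    # same sequence main() performs: file config first, then overrides
    cfg.merge_from_file("configs/fcos/fcos_R_50_FPN_1x.yaml")
    cfg.merge_from_list(["MODEL.WEIGHT", "FCOS_R_50_FPN_1x.pth",
                         "TEST.IMS_PER_BATCH", 4])
    cfg.freeze()
    print(cfg.MODEL.FCOS_ON, cfg.MODEL.DEVICE)

For multi-GPU runs, the --local_rank plumbing above expects the scripts to be launched through torch.distributed.launch (which also sets WORLD_SIZE), e.g. python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file <config>.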