Repository: amirassov/kaggle-imaterialist Branch: master Commit: f1ae37100801 Files: 270 Total size: 1004.0 KB Directory structure: gitextract_ymxjag4v/ ├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── configs/ │ └── htc_dconv_c3-c5_mstrain_x101_64x4d_fpn_20e_1200x1900.py ├── mmdetection/ │ ├── .travis.yml │ ├── GETTING_STARTED.md │ ├── INSTALL.md │ ├── LICENSE │ ├── MODEL_ZOO.md │ ├── README.md │ ├── TECHNICAL_DETAILS.md │ ├── compile.sh │ ├── configs/ │ │ ├── cascade_mask_rcnn_r101_fpn_1x.py │ │ ├── cascade_mask_rcnn_r50_caffe_c4_1x.py │ │ ├── cascade_mask_rcnn_r50_fpn_1x.py │ │ ├── cascade_mask_rcnn_x101_32x4d_fpn_1x.py │ │ ├── cascade_mask_rcnn_x101_64x4d_fpn_1x.py │ │ ├── cascade_rcnn_r101_fpn_1x.py │ │ ├── cascade_rcnn_r50_caffe_c4_1x.py │ │ ├── cascade_rcnn_r50_fpn_1x.py │ │ ├── cascade_rcnn_x101_32x4d_fpn_1x.py │ │ ├── cascade_rcnn_x101_64x4d_fpn_1x.py │ │ ├── dcn/ │ │ │ ├── README.md │ │ │ ├── cascade_mask_rcnn_dconv_c3-c5_r50_fpn_1x.py │ │ │ ├── cascade_rcnn_dconv_c3-c5_r50_fpn_1x.py │ │ │ ├── faster_rcnn_dconv_c3-c5_r50_fpn_1x.py │ │ │ ├── faster_rcnn_dconv_c3-c5_x101_32x4d_fpn_1x.py │ │ │ ├── faster_rcnn_dpool_r50_fpn_1x.py │ │ │ ├── faster_rcnn_mdconv_c3-c5_r50_fpn_1x.py │ │ │ ├── faster_rcnn_mdpool_r50_fpn_1x.py │ │ │ └── mask_rcnn_dconv_c3-c5_r50_fpn_1x.py │ │ ├── fast_mask_rcnn_r101_fpn_1x.py │ │ ├── fast_mask_rcnn_r50_caffe_c4_1x.py │ │ ├── fast_mask_rcnn_r50_fpn_1x.py │ │ ├── fast_rcnn_r101_fpn_1x.py │ │ ├── fast_rcnn_r50_caffe_c4_1x.py │ │ ├── fast_rcnn_r50_fpn_1x.py │ │ ├── faster_rcnn_ohem_r50_fpn_1x.py │ │ ├── faster_rcnn_r101_fpn_1x.py │ │ ├── faster_rcnn_r50_caffe_c4_1x.py │ │ ├── faster_rcnn_r50_fpn_1x.py │ │ ├── faster_rcnn_x101_32x4d_fpn_1x.py │ │ ├── faster_rcnn_x101_64x4d_fpn_1x.py │ │ ├── fcos/ │ │ │ ├── README.md │ │ │ ├── fcos_mstrain_640_800_r101_caffe_fpn_gn_2x_4gpu.py │ │ │ ├── fcos_mstrain_640_800_x101_64x4d_fpn_gn_2x.py │ │ │ └── fcos_r50_caffe_fpn_gn_1x_4gpu.py │ │ ├── gn/ │ │ │ ├── README.md │ │ │ ├── mask_rcnn_r101_fpn_gn_2x.py │ │ │ ├── mask_rcnn_r50_fpn_gn_2x.py │ │ │ └── mask_rcnn_r50_fpn_gn_contrib_2x.py │ │ ├── gn+ws/ │ │ │ ├── README.md │ │ │ ├── faster_rcnn_r50_fpn_gn_ws_1x.py │ │ │ ├── mask_rcnn_r50_fpn_gn_ws_20_23_24e.py │ │ │ ├── mask_rcnn_r50_fpn_gn_ws_2x.py │ │ │ └── mask_rcnn_x101_32x4d_fpn_gn_ws_2x.py │ │ ├── htc/ │ │ │ ├── README.md │ │ │ ├── htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e.py │ │ │ ├── htc_r101_fpn_20e.py │ │ │ ├── htc_r50_fpn_1x.py │ │ │ ├── htc_r50_fpn_20e.py │ │ │ ├── htc_without_semantic_r50_fpn_1x.py │ │ │ ├── htc_x101_32x4d_fpn_20e_16gpu.py │ │ │ └── htc_x101_64x4d_fpn_20e_16gpu.py │ │ ├── mask_rcnn_r101_fpn_1x.py │ │ ├── mask_rcnn_r50_caffe_c4_1x.py │ │ ├── mask_rcnn_r50_fpn_1x.py │ │ ├── mask_rcnn_x101_32x4d_fpn_1x.py │ │ ├── mask_rcnn_x101_64x4d_fpn_1x.py │ │ ├── pascal_voc/ │ │ │ ├── faster_rcnn_r50_fpn_1x_voc0712.py │ │ │ ├── ssd300_voc.py │ │ │ └── ssd512_voc.py │ │ ├── retinanet_r101_fpn_1x.py │ │ ├── retinanet_r50_fpn_1x.py │ │ ├── retinanet_x101_32x4d_fpn_1x.py │ │ ├── retinanet_x101_64x4d_fpn_1x.py │ │ ├── rpn_r101_fpn_1x.py │ │ ├── rpn_r50_caffe_c4_1x.py │ │ ├── rpn_r50_fpn_1x.py │ │ ├── rpn_x101_32x4d_fpn_1x.py │ │ ├── rpn_x101_64x4d_fpn_1x.py │ │ ├── ssd300_coco.py │ │ └── ssd512_coco.py │ ├── mmdet/ │ │ ├── __init__.py │ │ ├── apis/ │ │ │ ├── __init__.py │ │ │ ├── env.py │ │ │ ├── inference.py │ │ │ └── train.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── anchor/ │ │ │ │ ├── __init__.py │ │ │ │ ├── anchor_generator.py │ │ │ │ └── anchor_target.py │ │ │ ├── bbox/ │ │ │ │ ├── __init__.py │ │ │ │ ├── assign_sampling.py │ │ │ │ ├── assigners/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── assign_result.py │ │ │ │ │ ├── base_assigner.py │ │ │ │ │ └── max_iou_assigner.py │ │ │ │ ├── bbox_target.py │ │ │ │ ├── geometry.py │ │ │ │ ├── samplers/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base_sampler.py │ │ │ │ │ ├── combined_sampler.py │ │ │ │ │ ├── instance_balanced_pos_sampler.py │ │ │ │ │ ├── iou_balanced_neg_sampler.py │ │ │ │ │ ├── ohem_sampler.py │ │ │ │ │ ├── pseudo_sampler.py │ │ │ │ │ ├── random_sampler.py │ │ │ │ │ └── sampling_result.py │ │ │ │ └── transforms.py │ │ │ ├── evaluation/ │ │ │ │ ├── __init__.py │ │ │ │ ├── bbox_overlaps.py │ │ │ │ ├── class_names.py │ │ │ │ ├── coco_utils.py │ │ │ │ ├── eval_hooks.py │ │ │ │ ├── mean_ap.py │ │ │ │ └── recall.py │ │ │ ├── loss/ │ │ │ │ ├── __init__.py │ │ │ │ └── losses.py │ │ │ ├── mask/ │ │ │ │ ├── __init__.py │ │ │ │ ├── mask_target.py │ │ │ │ └── utils.py │ │ │ ├── post_processing/ │ │ │ │ ├── __init__.py │ │ │ │ ├── bbox_nms.py │ │ │ │ └── merge_augs.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── dist_utils.py │ │ │ └── misc.py │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── coco.py │ │ │ ├── concat_dataset.py │ │ │ ├── custom.py │ │ │ ├── extra_aug.py │ │ │ ├── loader/ │ │ │ │ ├── __init__.py │ │ │ │ ├── build_loader.py │ │ │ │ └── sampler.py │ │ │ ├── repeat_dataset.py │ │ │ ├── transforms.py │ │ │ ├── utils.py │ │ │ ├── voc.py │ │ │ └── xml_style.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── anchor_heads/ │ │ │ │ ├── __init__.py │ │ │ │ ├── anchor_head.py │ │ │ │ ├── fcos_head.py │ │ │ │ ├── retina_head.py │ │ │ │ ├── rpn_head.py │ │ │ │ └── ssd_head.py │ │ │ ├── backbones/ │ │ │ │ ├── __init__.py │ │ │ │ ├── resnet.py │ │ │ │ ├── resnext.py │ │ │ │ └── ssd_vgg.py │ │ │ ├── bbox_heads/ │ │ │ │ ├── __init__.py │ │ │ │ ├── bbox_head.py │ │ │ │ └── convfc_bbox_head.py │ │ │ ├── builder.py │ │ │ ├── detectors/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── cascade_rcnn.py │ │ │ │ ├── ensemble_htc.py │ │ │ │ ├── fast_rcnn.py │ │ │ │ ├── faster_rcnn.py │ │ │ │ ├── fcos.py │ │ │ │ ├── htc.py │ │ │ │ ├── mask_rcnn.py │ │ │ │ ├── retinanet.py │ │ │ │ ├── rpn.py │ │ │ │ ├── single_stage.py │ │ │ │ ├── test_mixins.py │ │ │ │ └── two_stage.py │ │ │ ├── mask_heads/ │ │ │ │ ├── __init__.py │ │ │ │ ├── fcn_mask_head.py │ │ │ │ ├── fused_semantic_head.py │ │ │ │ └── htc_mask_head.py │ │ │ ├── necks/ │ │ │ │ ├── __init__.py │ │ │ │ └── fpn.py │ │ │ ├── registry.py │ │ │ ├── roi_extractors/ │ │ │ │ ├── __init__.py │ │ │ │ └── single_level.py │ │ │ ├── shared_heads/ │ │ │ │ ├── __init__.py │ │ │ │ └── res_layer.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── conv_module.py │ │ │ ├── conv_ws.py │ │ │ ├── norm.py │ │ │ ├── scale.py │ │ │ └── weight_init.py │ │ └── ops/ │ │ ├── __init__.py │ │ ├── dcn/ │ │ │ ├── __init__.py │ │ │ ├── functions/ │ │ │ │ ├── __init__.py │ │ │ │ ├── deform_conv.py │ │ │ │ └── deform_pool.py │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ ├── deform_conv.py │ │ │ │ └── deform_pool.py │ │ │ ├── setup.py │ │ │ └── src/ │ │ │ ├── deform_conv_cuda.cpp │ │ │ ├── deform_conv_cuda_kernel.cu │ │ │ ├── deform_pool_cuda.cpp │ │ │ └── deform_pool_cuda_kernel.cu │ │ ├── nms/ │ │ │ ├── __init__.py │ │ │ ├── nms_wrapper.py │ │ │ ├── setup.py │ │ │ └── src/ │ │ │ ├── nms_cpu.cpp │ │ │ ├── nms_cuda.cpp │ │ │ ├── nms_kernel.cu │ │ │ └── soft_nms_cpu.pyx │ │ ├── roi_align/ │ │ │ ├── __init__.py │ │ │ ├── functions/ │ │ │ │ ├── __init__.py │ │ │ │ └── roi_align.py │ │ │ ├── gradcheck.py │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ └── roi_align.py │ │ │ ├── setup.py │ │ │ └── src/ │ │ │ ├── roi_align_cuda.cpp │ │ │ └── roi_align_kernel.cu │ │ ├── roi_pool/ │ │ │ ├── __init__.py │ │ │ ├── functions/ │ │ │ │ ├── __init__.py │ │ │ │ └── roi_pool.py │ │ │ ├── gradcheck.py │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ └── roi_pool.py │ │ │ ├── setup.py │ │ │ └── src/ │ │ │ ├── roi_pool_cuda.cpp │ │ │ └── roi_pool_kernel.cu │ │ └── sigmoid_focal_loss/ │ │ ├── __init__.py │ │ ├── functions/ │ │ │ ├── __init__.py │ │ │ └── sigmoid_focal_loss.py │ │ ├── modules/ │ │ │ ├── __init__.py │ │ │ └── sigmoid_focal_loss.py │ │ ├── setup.py │ │ └── src/ │ │ ├── sigmoid_focal_loss.cpp │ │ └── sigmoid_focal_loss_cuda.cu │ ├── setup.py │ └── tools/ │ ├── analyze_logs.py │ ├── coco_eval.py │ ├── convert_datasets/ │ │ └── pascal_voc.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── publish_model.py │ ├── slurm_test.sh │ ├── slurm_train.sh │ ├── test.py │ ├── test_ensemble.py │ ├── train.py │ ├── upgrade_model_version.py │ └── voc_eval.py ├── scrips/ │ ├── create_mmdetection_test.sh │ ├── create_mmdetection_train.sh │ ├── dist_test.sh │ ├── dist_test_ensemble.sh │ ├── dist_train.sh │ ├── prepare_weights.sh │ └── split.sh └── src/ ├── __init__.py ├── create_mmdetection_test.py ├── create_mmdetection_train.py ├── draw.py ├── eda.py ├── metric.py ├── prune.py ├── rle.py ├── rm_attribute_classes.py ├── split.py ├── submit.py ├── utils.py └── visualization.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ # custom: .git/* data/* ipynb/* .idea/* # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ ================================================ FILE: .gitignore ================================================ *.ipynb .idea/ .DS_Store # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ ================================================ FILE: Dockerfile ================================================ FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel RUN apt-get update && apt-get install -y \ git \ wget \ curl \ cmake \ unzip \ build-essential \ libsm6 \ libxext6 \ libfontconfig1 \ libxrender1 \ libswscale-dev \ libtbb2 \ libtbb-dev \ libjpeg-dev \ libpng-dev \ libtiff-dev \ libjasper-dev \ libavformat-dev \ libpq-dev \ libturbojpeg \ software-properties-common \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* RUN pip install --no-cache-dir \ numpy \ pandas \ PyYAML \ cycler \ dill \ h5py \ imgaug \ matplotlib \ opencv-contrib-python \ Pillow \ scikit-image \ scikit-learn \ scipy \ setuptools \ six \ tqdm \ ipython \ ipdb \ albumentations \ click \ jpeg4py \ addict \ colorama \ torchvision \ iterative-stratification RUN pip install --upgrade --no-cache-dir cython && pip install --no-cache-dir pycocotools==2.0.0 mmcv==0.2.5 ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019 Miras Amir Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: Makefile ================================================ APP_NAME=amirassov/kaggle-imaterialist CONTAINER_NAME=kaggle-imaterialist # HELP .PHONY: help help: ## This help. @awk 'BEGIN (FS = ":.*?## ") /^[a-zA-Z_-]+:.*?## / (printf "\033[36m%-30s\033[0m %s\n", $$1, $$2)' $(MAKEFILE_LIST) build: ## Build the container nvidia-docker build -t $(APP_NAME) . run-dgx: ## Run container in omen nvidia-docker run \ -itd \ --ipc=host \ --name=$(CONTAINER_NAME) \ -e DISPLAY=localhost:10.0 \ -v /tmp/.X11-unix:/tmp/.X11-unix \ -v /raid/data_share/amirassov/kaggle-imaterialist_data:/data \ -v /raid/data_share/amirassov/kaggle-imaterialist_dumps:/dumps \ -v $(shell pwd):/kaggle-imaterialist $(APP_NAME) bash run-omen: ## Run container in omen nvidia-docker run \ -itd \ --ipc=host \ --name=$(CONTAINER_NAME) \ -e DISPLAY=localhost:10.0 \ -v /tmp/.X11-unix:/tmp/.X11-unix \ -v /home/videoanalytics/data/kaggle-imaterialist_data:/data \ -v /home/videoanalytics/data/dumps:/dumps \ -v $(shell pwd):/kaggle-imaterialist $(APP_NAME) bash exec: ## Run a bash in a running container nvidia-docker exec -it $(CONTAINER_NAME) bash stop: ## Stop and remove a running container docker stop $(CONTAINER_NAME); docker rm $(CONTAINER_NAME) ================================================ FILE: README.md ================================================ # The First Place Solution of [iMaterialist (Fashion) 2019](https://www.kaggle.com/c/imaterialist-fashion-2019-FGVC6/) ![ensemble](figures/prediction.png) ## Solution My solution is based on the COCO challenge 2018 winners article: https://arxiv.org/abs/1901.07518. ### Model: [Hybrid Task Cascade with ResNeXt-101-64x4d-FPN backbone](https://github.com/open-mmlab/mmdetection/blob/master/configs/htc/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e.py). This model has a metric Mask mAP = 43.9 on COCO dataset. This is SOTA for instance segmentation. ### Validation: For validation, I used 450 training samples splitted using https://github.com/trent-b/iterative-stratification. ### Preprocessing: I applied light augmentatios from the [albumentations](https://github.com/albu/albumentations) library to the original image. Then I use multi-scale training: in each iteration, the scale of short edge is randomly sampled from [600, 1200], and the scale of long edge is fixed as 1900. ![preprocessing](figures/preproc.png) ### Training details: * pre-train from COCO * optimizer: `SGD(lr=0.03, momentum=0.9, weight_decay=0.0001)` * batch_size: 16 = 2 images per gpu x 8 gpus Tesla V100 * learning rate scheduler: ``` if iterations < 500: lr = warmup(warmup_ratio=1 / 3) if epochs == 10: lr = lr ∗ 0.1 if epochs == 18: lr = lr ∗ 0.1 if epochs > 20: stop ``` * training time: ~3 days. ### Parameter tuning: After the 12th epoch with the default parameters, the metric on LB was **0.21913**. Next, I tuned postprocessing thresholds using validation data: ``` rcnn=dict( score_thr=0.5, nms=dict(type='nms', iou_thr=0.3), max_per_img=100, mask_thr_binary=0.45 ) ``` This improved the metric on LB: **0.21913 -> 0.30011.** ### Test time augmentation: I use 3 scales as well as horizontal flip at test time and ensemble the results. Testing scales are (1000, 1600), (1200, 1900), (1400, 2200). I drew a TTA scheme for Mask R-CNN, which is implemented in mmdetection library. For Hybrid Task Cascade R-CNN, I rewrote this code. This improved the metric on LB: **0.30011 -> 0.31074.** ![TTA](figures/tta.png) ### Ensemble: I ensemble the 3 best checkpoints of my model. The ensemble scheme is similar to TTA. This improved the metric on LB: **0.31074 -> 0.31626.** ![ensemble](figures/ensemble.png) ### Attributes: I didn't use attributes at all: they were difficult to predict and the removal of classes with attributes greatly improved the metric. During the whole competition, I deleted classes with attributes: `{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}` U `{27, 28, 33}`. But two days before the end I read [the discussion] (https://www.kaggle.com/c/kaggle-imaterialist-fashion-2019-FGVC6/discussion/94811#latest548137) and added back classes `{27, 28, 33 }`. This improved the metric on LB: **0.31626 -> 0.33511.** ### Postprocessing for masks My post-processing algorithm for avoid intersections of masks of the same class: ```python def hard_overlaps_suppression(binary_mask, scores): not_overlap_mask = [] for i in np.argsort(scores)[::-1]: current_mask = binary_mask[..., i].copy() for mask in not_overlap_mask: current_mask = np.bitwise_and(current_mask, np.invert(mask)) not_overlap_mask.append(current_mask) return np.stack(not_overlap_mask, -1) ``` ### Small postprocessing: I deleted objects with an area of less than 20 pixels. This improved the metric on LB: **0.33511 -> 0.33621.** ## How to run? ### Docker ```bash make build make run-[server-name] make exec ``` ### Build mmdetection: ```bash cd mmdetection bash compile.sh python setup.py develop ``` ### Prepare pretrained weights: ```bash bash prepare_weights.sh ``` ### Data structure ``` /data/ ├── train/ │ └── ... ├── test/ │ └── ... └── train.csv.zip /dumps/ └── htc_dconv_c3-c5_mstrain_x101_64x4d_fpn_20e_1200x1900/ ``` Fix the [error](https://www.kaggle.com/c/kaggle-imaterialist-fashion-2019-FGVC6/discussion/91217#latest-529042) in `train.csv.zip.` ### Prepare annotations for mmdetection: ```bash cd scripts bash create_mmdetection_train.sh bash create_mmdetection_test.sh bash split.sh ``` ### Training the model: ```bash CUDA_VISIBLE_DEVICES=[list of gpus] bash dist_train.sh [config] [gpus] [--validate] ``` #### My best checkpoint: https://yadi.sk/d/-raqliq_ad6r_Q ### Test the model: ```bash CUDA_VISIBLE_DEVICES=[list of gpus] bash dist_test_ensemble.sh [config] [gpus] ``` ## References * https://github.com/open-mmlab/mmdetection ================================================ FILE: configs/htc_dconv_c3-c5_mstrain_x101_64x4d_fpn_20e_1200x1900.py ================================================ # model settings model = dict( type='HybridTaskCascade', num_stages=3, pretrained=None, interleaved=True, mask_info_flow=True, backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', dcn=dict( modulated=False, groups=64, deformable_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True)), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=47, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=47, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=47, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=47)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.5, nms=dict(type='nms', iou_thr=0.3), max_per_img=100, mask_thr_binary=0.45), keep_all_stages=False) # dataset settings dataset_type = 'CustomDataset' data_root = '/data/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=4, train=dict( type=dataset_type, ann_file=data_root + '/data/train_99_mmdetection.pkl', img_prefix=data_root + 'train/', img_scale=[(600, 1900), (1200, 1900)], multiscale_mode='range', img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=False, with_label=True, extra_aug=dict( type='Compose', transforms=[ dict( p=0.5, max_h_size=64, type='Cutout' ), dict( brightness_limit=0.3, contrast_limit=0.3, p=0.5, type='RandomBrightnessContrast' ), dict( p=0.5, quality_lower=80, quality_upper=99, type='JpegCompression' ), ], p=1.0 ) ), val=dict( type=dataset_type, ann_file=data_root + '/data/val_01_mmdetection.pkl', img_prefix=data_root + 'train/', img_scale=(1200, 1900), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=False, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'test_mmdetection.pkl', img_prefix=data_root + 'test/', img_scale=[(1000, 1600), (1200, 1900), (1400, 2200)], img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=1.0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.03, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[10, 18]) checkpoint_config = dict(interval=1) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), ]) # runtime settings total_epochs = 20 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = '/dumps/htc_dconv_c3-c5_mstrain_x101_64x4d_fpn_20e_1200x1900' load_from = '/dumps/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e_20190408-0e50669c_prune.pth' resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/.travis.yml ================================================ dist: trusty language: python install: - pip install flake8 python: - "3.5" - "3.6" script: - flake8 ================================================ FILE: mmdetection/GETTING_STARTED.md ================================================ # Getting Started This page provides basic tutorials about the usage of mmdetection. For installation instructions, please see [INSTALL.md](INSTALL.md). ## Inference with pretrained models We provide testing scripts to evaluate a whole dataset (COCO, PASCAL VOC, etc.), and also some high-level apis for easier integration to other projects. ### Test a dataset - [x] single GPU testing - [x] multiple GPU testing - [x] visualize detection results You can use the following commands to test a dataset. ```shell # single-gpu testing python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show] # multi-gpu testing ./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] ``` Optional arguments: - `RESULT_FILE`: Filename of the output results in pickle format. If not specified, the results will not be saved to a file. - `EVAL_METRICS`: Items to be evaluated on the results. Allowed values are: `proposal_fast`, `proposal`, `bbox`, `segm`, `keypoints`. - `--show`: If specified, detection results will be ploted on the images and shown in a new window. Only applicable for single GPU testing. Examples: Assume that you have already downloaded the checkpoints to `checkpoints/`. 1. Test Faster R-CNN and show the results. ```shell python tools/test.py configs/faster_rcnn_r50_fpn_1x.py \ checkpoints/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth \ --show ``` 2. Test Mask R-CNN and evaluate the bbox and mask AP. ```shell python tools/test.py configs/mask_rcnn_r50_fpn_1x.py \ checkpoints/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth \ --out results.pkl --eval bbox segm ``` 3. Test Mask R-CNN with 8 GPUs, and evaluate the bbox and mask AP. ```shell ./tools/dist_test.sh configs/mask_rcnn_r50_fpn_1x.py \ checkpoints/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth \ 8 --out results.pkl --eval bbox segm ``` ### High-level APIs for testing images. Here is an example of building the model and test given images. ```python from mmdet.apis import init_detector, inference_detector, show_result config_file = 'configs/faster_rcnn_r50_fpn_1x.py' checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth' # build the model from a config file and a checkpoint file model = init_detector(config_file, checkpoint_file) # test a single image and show the results img = 'test.jpg' # or img = mmcv.imread(img), which will only load it once result = inference_detector(model, img) show_result(img, result, model.CLASSES) # test a list of images and write the results to image files imgs = ['test1.jpg', 'test2.jpg'] for i, result in enumerate(inference_detector(model, imgs, device='cuda:0')): show_result(imgs[i], result, model.CLASSES, out_file='result_{}.jpg'.format(i)) ``` ## Train a model mmdetection implements distributed training and non-distributed training, which uses `MMDistributedDataParallel` and `MMDataParallel` respectively. All outputs (log files and checkpoints) will be saved to the working directory, which is specified by `work_dir` in the config file. **\*Important\***: The default learning rate in config files is for 8 GPUs. If you use less or more than 8 GPUs, you need to set the learning rate proportional to the GPU num, e.g., 0.01 for 4 GPUs and 0.04 for 16 GPUs. ### Train with a single GPU ```shell python tools/train.py ${CONFIG_FILE} ``` If you want to specify the working directory in the command, you can add an argument `--work_dir ${YOUR_WORK_DIR}`. ### Train with multiple GPUs ```shell ./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments] ``` Optional arguments are: - `--validate` (recommended): Perform evaluation at every k (default=1) epochs during the training. - `--work_dir ${WORK_DIR}`: Override the working directory specified in the config file. - `--resume_from ${CHECKPOINT_FILE}`: Resume from a previous checkpoint file. ### Train with multiple machines If you run mmdetection on a cluster managed with [slurm](https://slurm.schedmd.com/), you can just use the script `slurm_train.sh`. ```shell ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR} [${GPUS}] ``` Here is an example of using 16 GPUs to train Mask R-CNN on the dev partition. ```shell ./tools/slurm_train.sh dev mask_r50_1x configs/mask_rcnn_r50_fpn_1x.py /nfs/xxxx/mask_rcnn_r50_fpn_1x 16 ``` You can check [slurm_train.sh](tools/slurm_train.sh) for full arguments and environment variables. If you have just multiple machines connected with ethernet, you can refer to pytorch [launch utility](https://pytorch.org/docs/stable/distributed_deprecated.html#launch-utility). Usually it is slow if you do not have high speed networking like infiniband. ## How-to ### Use my own datasets The simplest way is to convert your dataset to existing dataset formats (COCO or PASCAL VOC). Here we show an example of adding a custom dataset of 5 classes, assuming it is also in COCO format. In `mmdet/datasets/my_dataset.py`: ```python from .coco import CocoDataset class MyDataset(CocoDataset): CLASSES = ('a', 'b', 'c', 'd', 'e') ``` In `mmdet/datasets/__init__.py`: ```python from .my_dataset import MyDataset ``` Then you can use `MyDataset` in config files, with the same API as CocoDataset. It is also fine if you do not want to convert the annotation format to COCO or PASCAL format. Actually, we define a simple annotation format and all existing datasets are processed to be compatible with it, either online or offline. The annotation of a dataset is a list of dict, each dict corresponds to an image. There are 3 field `filename` (relative path), `width`, `height` for testing, and an additional field `ann` for training. `ann` is also a dict containing at least 2 fields: `bboxes` and `labels`, both of which are numpy arrays. Some datasets may provide annotations like crowd/difficult/ignored bboxes, we use `bboxes_ignore` and `labels_ignore` to cover them. Here is an example. ``` [ { 'filename': 'a.jpg', 'width': 1280, 'height': 720, 'ann': { 'bboxes': (n, 4), 'labels': (n, ), 'bboxes_ignore': (k, 4), 'labels_ignore': (k, ) (optional field) } }, ... ] ``` There are two ways to work with custom datasets. - online conversion You can write a new Dataset class inherited from `CustomDataset`, and overwrite two methods `load_annotations(self, ann_file)` and `get_ann_info(self, idx)`, like [CocoDataset](mmdet/datasets/coco.py) and [VOCDataset](mmdet/datasets/voc.py). - offline conversion You can convert the annotation format to the expected format above and save it to a pickle or json file, like [pascal_voc.py](tools/convert_datasets/pascal_voc.py). Then you can simply use `CustomDataset`. ### Develop new components We basically categorize model components into 4 types. - backbone: usually a FCN network to extract feature maps, e.g., ResNet, MobileNet. - neck: the component between backbones and heads, e.g., FPN, PAFPN. - head: the component for specific tasks, e.g., bbox prediction and mask prediction. - roi extractor: the part for extracting RoI features from feature maps, e.g., RoI Align. Here we show how to develop new components with an example of MobileNet. 1. Create a new file `mmdet/models/backbones/mobilenet.py`. ```python import torch.nn as nn from ..registry import BACKBONES @BACKBONES.register class MobileNet(nn.Module): def __init__(self, arg1, arg2): pass def forward(x): # should return a tuple pass ``` 2. Import the module in `mmdet/models/backbones/__init__.py`. ```python from .mobilenet import MobileNet ``` 3. Use it in your config file. ```python model = dict( ... backbone=dict( type='MobileNet', arg1=xxx, arg2=xxx), ... ``` For more information on how it works, you can refer to [TECHNICAL_DETAILS.md](TECHNICAL_DETAILS.md) (TODO). ================================================ FILE: mmdetection/INSTALL.md ================================================ ## Installation ### Requirements - Linux - Python 3.5+ ([Say goodbye to Python2](https://python3statement.org/)) - PyTorch 1.0+ or PyTorch-nightly - CUDA 9.0+ - NCCL 2+ - GCC 4.9+ - [mmcv](https://github.com/open-mmlab/mmcv) We have tested the following versions of OS and softwares: - OS: Ubuntu 16.04/18.04 and CentOS 7.2 - CUDA: 9.0/9.2/10.0 - NCCL: 2.1.15/2.2.13/2.3.7/2.4.2 - GCC: 4.9/5.3/5.4/7.3 ### Install mmdetection a. Create a conda virtual environment and activate it. Then install Cython. ```shell conda create -n open-mmlab python=3.7 -y source activate open-mmlab conda install cython ``` b. Install PyTorch stable or nightly and torchvision following the [official instructions](https://pytorch.org/). c. Clone the mmdetection repository. ```shell git clone https://github.com/open-mmlab/mmdetection.git cd mmdetection ``` d. Compile cuda extensions. ```shell ./compile.sh ``` e. Install mmdetection (other dependencies will be installed automatically). ```shell python setup.py develop # or "pip install -e ." ``` Note: 1. It is recommended that you run the step e each time you pull some updates from github. If there are some updates of the C/CUDA codes, you also need to run step d. The git commit id will be written to the version number with step e, e.g. 0.6.0+2e7045c. The version will also be saved in trained models. 2. Following the above instructions, mmdetection is installed on `dev` mode, any modifications to the code will take effect without installing it again. ### Prepare COCO dataset. It is recommended to symlink the dataset root to `$MMDETECTION/data`. ``` mmdetection ├── mmdet ├── tools ├── configs ├── data │ ├── coco │ │ ├── annotations │ │ ├── train2017 │ │ ├── val2017 │ │ ├── test2017 │ ├── VOCdevkit │ │ ├── VOC2007 │ │ ├── VOC2012 ``` ### Scripts [Here](https://gist.github.com/hellock/bf23cd7348c727d69d48682cb6909047) is a script for setting up mmdetection with conda. ### Notice You can run `python(3) setup.py develop` or `pip install -e .` to install mmdetection if you want to make modifications to it frequently. If there are more than one mmdetection on your machine, and you want to use them alternatively. Please insert the following code to the main file ```python import os.path as osp import sys sys.path.insert(0, osp.join(osp.dirname(osp.abspath(__file__)), '../')) ``` or run the following command in the terminal of corresponding folder. ```shell export PYTHONPATH=`pwd`:$PYTHONPATH ``` ================================================ FILE: mmdetection/LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: mmdetection/MODEL_ZOO.md ================================================ # Benchmark and Model Zoo ## Environment ### Hardware - 8 NVIDIA Tesla V100 GPUs - Intel Xeon 4114 CPU @ 2.20GHz ### Software environment - Python 3.6 / 3.7 - PyTorch Nightly - CUDA 9.0.176 - CUDNN 7.0.4 - NCCL 2.1.15 ## Mirror sites We use AWS as the main site to host our model zoo, and maintain a mirror on aliyun. You can replace `https://s3.ap-northeast-2.amazonaws.com/open-mmlab` with `https://open-mmlab.oss-cn-beijing.aliyuncs.com` in model urls. ## Common settings - All FPN baselines and RPN-C4 baselines were trained using 8 GPU with a batch size of 16 (2 images per GPU). Other C4 baselines were trained using 8 GPU with a batch size of 8 (1 image per GPU). - All models were trained on `coco_2017_train`, and tested on the `coco_2017_val`. - We use distributed training and BN layer stats are fixed. - We adopt the same training schedules as Detectron. 1x indicates 12 epochs and 2x indicates 24 epochs, which corresponds to slightly less iterations than Detectron and the difference can be ignored. - All pytorch-style pretrained backbones on ImageNet are from PyTorch model zoo. - For fair comparison with other codebases, we report the GPU memory as the maximum value of `torch.cuda.max_memory_allocated()` for all 8 GPUs. Note that this value is usually less than what `nvidia-smi` shows. - We report the inference time as the overall time including data loading, network forwarding and post processing. ## Baselines More models with different backbones will be added to the model zoo. ### RPN | Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | AR1000 | Download | | :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :------------------------------------------------------------------------------------------------------------------------: | | R-50-C4 | caffe | 1x | - | - | 20.5 | 51.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_caffe_c4_1x-ea7d3428.pth) | | R-50-C4 | caffe | 2x | 2.2 | 0.17 | 20.3 | 52.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_caffe_c4_2x-c6d5b958.pth) | | R-50-C4 | pytorch | 1x | - | - | 20.1 | 50.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_c4_1x-eb38972b.pth) | | R-50-C4 | pytorch | 2x | - | - | 20.0 | 51.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_c4_2x-3d4c1e14.pth) | | R-50-FPN | caffe | 1x | 3.3 | 0.253 | 16.9 | 58.2 | - | | R-50-FPN | pytorch | 1x | 3.5 | 0.276 | 17.7 | 57.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_fpn_1x_20181010-4a9c0712.pth) | | R-50-FPN | pytorch | 2x | - | - | - | 57.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_fpn_2x_20181010-88a4a471.pth) | | R-101-FPN | caffe | 1x | 5.2 | 0.379 | 13.9 | 59.4 | - | | R-101-FPN | pytorch | 1x | 5.4 | 0.396 | 14.4 | 58.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r101_fpn_1x_20181129-f50da4bd.pth) | | R-101-FPN | pytorch | 2x | - | - | - | 59.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r101_fpn_2x_20181129-e42c6c9a.pth) | | X-101-32x4d-FPN | pytorch | 1x | 6.6 | 0.589 | 11.8 | 59.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_32x4d_fpn_1x_20181218-7e379d26.pth) | | X-101-32x4d-FPN | pytorch | 2x | - | - | - | 59.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_32x4d_fpn_2x_20181218-0510af40.pth) | | X-101-64x4d-FPN | pytorch | 1x | 9.5 | 0.955 | 8.3 | 59.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_64x4d_fpn_1x_20181218-c1a24f1f.pth) | | X-101-64x4d-FPN | pytorch | 2x | - | - | - | 60.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_64x4d_fpn_2x_20181218-c22bdd70.pth) | ### Faster R-CNN | Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | | :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :--------------------------------------------------------------------------------------------------------------------------------: | | R-50-C4 | caffe | 1x | - | - | 9.5 | 34.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_caffe_c4_1x-75ecfdfa.pth) | | R-50-C4 | caffe | 2x | 4.0 | 0.39 | 9.3 | 36.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_caffe_c4_2x-71c67f27.pth) | | R-50-C4 | pytorch | 1x | - | - | 9.3 | 33.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_c4_1x-642cf91f.pth) | | R-50-C4 | pytorch | 2x | - | - | 9.4 | 35.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_c4_2x-6e4fdf4f.pth) | | R-50-FPN | caffe | 1x | 3.6 | 0.333 | 13.5 | 36.6 | - | | R-50-FPN | pytorch | 1x | 3.8 | 0.353 | 13.6 | 36.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth) | | R-50-FPN | pytorch | 2x | - | - | - | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_fpn_2x_20181010-443129e1.pth) | | R-101-FPN | caffe | 1x | 5.5 | 0.465 | 11.5 | 38.8 | - | | R-101-FPN | pytorch | 1x | 5.7 | 0.474 | 11.9 | 38.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r101_fpn_1x_20181129-d1468807.pth) | | R-101-FPN | pytorch | 2x | - | - | - | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r101_fpn_2x_20181129-73e7ade7.pth) | | X-101-32x4d-FPN | pytorch | 1x | 6.9 | 0.672 | 10.3 | 40.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_32x4d_fpn_1x_20181218-ad81c133.pth) | | X-101-32x4d-FPN | pytorch | 2x | - | - | - | 40.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_32x4d_fpn_2x_20181218-0ed58946.pth) | | X-101-64x4d-FPN | pytorch | 1x | 9.8 | 1.040 | 7.3 | 41.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_64x4d_fpn_1x_20181218-c9c69c8f.pth) | | X-101-64x4d-FPN | pytorch | 2x | - | - | - | 40.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_64x4d_fpn_2x_20181218-fe94f9b8.pth) | ### Mask R-CNN | Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | | :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :------------------------------------------------------------------------------------------------------------------------------: | | R-50-C4 | caffe | 1x | - | - | 8.1 | 35.9 | 31.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_caffe_c4_1x-02a4ad3b.pth) | | R-50-C4 | caffe | 2x | 4.2 | 0.43 | 8.1 | 37.9 | 32.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_caffe_c4_2x-d150973a.pth) | | R-50-C4 | pytorch | 1x | - | - | 7.9 | 35.1 | 31.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_c4_1x-a83bdd40.pth) | | R-50-C4 | pytorch | 2x | - | - | 8.0 | 37.2 | 32.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_c4_2x-3cf169a9.pth) | | R-50-FPN | caffe | 1x | 3.8 | 0.430 | 10.2 | 37.4 | 34.3 | - | | R-50-FPN | pytorch | 1x | 3.9 | 0.453 | 10.6 | 37.3 | 34.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth) | | R-50-FPN | pytorch | 2x | - | - | - | 38.5 | 35.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_2x_20181010-41d35c05.pth) | | R-101-FPN | caffe | 1x | 5.7 | 0.534 | 9.4 | 39.9 | 36.1 | - | | R-101-FPN | pytorch | 1x | 5.8 | 0.571 | 9.5 | 39.4 | 35.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r101_fpn_1x_20181129-34ad1961.pth) | | R-101-FPN | pytorch | 2x | - | - | - | 40.3 | 36.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r101_fpn_2x_20181129-a254bdfc.pth) | | X-101-32x4d-FPN | pytorch | 1x | 7.1 | 0.759 | 8.3 | 41.1 | 37.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_32x4d_fpn_1x_20181218-44e635cc.pth) | | X-101-32x4d-FPN | pytorch | 2x | - | - | - | 41.4 | 37.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_32x4d_fpn_2x_20181218-f023dffa.pth) | | X-101-64x4d-FPN | pytorch | 1x | 10.0 | 1.102 | 6.5 | 42.1 | 38.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_64x4d_fpn_1x_20181218-cb159987.pth) | | X-101-64x4d-FPN | pytorch | 2x | - | - | - | 42.0 | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_64x4d_fpn_2x_20181218-ea936e44.pth) | ### Fast R-CNN (with pre-computed proposals) | Backbone | Style | Type | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | | :-------: | :-----: | :----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------------------------------------------------------------: | | R-50-C4 | caffe | Faster | 1x | - | - | 6.7 | 35.0 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_caffe_c4_1x-0ef9a60b.pth) | | R-50-C4 | caffe | Faster | 2x | 3.8 | 0.34 | 6.6 | 36.4 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_c4_2x-657a9fc6.pth) | | R-50-C4 | pytorch | Faster | 1x | - | - | 6.3 | 34.2 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_c4_1x-2bc00ca9.pth) | | R-50-C4 | pytorch | Faster | 2x | - | - | 6.1 | 35.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_caffe_c4_2x-9171d0fc.pth) | | R-50-FPN | caffe | Faster | 1x | 3.3 | 0.242 | 18.4 | 36.6 | - | - | | R-50-FPN | pytorch | Faster | 1x | 3.5 | 0.250 | 16.5 | 35.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_fpn_1x_20181010-08160859.pth) | | R-50-C4 | caffe | Mask | 1x | - | - | 8.1 | 35.9 | 31.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_caffe_c4_1x-b43f7f3c.pth) | | R-50-C4 | caffe | Mask | 2x | 4.2 | 0.43 | 8.1 | 37.9 | 32.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_caffe_c4_2x-e3580184.pth) | | R-50-C4 | pytorch | Mask | 1x | - | - | 7.9 | 35.1 | 31.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_c4_1x-bc7fa8c8.pth) | | R-50-C4 | pytorch | Mask | 2x | - | - | 8.0 | 37.2 | 32.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_fpn_2x_20181010-5048cb03.pth) | | R-50-FPN | pytorch | Faster | 2x | - | - | - | 37.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_fpn_2x_20181010-d263ada5.pth) | | R-101-FPN | caffe | Faster | 1x | 5.2 | 0.355 | 14.4 | 38.6 | - | - | | R-101-FPN | pytorch | Faster | 1x | 5.4 | 0.388 | 13.2 | 38.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r101_fpn_1x_20181129-ffaa2eb0.pth) | | R-101-FPN | pytorch | Faster | 2x | - | - | - | 38.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r101_fpn_2x_20181129-9dba92ce.pth) | | R-50-FPN | caffe | Mask | 1x | 3.4 | 0.328 | 12.8 | 37.3 | 34.5 | - | | R-50-FPN | pytorch | Mask | 1x | 3.5 | 0.346 | 12.7 | 36.8 | 34.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_fpn_1x_20181010-e030a38f.pth) | | R-50-FPN | pytorch | Mask | 2x | - | - | - | 37.9 | 34.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_fpn_2x_20181010-5048cb03.pth) | | R-101-FPN | caffe | Mask | 1x | 5.2 | 0.429 | 11.2 | 39.4 | 36.1 | - | | R-101-FPN | pytorch | Mask | 1x | 5.4 | 0.462 | 10.9 | 38.9 | 35.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r101_fpn_1x_20181129-2273fa9b.pth) | | R-101-FPN | pytorch | Mask | 2x | - | - | - | 39.9 | 36.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r101_fpn_2x_20181129-bf63ec5e.pth) | ### RetinaNet | Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | | :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :------------------------------------------------------------------------------------------------------------------------------: | | R-50-FPN | caffe | 1x | 3.4 | 0.285 | 12.5 | 35.8 | - | | R-50-FPN | pytorch | 1x | 3.6 | 0.308 | 12.1 | 35.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r50_fpn_1x_20181125-7b0c2548.pth) | | R-50-FPN | pytorch | 2x | - | - | - | 36.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r50_fpn_2x_20181125-8b724df2.pth) | | R-101-FPN | caffe | 1x | 5.3 | 0.410 | 10.4 | 37.8 | - | | R-101-FPN | pytorch | 1x | 5.5 | 0.429 | 10.9 | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r101_fpn_1x_20181129-f016f384.pth) | | R-101-FPN | pytorch | 2x | - | - | - | 38.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r101_fpn_2x_20181129-72c14526.pth) | | X-101-32x4d-FPN | pytorch | 1x | 6.7 | 0.632 | 9.3 | 39.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_32x4d_fpn_1x_20190501-967812ba.pth) | | X-101-32x4d-FPN | pytorch | 2x | - | - | - | 39.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_32x4d_fpn_2x_20181218-8596452d.pth) | | X-101-64x4d-FPN | pytorch | 1x | 9.6 | 0.993 | 7.0 | 40.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_64x4d_fpn_1x_20181218-a0a22662.pth) | | X-101-64x4d-FPN | pytorch | 2x | - | - | - | 39.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_64x4d_fpn_2x_20181218-5e88d045.pth) | ### Cascade R-CNN | Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | | :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------------------------: | | R-50-C4 | caffe | 1x | 8.7 | 0.92 | 5.0 | 38.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r50_caffe_c4_1x-7c85c62b.pth) | | R-50-FPN | caffe | 1x | 3.9 | 0.464 | 10.9 | 40.5 | - | | R-50-FPN | pytorch | 1x | 4.1 | 0.455 | 11.9 | 40.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r50_fpn_1x_20190501-3b6211ab.pth) | | R-50-FPN | pytorch | 20e | - | - | - | 41.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r50_fpn_20e_20181123-db483a09.pth) | | R-101-FPN | caffe | 1x | 5.8 | 0.569 | 9.6 | 42.4 | - | | R-101-FPN | pytorch | 1x | 6.0 | 0.584 | 10.3 | 42.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r101_fpn_1x_20181129-d64ebac7.pth) | | R-101-FPN | pytorch | 20e | - | - | - | 42.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r101_fpn_20e_20181129-b46dcede.pth) | | X-101-32x4d-FPN | pytorch | 1x | 7.2 | 0.770 | 8.9 | 43.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_32x4d_fpn_1x_20190501-af628be5.pth) | | X-101-32x4d-FPN | pytorch | 20e | - | - | - | 44.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_32x4d_fpn_2x_20181218-28f73c4c.pth) | | X-101-64x4d-FPN | pytorch | 1x | 10.0 | 1.133 | 6.7 | 44.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_64x4d_fpn_1x_20181218-e2dc376a.pth) | | X-101-64x4d-FPN | pytorch | 20e | - | - | - | 44.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_64x4d_fpn_2x_20181218-5add321e.pth) | ### Cascade Mask R-CNN | Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | | :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------------------------------: | | R-50-C4 | caffe | 1x | 9.1 | 0.99 | 4.5 | 39.3 | 32.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_caffe_c4_1x-f72cc254.pth) | | R-50-FPN | caffe | 1x | 5.1 | 0.692 | 7.6 | 40.9 | 35.5 | - | | R-50-FPN | pytorch | 1x | 5.3 | 0.683 | 7.4 | 41.2 | 35.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_fpn_1x_20181123-88b170c9.pth) | | R-50-FPN | pytorch | 20e | - | - | - | 42.3 | 36.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_fpn_20e_20181123-6e0c9713.pth) | | R-101-FPN | caffe | 1x | 7.0 | 0.803 | 7.2 | 43.1 | 37.2 | - | | R-101-FPN | pytorch | 1x | 7.2 | 0.807 | 6.8 | 42.6 | 37.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r101_fpn_1x_20181129-64f00602.pth) | | R-101-FPN | pytorch | 20e | - | - | - | 43.3 | 37.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r101_fpn_20e_20181129-cb85151d.pth) | | X-101-32x4d-FPN | pytorch | 1x | 8.4 | 0.976 | 6.6 | 44.4 | 38.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_32x4d_fpn_1x_20181218-1d944c89.pth) | | X-101-32x4d-FPN | pytorch | 20e | - | - | - | 44.7 | 38.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_32x4d_fpn_20e_20181218-761a3473.pth) | | X-101-64x4d-FPN | pytorch | 1x | 11.4 | 1.33 | 5.3 | 45.4 | 39.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_64x4d_fpn_1x_20190501-827e0a70.pth) | | X-101-64x4d-FPN | pytorch | 20e | - | - | - | 45.7 | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_64x4d_fpn_20e_20181218-630773a7.pth) | **Notes:** - The `20e` schedule in Cascade (Mask) R-CNN indicates decreasing the lr at 16 and 19 epochs, with a total of 20 epochs. ### Hybrid Task Cascade (HTC) | Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | | :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------------------------------------------------------------: | | R-50-FPN | pytorch | 1x | 7.4 | 0.936 | 4.1 | 42.1 | 37.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r50_fpn_1x_20190408-878c1712.pth) | | R-50-FPN | pytorch | 20e | - | - | - | 43.2 | 38.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r50_fpn_20e_20190408-c03b7015.pth) | | R-101-FPN | pytorch | 20e | 9.3 | 1.051 | 4.0 | 44.9 | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r101_fpn_20e_20190408-a2e586db.pth) | | X-101-32x4d-FPN | pytorch | 20e | 5.8 | 0.769 | 3.8 | 46.1 | 40.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_x101_32x4d_fpn_20e_20190408-9eae4d0b.pth) | | X-101-64x4d-FPN | pytorch | 20e | 7.5 | 1.120 | 3.5 | 46.9 | 40.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_x101_64x4d_fpn_20e_20190408-497f2561.pth) | **Notes:** - Please refer to [Hybrid Task Cascade](configs/htc/README.md) for details and more a powerful model (50.7/43.9). ### SSD | Backbone | Size | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | | :------: | :---: | :---: | :-----: | :------: | :-----------------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------------------: | | VGG16 | 300 | caffe | 120e | 3.5 | 0.256 | 25.9 / 34.6 | 25.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd300_coco_vgg16_caffe_120e_20181221-84d7110b.pth) | | VGG16 | 512 | caffe | 120e | 7.6 | 0.412 | 20.7 / 25.4 | 29.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd512_coco_vgg16_caffe_120e_20181221-d48b0be8.pth) | ### SSD (PASCAL VOC) | Backbone | Size | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | | :------: | :---: | :---: | :-----: | :------: | :-----------------: | :------------: | :----: | :------------------------------------------------------------------------------------------------------------------------------: | | VGG16 | 300 | caffe | 240e | 2.5 | 0.159 | 35.7 / 53.6 | 77.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd300_voc_vgg16_caffe_240e_20190501-7160d09a.pth) | | VGG16 | 512 | caffe | 240e | 4.3 | 0.214 | 27.5 / 35.9 | 80.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd512_voc_vgg16_caffe_240e_20190501-ff194be1.pth) | **Notes:** - `cudnn.benchmark` is set as `True` for SSD training and testing. - Inference time is reported for batch size = 1 and batch size = 8. - The speed difference between VOC and COCO is caused by model parameters and nms. ### Group Normalization (GN) Please refer to [Group Normalization](configs/gn/README.md) for details. ### Weight Standardization Please refer to [Weight Standardization](configs/gn+ws/README.md) for details. ### Deformable Convolution v2 Please refer to [Deformable Convolutional Networks](configs/dcn/README.md) for details. ## Comparison with Detectron and maskrcnn-benchmark We compare mmdetection with [Detectron](https://github.com/facebookresearch/Detectron) and [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark). The backbone used is R-50-FPN. In general, mmdetection has 3 advantages over Detectron. - **Higher performance** (especially in terms of mask AP) - **Faster training speed** - **Memory efficient** ### Performance Detectron and maskrcnn-benchmark use caffe-style ResNet as the backbone. We report results using both caffe-style (weights converted from [here](https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md#imagenet-pretrained-models)) and pytorch-style (weights from the official model zoo) ResNet backbone, indicated as *pytorch-style results* / *caffe-style results*. We find that pytorch-style ResNet usually converges slower than caffe-style ResNet, thus leading to slightly lower results in 1x schedule, but the final results of 2x schedule is higher.
Type Lr schd Detectron maskrcnn-benchmark mmdetection
RPN 1x 57.2 - 57.1 / 58.2
2x - - 57.6 / -
Faster R-CNN 1x 36.7 36.8 36.4 / 36.6
2x 37.9 - 37.7 / -
Mask R-CNN 1x 37.7 & 33.9 37.8 & 34.2 37.3 & 34.2 / 37.4 & 34.3
2x 38.6 & 34.5 - 38.5 & 35.1 / -
Fast R-CNN 1x 36.4 - 35.8 / 36.6
2x 36.8 - 37.1 / -
Fast R-CNN (w/mask) 1x 37.3 & 33.7 - 36.8 & 34.1 / 37.3 & 34.5
2x 37.7 & 34.0 - 37.9 & 34.8 / -
### Training Speed The training speed is measure with s/iter. The lower, the better.
Type Detectron (P1001) maskrcnn-benchmark (V100) mmdetection (V1002)
RPN 0.416 - 0.253
Faster R-CNN 0.544 0.353 0.333
Mask R-CNN 0.889 0.454 0.430
Fast R-CNN 0.285 - 0.242
Fast R-CNN (w/mask) 0.377 - 0.328
\*1. Facebook's Big Basin servers (P100/V100) is slightly faster than the servers we use. mmdetection can also run slightly faster on FB's servers. \*2. For fair comparison, we list the caffe-style results here. ### Inference Speed The inference speed is measured with fps (img/s) on a single GPU. The higher, the better.
Type Detectron (P100) maskrcnn-benchmark (V100) mmdetection (V100)
RPN 12.5 - 16.9
Faster R-CNN 10.3 7.9 13.5
Mask R-CNN 8.5 7.7 10.2
Fast R-CNN 12.5 - 18.4
Fast R-CNN (w/mask) 9.9 - 12.8
### Training memory
Type Detectron maskrcnn-benchmark mmdetection
RPN 6.4 - 3.3
Faster R-CNN 7.2 4.4 3.6
Mask R-CNN 8.6 5.2 3.8
Fast R-CNN 6.0 - 3.3
Fast R-CNN (w/mask) 7.9 - 3.4
There is no doubt that maskrcnn-benchmark and mmdetection is more memory efficient than Detectron, and the main advantage is PyTorch itself. We also perform some memory optimizations to push it forward. Note that Caffe2 and PyTorch have different apis to obtain memory usage with different implementations. For all codebases, `nvidia-smi` shows a larger memory usage than the reported number in the above table. ================================================ FILE: mmdetection/README.md ================================================ # mmdetection ## Introduction The master branch works with **PyTorch 1.1** or higher. If you would like to use PyTorch 0.4.1, please checkout to the [pytorch-0.4.1](https://github.com/open-mmlab/mmdetection/tree/pytorch-0.4.1) branch. mmdetection is an open source object detection toolbox based on PyTorch. It is a part of the open-mmlab project developed by [Multimedia Laboratory, CUHK](http://mmlab.ie.cuhk.edu.hk/). ![demo image](demo/coco_test_12510.jpg) ### Major features - **Modular Design** One can easily construct a customized object detection framework by combining different components. - **Support of multiple frameworks out of box** The toolbox directly supports popular detection frameworks, *e.g.* Faster RCNN, Mask RCNN, RetinaNet, etc. - **Efficient** All basic bbox and mask operations run on GPUs now. The training speed is nearly 2x faster than Detectron and comparable to maskrcnn-benchmark. - **State of the art** This was the codebase of the *MMDet* team, who won the [COCO Detection 2018 challenge](http://cocodataset.org/#detection-leaderboard). Apart from mmdetection, we also released a library [mmcv](https://github.com/open-mmlab/mmcv) for computer vision research, which is heavily depended on by this toolbox. ## License This project is released under the [Apache 2.0 license](LICENSE). ## Updates v0.6.0 (14/04/2019) - Up to 30% speedup compared to the model zoo. - Support both PyTorch stable and nightly version. - Replace NMS and SigmoidFocalLoss with Pytorch CUDA extensions. v0.6rc0(06/02/2019) - Migrate to PyTorch 1.0. v0.5.7 (06/02/2019) - Add support for Deformable ConvNet v2. (Many thanks to the authors and [@chengdazhi](https://github.com/chengdazhi)) - This is the last release based on PyTorch 0.4.1. v0.5.6 (17/01/2019) - Add support for Group Normalization. - Unify RPNHead and single stage heads (RetinaHead, SSDHead) with AnchorHead. v0.5.5 (22/12/2018) - Add SSD for COCO and PASCAL VOC. - Add ResNeXt backbones and detection models. - Refactoring for Samplers/Assigners and add OHEM. - Add VOC dataset and evaluation scripts. v0.5.4 (27/11/2018) - Add SingleStageDetector and RetinaNet. v0.5.3 (26/11/2018) - Add Cascade R-CNN and Cascade Mask R-CNN. - Add support for Soft-NMS in config files. v0.5.2 (21/10/2018) - Add support for custom datasets. - Add a script to convert PASCAL VOC annotations to the expected format. v0.5.1 (20/10/2018) - Add BBoxAssigner and BBoxSampler, the `train_cfg` field in config files are restructured. - `ConvFCRoIHead` / `SharedFCRoIHead` are renamed to `ConvFCBBoxHead` / `SharedFCBBoxHead` for consistency. ## Benchmark and model zoo Supported methods and backbones are shown in the below table. Results and models are available in the [Model zoo](MODEL_ZOO.md). | | ResNet | ResNeXt | SENet | VGG | |--------------------|:--------:|:--------:|:--------:|:--------:| | RPN | ✓ | ✓ | ☐ | ✗ | | Fast R-CNN | ✓ | ✓ | ☐ | ✗ | | Faster R-CNN | ✓ | ✓ | ☐ | ✗ | | Mask R-CNN | ✓ | ✓ | ☐ | ✗ | | Cascade R-CNN | ✓ | ✓ | ☐ | ✗ | | Cascade Mask R-CNN | ✓ | ✓ | ☐ | ✗ | | SSD | ✗ | ✗ | ✗ | ✓ | | RetinaNet | ✓ | ✓ | ☐ | ✗ | | Hybrid Task Cascade| ✓ | ✓ | ☐ | ✗ | | FCOS | ✓ | ✓ | ☐ | ✗ | Other features - [x] DCNv2 - [x] Group Normalization - [x] Weight Standardization - [x] OHEM - [x] Soft-NMS - [ ] Mixed Precision (FP16) Training (coming soon) ## Installation Please refer to [INSTALL.md](INSTALL.md) for installation and dataset preparation. ## Get Started Please see [GETTING_STARTED.md](GETTING_STARTED.md) for the basic usage of mmdetection. ## Citation If you use our codebase or models in your research, please cite this project. We will release a paper or technical report later. ``` @misc{mmdetection2018, author = {Kai Chen and Jiangmiao Pang and Jiaqi Wang and Yu Xiong and Xiaoxiao Li and Shuyang Sun and Wansen Feng and Ziwei Liu and Jianping Shi and Wanli Ouyang and Chen Change Loy and Dahua Lin}, title = {mmdetection}, howpublished = {\url{https://github.com/open-mmlab/mmdetection}}, year = {2018} } ``` ================================================ FILE: mmdetection/TECHNICAL_DETAILS.md ================================================ ## Overview In this section, we will introduce the main units of training a detector: data loading, model and iteration pipeline. ## Data loading Following typical conventions, we use `Dataset` and `DataLoader` for data loading with multiple workers. `Dataset` returns a dict of data items corresponding the arguments of models' forward method. Since the data in object detection may not be the same size (image size, gt bbox size, etc.), we introduce a new `DataContainer` type in `mmcv` to help collect and distribute data of different size. See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details. ## Model In mmdetection, model components are basically categorized as 4 types. - backbone: usually a FCN network to extract feature maps, e.g., ResNet. - neck: the part between backbones and heads, e.g., FPN, ASPP. - head: the part for specific tasks, e.g., bbox prediction and mask prediction. - roi extractor: the part for extracting features from feature maps, e.g., RoI Align. We also write implement some general detection pipelines with the above components, such as `SingleStageDetector` and `TwoStageDetector`. ### Build a model with basic components Following some basic pipelines (e.g., two-stage detectors), the model structure can be customized through config files with no pains. If we want to implement some new components, e.g, the path aggregation FPN structure in [Path Aggregation Network for Instance Segmentation](https://arxiv.org/abs/1803.01534), there are two things to do. 1. create a new file in `mmdet/models/necks/pafpn.py`. ```python class PAFPN(nn.Module): def __init__(self, in_channels, out_channels, num_outs, start_level=0, end_level=-1, add_extra_convs=False): pass def forward(self, inputs): # implementation is ignored pass ``` 2. modify the config file from ```python neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5) ``` to ```python neck=dict( type='PAFPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5) ``` We will release more components (backbones, necks, heads) for research purpose. ### Write a new model To write a new detection pipeline, you need to inherit from `BaseDetector`, which defines the following abstract methods. - `extract_feat()`: given an image batch of shape (n, c, h, w), extract the feature map(s). - `forward_train()`: forward method of the training mode - `simple_test()`: single scale testing without augmentation - `aug_test()`: testing with augmentation (multi-scale, flip, etc.) [TwoStageDetector](https://github.com/hellock/mmdetection/blob/master/mmdet/models/detectors/two_stage.py) is a good example which shows how to do that. ## Iteration pipeline We adopt distributed training for both single machine and multiple machines. Supposing that the server has 8 GPUs, 8 processes will be started and each process runs on a single GPU. Each process keeps an isolated model, data loader, and optimizer. Model parameters are only synchronized once at the begining. After a forward and backward pass, gradients will be allreduced among all GPUs, and the optimizer will update model parameters. Since the gradients are allreduced, the model parameter stays the same for all processes after the iteration. ================================================ FILE: mmdetection/compile.sh ================================================ #!/usr/bin/env bash PYTHON=${PYTHON:-"python"} echo "Building roi align op..." cd mmdet/ops/roi_align if [ -d "build" ]; then rm -r build fi $PYTHON setup.py build_ext --inplace echo "Building roi pool op..." cd ../roi_pool if [ -d "build" ]; then rm -r build fi $PYTHON setup.py build_ext --inplace echo "Building nms op..." cd ../nms if [ -d "build" ]; then rm -r build fi $PYTHON setup.py build_ext --inplace echo "Building dcn..." cd ../dcn if [ -d "build" ]; then rm -r build fi $PYTHON setup.py build_ext --inplace echo "Building sigmoid focal loss op..." cd ../sigmoid_focal_loss if [ -d "build" ]; then rm -r build fi $PYTHON setup.py build_ext --inplace ================================================ FILE: mmdetection/configs/cascade_mask_rcnn_r101_fpn_1x.py ================================================ # model settings model = dict( type='CascadeRCNN', num_stages=3, pretrained='modelzoo://resnet101', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_mask_rcnn_r101_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/cascade_mask_rcnn_r50_caffe_c4_1x.py ================================================ # model settings norm_cfg = dict(type='BN', requires_grad=False) model = dict( type='CascadeRCNN', num_stages=3, pretrained='open-mmlab://resnet50_caffe', backbone=dict( type='ResNet', depth=50, num_stages=3, strides=(1, 2, 2), dilations=(1, 1, 1), out_indices=(2, ), frozen_stages=1, norm_cfg=norm_cfg, norm_eval=True, style='caffe'), shared_head=dict( type='ResLayer', depth=50, stage=3, stride=2, dilation=1, style='caffe', norm_cfg=norm_cfg, norm_eval=True), rpn_head=dict( type='RPNHead', in_channels=1024, feat_channels=1024, anchor_scales=[2, 4, 8, 16, 32], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[16], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=1024, featmap_strides=[16]), bbox_head=[ dict( type='BBoxHead', with_avg_pool=True, roi_feat_size=7, in_channels=2048, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='BBoxHead', with_avg_pool=True, roi_feat_size=7, in_channels=2048, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='BBoxHead', with_avg_pool=True, roi_feat_size=7, in_channels=2048, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=None, mask_head=dict( type='FCNMaskHead', num_convs=0, in_channels=2048, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=12000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=14, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=14, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=14, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=6000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) data = dict( imgs_per_gpu=1, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_mask_rcnn_r50_caffe_c4_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/cascade_mask_rcnn_r50_fpn_1x.py ================================================ # model settings model = dict( type='CascadeRCNN', num_stages=3, pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_mask_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/cascade_mask_rcnn_x101_32x4d_fpn_1x.py ================================================ # model settings model = dict( type='CascadeRCNN', num_stages=3, pretrained='open-mmlab://resnext101_32x4d', backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_mask_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/cascade_mask_rcnn_x101_64x4d_fpn_1x.py ================================================ # model settings model = dict( type='CascadeRCNN', num_stages=3, pretrained='open-mmlab://resnext101_64x4d', backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_mask_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/cascade_rcnn_r101_fpn_1x.py ================================================ # model settings model = dict( type='CascadeRCNN', num_stages=3, pretrained='modelzoo://resnet101', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ]) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_rcnn_r101_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/cascade_rcnn_r50_caffe_c4_1x.py ================================================ # model settings norm_cfg = dict(type='BN', requires_grad=False) model = dict( type='CascadeRCNN', num_stages=3, pretrained='open-mmlab://resnet50_caffe', backbone=dict( type='ResNet', depth=50, num_stages=3, strides=(1, 2, 2), dilations=(1, 1, 1), out_indices=(2, ), frozen_stages=1, norm_cfg=norm_cfg, norm_eval=True, style='caffe'), shared_head=dict( type='ResLayer', depth=50, stage=3, stride=2, dilation=1, style='caffe', norm_cfg=norm_cfg, norm_eval=True), rpn_head=dict( type='RPNHead', in_channels=1024, feat_channels=1024, anchor_scales=[2, 4, 8, 16, 32], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[16], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=1024, featmap_strides=[16]), bbox_head=[ dict( type='BBoxHead', with_avg_pool=True, roi_feat_size=7, in_channels=2048, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='BBoxHead', with_avg_pool=True, roi_feat_size=7, in_channels=2048, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='BBoxHead', with_avg_pool=True, roi_feat_size=7, in_channels=2048, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ]) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=12000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=14, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=14, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=14, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=6000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) data = dict( imgs_per_gpu=1, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_rcnn_r50_c4_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/cascade_rcnn_r50_fpn_1x.py ================================================ # model settings model = dict( type='CascadeRCNN', num_stages=3, pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ]) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/cascade_rcnn_x101_32x4d_fpn_1x.py ================================================ # model settings model = dict( type='CascadeRCNN', num_stages=3, pretrained='open-mmlab://resnext101_32x4d', backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ]) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/cascade_rcnn_x101_64x4d_fpn_1x.py ================================================ # model settings model = dict( type='CascadeRCNN', num_stages=3, pretrained='open-mmlab://resnext101_64x4d', backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ]) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/dcn/README.md ================================================ # Deformable Convolutional Networks # Introduction ``` @inproceedings{dai2017deformable, title={Deformable Convolutional Networks}, author={Dai, Jifeng and Qi, Haozhi and Xiong, Yuwen and Li, Yi and Zhang, Guodong and Hu, Han and Wei, Yichen}, booktitle={Proceedings of the IEEE international conference on computer vision}, year={2017} } @article{zhu2018deformable, title={Deformable ConvNets v2: More Deformable, Better Results}, author={Zhu, Xizhou and Hu, Han and Lin, Stephen and Dai, Jifeng}, journal={arXiv preprint arXiv:1811.11168}, year={2018} } ``` ## Results and Models | Backbone | Model | Style | Conv | Pool | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | |:---------:|:------------:|:-------:|:-------------:|:------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:-------:|:--------:| | R-50-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 3.9 | 0.594 | 10.2 | 40.0 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_dconv_c3-c5_r50_fpn_1x_20190125-e41688c9.pth) | | R-50-FPN | Faster | pytorch | mdconv(c3-c5) | - | 1x | 3.7 | 0.598 | 10.0 | 40.2 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_mdconv_c3-c5_r50_fpn_1x_20190125-1b768045.pth) | | R-50-FPN | Faster | pytorch | - | dpool | 1x | 4.6 | 0.714 | 8.7 | 37.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_dpool_r50_fpn_1x_20190125-f4fc1d70.pth) | | R-50-FPN | Faster | pytorch | - | mdpool | 1x | 5.2 | 0.769 | 8.2 | 38.0 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_mdpool_r50_fpn_1x_20190125-473d0f3d.pth) | | R-101-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 5.8 | 0.811 | 8.0 | 42.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_dconv_c3-c5_r101_fpn_1x_20190125-a7e31b65.pth) | | X-101-32x4d-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 7.1 | 1.126 | 6.6 | 43.4 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_dconv_c3-c5_x101_32x4d_fpn_1x_20190201-6d46376f.pth) | | R-50-FPN | Mask | pytorch | dconv(c3-c5) | - | 1x | 4.5 | 0.712 | 7.7 | 41.1 | 37.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/mask_rcnn_dconv_c3-c5_r50_fpn_1x_20190125-4f94ff79.pth) | | R-50-FPN | Mask | pytorch | mdconv(c3-c5) | - | 1x | 4.5 | 0.712 | 7.7 | 41.3 | 37.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/mask_rcnn_mdconv_c3-c5_r50_fpn_1x_20190125-c5601dc3.pth) | | R-101-FPN | Mask | pytorch | dconv(c3-c5) | - | 1x | 6.4 | 0.939 | 6.5 | 43.2 | 38.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/mask_rcnn_dconv_c3-c5_r101_fpn_1x_20190125-decb6db5.pth) | | R-50-FPN | Cascade | pytorch | dconv(c3-c5) | - | 1x | 4.4 | 0.660 | 7.6 | 44.0 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/cascade_rcnn_dconv_c3-c5_r50_fpn_1x_20190125-dfa53166.pth) | | R-101-FPN | Cascade | pytorch | dconv(c3-c5) | - | 1x | 6.3 | 0.881 | 6.8 | 45.0 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/cascade_rcnn_dconv_c3-c5_r101_fpn_1x_20190125-aaa877cc.pth) | | R-50-FPN | Cascade Mask | pytorch | dconv(c3-c5) | - | 1x | 6.6 | 0.942 | 5.7 | 44.4 | 38.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/cascade_mask_rcnn_dconv_c3-c5_r50_fpn_1x_20190125-09d8a443.pth) | | R-101-FPN | Cascade Mask | pytorch | dconv(c3-c5) | - | 1x | 8.5 | 1.156 | 5.1 | 45.7 | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/cascade_mask_rcnn_dconv_c3-c5_r101_fpn_1x_20190125-0d62c190.pth) | **Notes:** - `dconv` and `mdconv` denote (modulated) deformable convolution, `c3-c5` means adding dconv in resnet stage 3 to 5. `dpool` and `mdpool` denote (modulated) deformable roi pooling. - The dcn ops are modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch, which should be more memory efficient and slightly faster. - **Memory, Train/Inf time is outdated.** ================================================ FILE: mmdetection/configs/dcn/cascade_mask_rcnn_dconv_c3-c5_r50_fpn_1x.py ================================================ # model settings model = dict( type='CascadeRCNN', num_stages=3, pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', dcn=dict( modulated=False, deformable_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True)), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_mask_rcnn_dconv_c3-c5_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/dcn/cascade_rcnn_dconv_c3-c5_r50_fpn_1x.py ================================================ # model settings model = dict( type='CascadeRCNN', num_stages=3, pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', dcn=dict( modulated=False, deformable_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True)), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ]) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cascade_rcnn_dconv_c3-c5_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/dcn/faster_rcnn_dconv_c3-c5_r50_fpn_1x.py ================================================ # model settings model = dict( type='FasterRCNN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', dcn=dict( modulated=False, deformable_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True)), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) ) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_dconv_c3-c5_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/dcn/faster_rcnn_dconv_c3-c5_x101_32x4d_fpn_1x.py ================================================ # model settings model = dict( type='FasterRCNN', pretrained='open-mmlab://resnext101_32x4d', backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', dcn=dict( modulated=False, groups=32, deformable_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True)), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) ) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_dconv_c3-c5_x101_32x4d_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/dcn/faster_rcnn_dpool_r50_fpn_1x.py ================================================ # model settings model = dict( type='FasterRCNN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict( type='DeformRoIPoolingPack', out_size=7, out_channels=256, no_trans=False, group_size=1, trans_std=0.1), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) ) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_dpool_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/dcn/faster_rcnn_mdconv_c3-c5_r50_fpn_1x.py ================================================ # model settings model = dict( type='FasterRCNN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', dcn=dict( modulated=True, deformable_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True)), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) ) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_mdconv_c3-c5_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/dcn/faster_rcnn_mdpool_r50_fpn_1x.py ================================================ # model settings model = dict( type='FasterRCNN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict( type='ModulatedDeformRoIPoolingPack', out_size=7, out_channels=256, no_trans=False, group_size=1, trans_std=0.1), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) ) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_mdpool_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/dcn/mask_rcnn_dconv_c3-c5_r50_fpn_1x.py ================================================ # model settings model = dict( type='MaskRCNN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', dcn=dict( modulated=False, deformable_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True)), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_dconv_c3-c5_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/fast_mask_rcnn_r101_fpn_1x.py ================================================ # model settings model = dict( type='FastRCNN', pretrained='modelzoo://resnet101', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl', flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/fast_mask_rcnn_r101_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/fast_mask_rcnn_r50_caffe_c4_1x.py ================================================ # model settings norm_cfg = dict(type='BN', requires_grad=False) model = dict( type='FastRCNN', pretrained='open-mmlab://resnet50_caffe', backbone=dict( type='ResNet', depth=50, num_stages=3, strides=(1, 2, 2), dilations=(1, 1, 1), out_indices=(2, ), frozen_stages=1, norm_cfg=norm_cfg, norm_eval=True, style='caffe'), shared_head=dict( type='ResLayer', depth=50, stage=3, stride=2, dilation=1, style='caffe', norm_cfg=norm_cfg, norm_eval=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=1024, featmap_strides=[16]), bbox_head=dict( type='BBoxHead', with_avg_pool=True, roi_feat_size=7, in_channels=2048, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False), mask_roi_extractor=None, mask_head=dict( type='FCNMaskHead', num_convs=0, in_channels=2048, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=14, pos_weight=-1, debug=False)) test_cfg = dict( rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) data = dict( imgs_per_gpu=1, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, proposal_file=data_root + 'proposals/rpn_r50_c4_1x_train2017.pkl', flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, proposal_file=data_root + 'proposals/rpn_r50_c4_1x_val2017.pkl', flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, proposal_file=data_root + 'proposals/rpn_r50_c4_1x_val2017.pkl', flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/fast_mask_rcnn_r50_caffe_c4_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/fast_mask_rcnn_r50_fpn_1x.py ================================================ # model settings model = dict( type='FastRCNN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl', flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/fast_mask_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/fast_rcnn_r101_fpn_1x.py ================================================ # model settings model = dict( type='FastRCNN', pretrained='modelzoo://resnet101', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl', flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/fast_rcnn_r101_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/fast_rcnn_r50_caffe_c4_1x.py ================================================ # model settings norm_cfg = dict(type='BN', requires_grad=False) model = dict( type='FastRCNN', pretrained='open-mmlab://resnet50_caffe', backbone=dict( type='ResNet', depth=50, num_stages=3, strides=(1, 2, 2), dilations=(1, 1, 1), out_indices=(2, ), frozen_stages=1, norm_cfg=norm_cfg, norm_eval=True, style='caffe'), shared_head=dict( type='ResLayer', depth=50, stage=3, stride=2, dilation=1, style='caffe', norm_cfg=norm_cfg, norm_eval=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=1024, featmap_strides=[16]), bbox_head=dict( type='BBoxHead', with_avg_pool=True, roi_feat_size=7, in_channels=2048, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) data = dict( imgs_per_gpu=1, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, proposal_file=data_root + 'proposals/rpn_r50_c4_1x_train2017.pkl', flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, proposal_file=data_root + 'proposals/rpn_r50_c4_1x_val2017.pkl', flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, proposal_file=data_root + 'proposals/rpn_r50_c4_1x_val2017.pkl', flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/fast_rcnn_r50_caffe_c4_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/fast_rcnn_r50_fpn_1x.py ================================================ # model settings model = dict( type='FastRCNN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl', flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/fast_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/faster_rcnn_ohem_r50_fpn_1x.py ================================================ # model settings model = dict( type='FasterRCNN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='OHEMSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) ) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/faster_rcnn_r101_fpn_1x.py ================================================ # model settings model = dict( type='FasterRCNN', pretrained='modelzoo://resnet101', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) ) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_r101_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/faster_rcnn_r50_caffe_c4_1x.py ================================================ # model settings norm_cfg = dict(type='BN', requires_grad=False) model = dict( type='FasterRCNN', pretrained='open-mmlab://resnet50_caffe', backbone=dict( type='ResNet', depth=50, num_stages=3, strides=(1, 2, 2), dilations=(1, 1, 1), out_indices=(2, ), frozen_stages=1, norm_cfg=norm_cfg, norm_eval=True, style='caffe'), shared_head=dict( type='ResLayer', depth=50, stage=3, stride=2, dilation=1, style='caffe', norm_cfg=norm_cfg, norm_eval=True), rpn_head=dict( type='RPNHead', in_channels=1024, feat_channels=1024, anchor_scales=[2, 4, 8, 16, 32], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[16], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=1024, featmap_strides=[16]), bbox_head=dict( type='BBoxHead', with_avg_pool=True, roi_feat_size=7, in_channels=2048, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=12000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=6000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) data = dict( imgs_per_gpu=1, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_r50_caffe_c4_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/faster_rcnn_r50_fpn_1x.py ================================================ # model settings model = dict( type='FasterRCNN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) ) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/faster_rcnn_x101_32x4d_fpn_1x.py ================================================ # model settings model = dict( type='FasterRCNN', pretrained='open-mmlab://resnext101_32x4d', backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) ) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/faster_rcnn_x101_64x4d_fpn_1x.py ================================================ # model settings model = dict( type='FasterRCNN', pretrained='open-mmlab://resnext101_64x4d', backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) ) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/fcos/README.md ================================================ # FCOS: Fully Convolutional One-Stage Object Detection ## Introduction ``` @article{tian2019fcos, title={FCOS: Fully Convolutional One-Stage Object Detection}, author={Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong}, journal={arXiv preprint arXiv:1904.01355}, year={2019} } ``` ## Results and Models | Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | |:---------:|:-------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:--------:| | R-50-FPN | caffe | 1x | 6.9 | 0.396 | 13.6 | 36.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r50_fpn_1x-9f253a93.pth) | | R-50-FPN | caffe | 2x | - | - | - | 38.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r50_fpn_2x-f7329d80.pth) | | R-101-FPN | caffe | 1x | 10.4 | 0.558 | 11.6 | 39.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r101_fpn_1x-e4889733.pth) | | R-101-FPN | caffe | 2x | - | - | - | 40.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r101_fpn_2x-42e6f62d.pth) | | X-101-64x4d-FPN | caffe |2x | 9.7 | 0.892 | 7.0 | 42.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_x101_64x4d_fpn_2x-a36c0872.pth) | **Notes:** - To be consistent with the author's implementation, we use 4 GPUs with 4 images/GPU for R-50 and R-101 models, and 8 GPUs with 2 image/GPU for X-101 models. ================================================ FILE: mmdetection/configs/fcos/fcos_mstrain_640_800_r101_caffe_fpn_gn_2x_4gpu.py ================================================ # model settings model = dict( type='FCOS', pretrained='open-mmlab://resnet101_caffe', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), style='caffe'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs=True, extra_convs_on_inputs=False, # use P5 num_outs=5, relu_before_extra_convs=True), bbox_head=dict( type='FCOSHead', num_classes=81, in_channels=256, stacked_convs=4, feat_channels=256, strides=[8, 16, 32, 64, 128])) # training and testing settings train_cfg = dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.4, min_pos_iou=0, ignore_iof_thr=-1), smoothl1_beta=0.11, gamma=2.0, alpha=0.25, allowed_border=-1, pos_weight=-1, debug=False) test_cfg = dict( nms_pre=1000, min_bbox_size=0, score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) data = dict( imgs_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=[(1333, 640), (1333, 800)], multiscale_mode='value', img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False, test_mode=True)) # optimizer optimizer = dict( type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.)) optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict( policy='step', warmup='constant', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 22]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 device_ids = range(4) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/fcos_mstrain_640_800_r101_caffe_fpn_gn_2x_4gpu' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/fcos/fcos_mstrain_640_800_x101_64x4d_fpn_gn_2x.py ================================================ # model settings model = dict( type='FCOS', pretrained='open-mmlab://resnext101_64x4d', backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs=True, extra_convs_on_inputs=False, # use P5 num_outs=5, relu_before_extra_convs=True), bbox_head=dict( type='FCOSHead', num_classes=81, in_channels=256, stacked_convs=4, feat_channels=256, strides=[8, 16, 32, 64, 128])) # training and testing settings train_cfg = dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.4, min_pos_iou=0, ignore_iof_thr=-1), smoothl1_beta=0.11, gamma=2.0, alpha=0.25, allowed_border=-1, pos_weight=-1, debug=False) test_cfg = dict( nms_pre=1000, min_bbox_size=0, score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=[(1333, 640), (1333, 800)], multiscale_mode='value', img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False, test_mode=True)) # optimizer optimizer = dict( type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.)) optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict( policy='step', warmup='constant', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 22]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 device_ids = range(8) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/fcos_mstrain_640_800_x101_64x4d_fpn_gn_2x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/fcos/fcos_r50_caffe_fpn_gn_1x_4gpu.py ================================================ # model settings model = dict( type='FCOS', pretrained='open-mmlab://resnet50_caffe', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), style='caffe'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs=True, extra_convs_on_inputs=False, # use P5 num_outs=5, relu_before_extra_convs=True), bbox_head=dict( type='FCOSHead', num_classes=81, in_channels=256, stacked_convs=4, feat_channels=256, strides=[8, 16, 32, 64, 128])) # training and testing settings train_cfg = dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.4, min_pos_iou=0, ignore_iof_thr=-1), smoothl1_beta=0.11, gamma=2.0, alpha=0.25, allowed_border=-1, pos_weight=-1, debug=False) test_cfg = dict( nms_pre=1000, min_bbox_size=0, score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) data = dict( imgs_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False, test_mode=True)) # optimizer optimizer = dict( type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.)) optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict( policy='step', warmup='constant', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 device_ids = range(4) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/fcos_r50_caffe_fpn_gn_1x_4gpu' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/gn/README.md ================================================ # Group Normalization ## Introduction ``` @inproceedings{wu2018group, title={Group Normalization}, author={Wu, Yuxin and He, Kaiming}, booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, year={2018} } ``` ## Results and Models | Backbone | model | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | |:-------------:|:----------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:-------:|:--------:| | R-50-FPN (d) | Mask R-CNN | 2x | 7.2 | 0.806 | 5.4 | 39.8 | 36.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/gn/mask_rcnn_r50_fpn_gn_2x_20180113-86832cf2.pth) | | R-50-FPN (d) | Mask R-CNN | 3x | 7.2 | 0.806 | 5.4 | 40.1 | 36.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/gn/mask_rcnn_r50_fpn_gn_3x_20180113-8e82f48d.pth) | | R-101-FPN (d) | Mask R-CNN | 2x | 9.9 | 0.970 | 4.8 | 41.5 | 37.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/gn/mask_rcnn_r101_fpn_gn_2x_20180113-9598649c.pth) | | R-101-FPN (d) | Mask R-CNN | 3x | 9.9 | 0.970 | 4.8 | 41.6 | 37.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/gn/mask_rcnn_r101_fpn_gn_3x_20180113-a14ffb96.pth) | | R-50-FPN (c) | Mask R-CNN | 2x | 7.2 | 0.806 | 5.4 | 39.7 | 35.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/gn/mask_rcnn_r50_fpn_gn_contrib_2x_20180113-ec93305c.pth) | | R-50-FPN (c) | Mask R-CNN | 3x | 7.2 | 0.806 | 5.4 | 40.0 | 36.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/gn/mask_rcnn_r50_fpn_gn_contrib_3x_20180113-9d230cab.pth) | **Notes:** - (d) means pretrained model converted from Detectron, and (c) means the contributed model pretrained by [@thangvubk](https://github.com/thangvubk). - The `3x` schedule is epoch [28, 34, 36]. - **Memory, Train/Inf time is outdated.** ================================================ FILE: mmdetection/configs/gn/mask_rcnn_r101_fpn_gn_2x.py ================================================ # model settings norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) model = dict( type='MaskRCNN', pretrained='open-mmlab://detectron/resnet101_gn', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', norm_cfg=norm_cfg), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5, norm_cfg=norm_cfg), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='ConvFCBBoxHead', num_shared_convs=4, num_shared_fcs=1, in_channels=256, conv_out_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False, norm_cfg=norm_cfg), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81, norm_cfg=norm_cfg)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 22]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_r101_fpn_gn_2x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/gn/mask_rcnn_r50_fpn_gn_2x.py ================================================ # model settings norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) model = dict( type='MaskRCNN', pretrained='open-mmlab://detectron/resnet50_gn', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', norm_cfg=norm_cfg), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5, norm_cfg=norm_cfg), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='ConvFCBBoxHead', num_shared_convs=4, num_shared_fcs=1, in_channels=256, conv_out_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False, norm_cfg=norm_cfg), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81, norm_cfg=norm_cfg)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 22]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_r50_fpn_gn_2x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/gn/mask_rcnn_r50_fpn_gn_contrib_2x.py ================================================ # model settings norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) model = dict( type='MaskRCNN', pretrained='open-mmlab://contrib/resnet50_gn', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', norm_cfg=norm_cfg), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5, norm_cfg=norm_cfg), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='ConvFCBBoxHead', num_shared_convs=4, num_shared_fcs=1, in_channels=256, conv_out_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False, norm_cfg=norm_cfg), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81, norm_cfg=norm_cfg)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 22]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_r50_fpn_gn_contrib_2x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/gn+ws/README.md ================================================ # Weight Standardization ## Introduction ``` @article{weightstandardization, author = {Siyuan Qiao and Huiyu Wang and Chenxi Liu and Wei Shen and Alan Yuille}, title = {Weight Standardization}, journal = {arXiv preprint arXiv:1903.10520}, year = {2019}, } ``` ## Results and Models Faster R-CNN | Backbone | Style | Normalization | Lr schd | box AP | mask AP | Download | |:---------:|:-------:|:-------------:|:-------:|:------:|:-------:|:--------:| | R-50-FPN | pytorch | GN | 1x | 37.8 | - | - | | R-50-FPN | pytorch | GN+WS | 1x | 38.9 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/faster_rcnn_r50_fpn_gn_ws_1x_20190418-935d00b6.pth) | | R-101-FPN | pytorch | GN | 1x | 39.8 | - | - | | R-101-FPN | pytorch | GN+WS | 1x | 41.4 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/faster_rcnn_r101_fpn_gn_ws_1x_20190419-728705ec.pth) | | X-50-32x4d-FPN | pytorch | GN | 1x | 36.5 | - | - | | X-50-32x4d-FPN | pytorch | GN+WS | 1x | 39.9 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/faster_rcnn_x50_32x4d_fpn_gn_ws_1x_20190419-4e61072b.pth) | | X-101-32x4d-FPN | pytorch | GN | 1x | 33.2 | - | - | | X-101-32x4d-FPN | pytorch | GN+WS | 1x | 41.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/faster_rcnn_x101_32x4d_fpn_gn_ws_1x_20190419-c78e5583.pth) | Mask R-CNN | Backbone | Style | Normalization | Lr schd | box AP | mask AP | Download | |:---------:|:-------:|:-------------:|:-------:|:------:|:-------:|:--------:| | R-50-FPN | pytorch | GN | 2x | 39.9 | 36.0 | - | | R-50-FPN | pytorch | GN+WS | 2x | 40.3 | 36.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/mask_rcnn_r50_fpn_gn_ws_2x_20190419-9ec97bbb.pth) | | R-101-FPN | pytorch | GN | 2x | 41.6 | 37.3 | - | | R-101-FPN | pytorch | GN+WS | 2x | 42.0 | 37.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/mask_rcnn_r101_fpn_gn_ws_2x_20190419-bc7399a6.pth) | | X-50-32x4d-FPN | pytorch | GN | 2x | 39.2 | 35.5 | - | | X-50-32x4d-FPN | pytorch | GN+WS | 2x | 40.7 | 36.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/mask_rcnn_x50_32x4d_fpn_gn_ws_2x_20190419-2110205e.pth) | | X-101-32x4d-FPN | pytorch | GN | 2x | 36.4 | 33.1 | - | | X-101-32x4d-FPN | pytorch | GN+WS | 2x | 42.1 | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/mask_rcnn_x101_32x4d_fpn_gn_ws_2x_20190419-7777b15f.pth) | | R-50-FPN | pytorch | GN | 20-23-24e | 40.6 | 36.6 | - | | R-50-FPN | pytorch | GN+WS | 20-23-24e | 41.1 | 37.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/mask_rcnn_r50_fpn_gn_ws_20_23_24e_20190425-1d9e499e.pth) | | R-101-FPN | pytorch | GN | 20-23-24e | 42.3 | 38.1 | - | | R-101-FPN | pytorch | GN+WS | 20-23-24e | 43.0 | 38.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/mask_rcnn_r101_fpn_gn_ws_20_23_24e_20190425-66cb3792.pth) | | X-50-32x4d-FPN | pytorch | GN | 20-23-24e | 39.6 | 35.9 | - | | X-50-32x4d-FPN | pytorch | GN+WS | 20-23-24e | 41.9 | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/mask_rcnn_x50_32x4d_fpn_gn_ws_20_23_24e_20190425-d01e2200.pth) | | X-101-32x4d-FPN | pytorch | GN | 20-23-24e | 36.6 | 33.4 | - | | X-101-32x4d-FPN | pytorch | GN+WS | 20-23-24e | 43.4 | 38.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ws/mask_rcnn_x101_32x4d_fpn_gn_ws_20_23_24e_20190425-1ff3e5b2.pth) | Note: - GN+WS requires about 5% more memory than GN, and it is only 5% slower than GN. - In the paper, a 20-23-24e lr schedule is used instead of 2x. - The X-50-GN and X-101-GN pretrained models are also shared by the authors. ================================================ FILE: mmdetection/configs/gn+ws/faster_rcnn_r50_fpn_gn_ws_1x.py ================================================ # model settings conv_cfg = dict(type='ConvWS') norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) model = dict( type='FasterRCNN', pretrained='open-mmlab://jhu/resnet50_gn_ws', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', conv_cfg=conv_cfg, norm_cfg=norm_cfg), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5, conv_cfg=conv_cfg, norm_cfg=norm_cfg), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='ConvFCBBoxHead', num_shared_convs=4, num_shared_fcs=1, in_channels=256, conv_out_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False, conv_cfg=conv_cfg, norm_cfg=norm_cfg)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_r50_fpn_gn_ws_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/gn+ws/mask_rcnn_r50_fpn_gn_ws_20_23_24e.py ================================================ # model settings conv_cfg = dict(type='ConvWS') norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) model = dict( type='MaskRCNN', pretrained='open-mmlab://jhu/resnet50_gn_ws', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', conv_cfg=conv_cfg, norm_cfg=norm_cfg), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5, conv_cfg=conv_cfg, norm_cfg=norm_cfg), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='ConvFCBBoxHead', num_shared_convs=4, num_shared_fcs=1, in_channels=256, conv_out_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False, conv_cfg=conv_cfg, norm_cfg=norm_cfg), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81, conv_cfg=conv_cfg, norm_cfg=norm_cfg)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[20, 23]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_r50_fpn_gn_ws_20_23_24e' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/gn+ws/mask_rcnn_r50_fpn_gn_ws_2x.py ================================================ # model settings conv_cfg = dict(type='ConvWS') norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) model = dict( type='MaskRCNN', pretrained='open-mmlab://jhu/resnet50_gn_ws', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', conv_cfg=conv_cfg, norm_cfg=norm_cfg), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5, conv_cfg=conv_cfg, norm_cfg=norm_cfg), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='ConvFCBBoxHead', num_shared_convs=4, num_shared_fcs=1, in_channels=256, conv_out_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False, conv_cfg=conv_cfg, norm_cfg=norm_cfg), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81, conv_cfg=conv_cfg, norm_cfg=norm_cfg)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 22]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_r50_fpn_gn_ws_2x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws_2x.py ================================================ # model settings conv_cfg = dict(type='ConvWS') norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) model = dict( type='MaskRCNN', pretrained='open-mmlab://jhu/resnext101_32x4d_gn_ws', backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', conv_cfg=conv_cfg, norm_cfg=norm_cfg), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5, conv_cfg=conv_cfg, norm_cfg=norm_cfg), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='ConvFCBBoxHead', num_shared_convs=4, num_shared_fcs=1, in_channels=256, conv_out_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False, conv_cfg=conv_cfg, norm_cfg=norm_cfg), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81, conv_cfg=conv_cfg, norm_cfg=norm_cfg)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 22]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_x101_32x4d_fpn_gn_ws_2x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/htc/README.md ================================================ # Hybrid Task Cascade for Instance Segmentation ## Introduction We provide config files to reproduce the results in the CVPR 2019 paper for [Hybrid Task Cascade](https://arxiv.org/abs/1901.07518). ``` @inproceedings{chen2019hybrid, title={Hybrid task cascade for instance segmentation}, author={Chen, Kai and Pang, Jiangmiao and Wang, Jiaqi and Xiong, Yu and Li, Xiaoxiao and Sun, Shuyang and Feng, Wansen and Liu, Ziwei and Shi, Jianping and Ouyang, Wanli and Chen Change Loy and Dahua Lin}, booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, year={2019} } ``` ## Dataset HTC requires COCO and COCO-stuff dataset for training. You need to download and extract it in the COCO dataset path. The directory should be like this. ``` mmdetection ├── mmdet ├── tools ├── configs ├── data │ ├── coco │ │ ├── annotations │ │ ├── train2017 │ │ ├── val2017 │ │ ├── test2017 | | ├── stuffthingmaps ``` ## Results and Models The results on COCO 2017val is shown in the below table. (results on test-dev are usually slightly higher than val) | Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | |:---------:|:-------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:-------:|:--------:| | R-50-FPN | pytorch | 1x | 7.4 | 0.936 | 4.1 | 42.1 | 37.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r50_fpn_1x_20190408-878c1712.pth) | | R-50-FPN | pytorch | 20e | - | - | - | 43.2 | 38.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r50_fpn_20e_20190408-c03b7015.pth) | | R-101-FPN | pytorch | 20e | 9.3 | 1.051 | 4.0 | 44.9 | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r101_fpn_20e_20190408-a2e586db.pth) | | X-101-32x4d-FPN | pytorch |20e| 5.8 | 0.769 | 3.8 | 46.1 | 40.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_x101_32x4d_fpn_20e_20190408-9eae4d0b.pth) | | X-101-64x4d-FPN | pytorch |20e| 7.5 | 1.120 | 3.5 | 46.9 | 40.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_x101_64x4d_fpn_20e_20190408-497f2561.pth) | - In the HTC paper and COCO 2018 Challenge, `score_thr` is set to 0.001 for both baselines and HTC. - We use 8 GPUs with 2 images/GPU for R-50 and R-101 models, and 16 GPUs with 1 image/GPU for X-101 models. If you would like to train X-101 HTC with 8 GPUs, you need to change the lr from 0.02 to 0.01. We also provide a powerful HTC with DCN and multi-scale training model. No testing augmentation is used. | Backbone | Style | DCN | training scales | Lr schd | box AP | mask AP | Download | |:----------------:|:-------:|:-----:|:---------------:|:-------:|:------:|:-------:|:--------:| | X-101-64x4d-FPN | pytorch | c3-c5 | 400~1400 | 20e | 50.7 | 43.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e_20190408-0e50669c.pth) | ================================================ FILE: mmdetection/configs/htc/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e.py ================================================ # model settings model = dict( type='HybridTaskCascade', num_stages=3, pretrained='open-mmlab://resnext101_64x4d', interleaved=True, mask_info_flow=True, backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch', dcn=dict( modulated=False, groups=64, deformable_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True)), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81), semantic_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[8]), semantic_head=dict( type='FusedSemanticHead', num_ins=5, fusion_level=1, num_convs=4, in_channels=256, conv_out_channels=256, num_classes=183, ignore_label=255, loss_weight=0.2)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.001, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=1, workers_per_gpu=1, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=[(1600, 400), (1600, 1400)], multiscale_mode='range', img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, seg_prefix=data_root + 'stuffthingmaps/train2017/', seg_scale_factor=1 / 8, with_mask=True, with_crowd=True, with_label=True, with_semantic_seg=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 19]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 20 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/htc/htc_r101_fpn_20e.py ================================================ # model settings model = dict( type='HybridTaskCascade', num_stages=3, pretrained='modelzoo://resnet101', interleaved=True, mask_info_flow=True, backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81), semantic_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[8]), semantic_head=dict( type='FusedSemanticHead', num_ins=5, fusion_level=1, num_convs=4, in_channels=256, conv_out_channels=256, num_classes=183, ignore_label=255, loss_weight=0.2)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.001, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, seg_prefix=data_root + 'stuffthingmaps/train2017/', seg_scale_factor=1 / 8, with_mask=True, with_crowd=True, with_label=True, with_semantic_seg=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 19]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 20 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/htc_r101_fpn_20e' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/htc/htc_r50_fpn_1x.py ================================================ # model settings model = dict( type='HybridTaskCascade', num_stages=3, pretrained='modelzoo://resnet50', interleaved=True, mask_info_flow=True, backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81), semantic_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[8]), semantic_head=dict( type='FusedSemanticHead', num_ins=5, fusion_level=1, num_convs=4, in_channels=256, conv_out_channels=256, num_classes=183, ignore_label=255, loss_weight=0.2)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.001, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, seg_prefix=data_root + 'stuffthingmaps/train2017/', seg_scale_factor=1 / 8, with_mask=True, with_crowd=True, with_label=True, with_semantic_seg=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/htc_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/htc/htc_r50_fpn_20e.py ================================================ # model settings model = dict( type='HybridTaskCascade', num_stages=3, pretrained='modelzoo://resnet50', interleaved=True, mask_info_flow=True, backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81), semantic_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[8]), semantic_head=dict( type='FusedSemanticHead', num_ins=5, fusion_level=1, num_convs=4, in_channels=256, conv_out_channels=256, num_classes=183, ignore_label=255, loss_weight=0.2)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.001, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, seg_prefix=data_root + 'stuffthingmaps/train2017/', seg_scale_factor=1 / 8, with_mask=True, with_crowd=True, with_label=True, with_semantic_seg=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 19]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 20 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/htc_r50_fpn_20e' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/htc/htc_without_semantic_r50_fpn_1x.py ================================================ # model settings model = dict( type='HybridTaskCascade', num_stages=3, pretrained='modelzoo://resnet50', interleaved=True, mask_info_flow=True, backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.001, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/htc_without_semantic_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/htc/htc_x101_32x4d_fpn_20e_16gpu.py ================================================ # model settings model = dict( type='HybridTaskCascade', num_stages=3, pretrained='open-mmlab://resnext101_32x4d', interleaved=True, mask_info_flow=True, backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81), semantic_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[8]), semantic_head=dict( type='FusedSemanticHead', num_ins=5, fusion_level=1, num_convs=4, in_channels=256, conv_out_channels=256, num_classes=183, ignore_label=255, loss_weight=0.2)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.001, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=1, workers_per_gpu=1, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, seg_prefix=data_root + 'stuffthingmaps/train2017/', seg_scale_factor=1 / 8, with_mask=True, with_crowd=True, with_label=True, with_semantic_seg=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 19]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 20 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/htc_x101_32x4d_fpn_20e' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/htc/htc_x101_64x4d_fpn_20e_16gpu.py ================================================ # model settings model = dict( type='HybridTaskCascade', num_stages=3, pretrained='open-mmlab://resnext101_64x4d', interleaved=True, mask_info_flow=True, backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81), semantic_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[8]), semantic_head=dict( type='FusedSemanticHead', num_ins=5, fusion_level=1, num_convs=4, in_channels=256, conv_out_channels=256, num_classes=183, ignore_label=255, loss_weight=0.2)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.001, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5), keep_all_stages=False) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=1, workers_per_gpu=1, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, seg_prefix=data_root + 'stuffthingmaps/train2017/', seg_scale_factor=1 / 8, with_mask=True, with_crowd=True, with_label=True, with_semantic_seg=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 19]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 20 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/htc_x101_64x4d_fpn_20e' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/mask_rcnn_r101_fpn_1x.py ================================================ # model settings model = dict( type='MaskRCNN', pretrained='modelzoo://resnet101', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_r101_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/mask_rcnn_r50_caffe_c4_1x.py ================================================ # model settings norm_cfg = dict(type='BN', requires_grad=False) model = dict( type='MaskRCNN', pretrained='open-mmlab://resnet50_caffe', backbone=dict( type='ResNet', depth=50, num_stages=3, strides=(1, 2, 2), dilations=(1, 1, 1), out_indices=(2, ), frozen_stages=1, norm_cfg=norm_cfg, norm_eval=True, style='caffe'), shared_head=dict( type='ResLayer', depth=50, stage=3, stride=2, dilation=1, style='caffe', norm_cfg=norm_cfg, norm_eval=True), rpn_head=dict( type='RPNHead', in_channels=1024, feat_channels=1024, anchor_scales=[2, 4, 8, 16, 32], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[16], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=1024, featmap_strides=[16]), bbox_head=dict( type='BBoxHead', with_avg_pool=True, roi_feat_size=7, in_channels=2048, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False), mask_roi_extractor=None, mask_head=dict( type='FCNMaskHead', num_convs=0, in_channels=2048, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=12000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=14, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=6000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) data = dict( imgs_per_gpu=1, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_r50_caffe_c4_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/mask_rcnn_r50_fpn_1x.py ================================================ # model settings model = dict( type='MaskRCNN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/mask_rcnn_x101_32x4d_fpn_1x.py ================================================ # model settings model = dict( type='MaskRCNN', pretrained='open-mmlab://resnext101_32x4d', backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/mask_rcnn_x101_64x4d_fpn_1x.py ================================================ # model settings model = dict( type='MaskRCNN', pretrained='open-mmlab://resnext101_64x4d', backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=81)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100, mask_thr_binary=0.5)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=True, with_crowd=True, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/mask_rcnn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py ================================================ # model settings model = dict( type='FasterRCNN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=21, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) ) # dataset settings dataset_type = 'VOCDataset' data_root = 'data/VOCdevkit/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type='RepeatDataset', # to avoid reloading datasets frequently times=3, dataset=dict( type=dataset_type, ann_file=[ data_root + 'VOC2007/ImageSets/Main/trainval.txt', data_root + 'VOC2012/ImageSets/Main/trainval.txt' ], img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'], img_scale=(1000, 600), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=True, with_label=True)), val=dict( type=dataset_type, ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', img_prefix=data_root + 'VOC2007/', img_scale=(1000, 600), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=True, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', img_prefix=data_root + 'VOC2007/', img_scale=(1000, 600), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict(policy='step', step=[3]) # actual epoch = 3 * 3 = 9 checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 4 # actual epoch = 4 * 3 = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/faster_rcnn_r50_fpn_1x_voc0712' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/pascal_voc/ssd300_voc.py ================================================ # model settings input_size = 300 model = dict( type='SingleStageDetector', pretrained='open-mmlab://vgg16_caffe', backbone=dict( type='SSDVGG', input_size=input_size, depth=16, with_last_pool=False, ceil_mode=True, out_indices=(3, 4), out_feature_indices=(22, 34), l2_norm_scale=20), neck=None, bbox_head=dict( type='SSDHead', input_size=input_size, in_channels=(512, 1024, 512, 256, 256, 256), num_classes=21, anchor_strides=(8, 16, 32, 64, 100, 300), basesize_ratio_range=(0.2, 0.9), anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]), target_means=(.0, .0, .0, .0), target_stds=(0.1, 0.1, 0.2, 0.2))) cudnn_benchmark = True train_cfg = dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0., ignore_iof_thr=-1, gt_max_assign_all=False), smoothl1_beta=1., allowed_border=-1, pos_weight=-1, neg_pos_ratio=3, debug=False) test_cfg = dict( nms=dict(type='nms', iou_thr=0.45), min_bbox_size=0, score_thr=0.02, max_per_img=200) # model training and testing settings # dataset settings dataset_type = 'VOCDataset' data_root = 'data/VOCdevkit/' img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) data = dict( imgs_per_gpu=4, workers_per_gpu=2, train=dict( type='RepeatDataset', times=10, dataset=dict( type=dataset_type, ann_file=[ data_root + 'VOC2007/ImageSets/Main/trainval.txt', data_root + 'VOC2012/ImageSets/Main/trainval.txt' ], img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'], img_scale=(300, 300), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=True, test_mode=False, extra_aug=dict( photo_metric_distortion=dict( brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18), expand=dict( mean=img_norm_cfg['mean'], to_rgb=img_norm_cfg['to_rgb'], ratio_range=(1, 4)), random_crop=dict( min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)), resize_keep_ratio=False)), val=dict( type=dataset_type, ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', img_prefix=data_root + 'VOC2007/', img_scale=(300, 300), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0, with_mask=False, with_label=False, test_mode=True, resize_keep_ratio=False), test=dict( type=dataset_type, ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', img_prefix=data_root + 'VOC2007/', img_scale=(300, 300), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0, with_mask=False, with_label=False, test_mode=True, resize_keep_ratio=False)) # optimizer optimizer = dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4) optimizer_config = dict() # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 20]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/ssd300_voc' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/pascal_voc/ssd512_voc.py ================================================ # model settings input_size = 512 model = dict( type='SingleStageDetector', pretrained='open-mmlab://vgg16_caffe', backbone=dict( type='SSDVGG', input_size=input_size, depth=16, with_last_pool=False, ceil_mode=True, out_indices=(3, 4), out_feature_indices=(22, 34), l2_norm_scale=20), neck=None, bbox_head=dict( type='SSDHead', input_size=input_size, in_channels=(512, 1024, 512, 256, 256, 256, 256), num_classes=21, anchor_strides=(8, 16, 32, 64, 128, 256, 512), basesize_ratio_range=(0.15, 0.9), anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]), target_means=(.0, .0, .0, .0), target_stds=(0.1, 0.1, 0.2, 0.2))) cudnn_benchmark = True train_cfg = dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0., ignore_iof_thr=-1, gt_max_assign_all=False), smoothl1_beta=1., allowed_border=-1, pos_weight=-1, neg_pos_ratio=3, debug=False) test_cfg = dict( nms=dict(type='nms', iou_thr=0.45), min_bbox_size=0, score_thr=0.02, max_per_img=200) # model training and testing settings # dataset settings dataset_type = 'VOCDataset' data_root = 'data/VOCdevkit/' img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) data = dict( imgs_per_gpu=4, workers_per_gpu=2, train=dict( type='RepeatDataset', times=10, dataset=dict( type=dataset_type, ann_file=[ data_root + 'VOC2007/ImageSets/Main/trainval.txt', data_root + 'VOC2012/ImageSets/Main/trainval.txt' ], img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'], img_scale=(512, 512), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=True, test_mode=False, extra_aug=dict( photo_metric_distortion=dict( brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18), expand=dict( mean=img_norm_cfg['mean'], to_rgb=img_norm_cfg['to_rgb'], ratio_range=(1, 4)), random_crop=dict( min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)), resize_keep_ratio=False)), val=dict( type=dataset_type, ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', img_prefix=data_root + 'VOC2007/', img_scale=(512, 512), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0, with_mask=False, with_label=False, test_mode=True, resize_keep_ratio=False), test=dict( type=dataset_type, ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', img_prefix=data_root + 'VOC2007/', img_scale=(512, 512), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0, with_mask=False, with_label=False, test_mode=True, resize_keep_ratio=False)) # optimizer optimizer = dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4) optimizer_config = dict() # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 20]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/ssd512_voc' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/retinanet_r101_fpn_1x.py ================================================ # model settings model = dict( type='RetinaNet', pretrained='modelzoo://resnet101', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs=True, num_outs=5), bbox_head=dict( type='RetinaHead', num_classes=81, in_channels=256, stacked_convs=4, feat_channels=256, octave_base_scale=4, scales_per_octave=3, anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[8, 16, 32, 64, 128], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0])) # training and testing settings train_cfg = dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.4, min_pos_iou=0, ignore_iof_thr=-1), smoothl1_beta=0.11, gamma=2.0, alpha=0.25, allowed_border=-1, pos_weight=-1, debug=False) test_cfg = dict( nms_pre=1000, min_bbox_size=0, score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 device_ids = range(8) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/retinanet_r101_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/retinanet_r50_fpn_1x.py ================================================ # model settings model = dict( type='RetinaNet', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs=True, num_outs=5), bbox_head=dict( type='RetinaHead', num_classes=81, in_channels=256, stacked_convs=4, feat_channels=256, octave_base_scale=4, scales_per_octave=3, anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[8, 16, 32, 64, 128], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0])) # training and testing settings train_cfg = dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.4, min_pos_iou=0, ignore_iof_thr=-1), smoothl1_beta=0.11, gamma=2.0, alpha=0.25, allowed_border=-1, pos_weight=-1, debug=False) test_cfg = dict( nms_pre=1000, min_bbox_size=0, score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 device_ids = range(8) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/retinanet_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/retinanet_x101_32x4d_fpn_1x.py ================================================ # model settings model = dict( type='RetinaNet', pretrained='open-mmlab://resnext101_32x4d', backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs=True, num_outs=5), bbox_head=dict( type='RetinaHead', num_classes=81, in_channels=256, stacked_convs=4, feat_channels=256, octave_base_scale=4, scales_per_octave=3, anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[8, 16, 32, 64, 128], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0])) # training and testing settings train_cfg = dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.4, min_pos_iou=0, ignore_iof_thr=-1), smoothl1_beta=0.11, gamma=2.0, alpha=0.25, allowed_border=-1, pos_weight=-1, debug=False) test_cfg = dict( nms_pre=1000, min_bbox_size=0, score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 device_ids = range(8) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/retinanet_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/retinanet_x101_64x4d_fpn_1x.py ================================================ # model settings model = dict( type='RetinaNet', pretrained='open-mmlab://resnext101_64x4d', backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs=True, num_outs=5), bbox_head=dict( type='RetinaHead', num_classes=81, in_channels=256, stacked_convs=4, feat_channels=256, octave_base_scale=4, scales_per_octave=3, anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[8, 16, 32, 64, 128], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0])) # training and testing settings train_cfg = dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.4, min_pos_iou=0, ignore_iof_thr=-1), smoothl1_beta=0.11, gamma=2.0, alpha=0.25, allowed_border=-1, pos_weight=-1, debug=False) test_cfg = dict( nms_pre=1000, min_bbox_size=0, score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=True), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=True), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 device_ids = range(8) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/retinanet_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/rpn_r101_fpn_1x.py ================================================ # model settings model = dict( type='RPN', pretrained='modelzoo://resnet101', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=False), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) # runner configs optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/rpn_r101_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/rpn_r50_caffe_c4_1x.py ================================================ # model settings model = dict( type='RPN', pretrained='open-mmlab://resnet50_caffe', backbone=dict( type='ResNet', depth=50, num_stages=3, strides=(1, 2, 2), dilations=(1, 1, 1), out_indices=(2, ), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True, style='caffe'), neck=None, rpn_head=dict( type='RPNHead', in_channels=1024, feat_channels=1024, anchor_scales=[2, 4, 8, 16, 32], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[16], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=12000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=False), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) # runner configs optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/rpn_r50_caffe_c4_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/rpn_r50_fpn_1x.py ================================================ # model settings model = dict( type='RPN', pretrained='modelzoo://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=False), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) # runner configs optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/rpn_r50_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/rpn_x101_32x4d_fpn_1x.py ================================================ # model settings model = dict( type='RPN', pretrained='open-mmlab://resnext101_32x4d', backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=False), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) # runner configs optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/rpn_r101_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/rpn_x101_64x4d_fpn_1x.py ================================================ # model settings model = dict( type='RPN', pretrained='open-mmlab://resnext101_64x4d', backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], use_sigmoid_cls=True)) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, smoothl1_beta=1 / 9.0, debug=False)) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0)) # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) data = dict( imgs_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=False), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_crowd=False, with_label=False), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(1333, 800), img_norm_cfg=img_norm_cfg, size_divisor=32, flip_ratio=0, with_mask=False, with_label=False, test_mode=True)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) # runner configs optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/rpn_r101_fpn_1x' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/ssd300_coco.py ================================================ # model settings input_size = 300 model = dict( type='SingleStageDetector', pretrained='open-mmlab://vgg16_caffe', backbone=dict( type='SSDVGG', input_size=input_size, depth=16, with_last_pool=False, ceil_mode=True, out_indices=(3, 4), out_feature_indices=(22, 34), l2_norm_scale=20), neck=None, bbox_head=dict( type='SSDHead', input_size=input_size, in_channels=(512, 1024, 512, 256, 256, 256), num_classes=81, anchor_strides=(8, 16, 32, 64, 100, 300), basesize_ratio_range=(0.15, 0.9), anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]), target_means=(.0, .0, .0, .0), target_stds=(0.1, 0.1, 0.2, 0.2))) cudnn_benchmark = True train_cfg = dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0., ignore_iof_thr=-1, gt_max_assign_all=False), smoothl1_beta=1., allowed_border=-1, pos_weight=-1, neg_pos_ratio=3, debug=False) test_cfg = dict( nms=dict(type='nms', iou_thr=0.45), min_bbox_size=0, score_thr=0.02, max_per_img=200) # model training and testing settings # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) data = dict( imgs_per_gpu=8, workers_per_gpu=3, train=dict( type='RepeatDataset', times=5, dataset=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(300, 300), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=True, test_mode=False, extra_aug=dict( photo_metric_distortion=dict( brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18), expand=dict( mean=img_norm_cfg['mean'], to_rgb=img_norm_cfg['to_rgb'], ratio_range=(1, 4)), random_crop=dict( min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)), resize_keep_ratio=False)), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(300, 300), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0, with_mask=False, with_label=False, test_mode=True, resize_keep_ratio=False), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(300, 300), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0, with_mask=False, with_label=False, test_mode=True, resize_keep_ratio=False)) # optimizer optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4) optimizer_config = dict() # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 22]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/ssd300_coco' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/configs/ssd512_coco.py ================================================ # model settings input_size = 512 model = dict( type='SingleStageDetector', pretrained='open-mmlab://vgg16_caffe', backbone=dict( type='SSDVGG', input_size=input_size, depth=16, with_last_pool=False, ceil_mode=True, out_indices=(3, 4), out_feature_indices=(22, 34), l2_norm_scale=20), neck=None, bbox_head=dict( type='SSDHead', input_size=input_size, in_channels=(512, 1024, 512, 256, 256, 256, 256), num_classes=81, anchor_strides=(8, 16, 32, 64, 128, 256, 512), basesize_ratio_range=(0.1, 0.9), anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]), target_means=(.0, .0, .0, .0), target_stds=(0.1, 0.1, 0.2, 0.2))) cudnn_benchmark = True train_cfg = dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0., ignore_iof_thr=-1, gt_max_assign_all=False), smoothl1_beta=1., allowed_border=-1, pos_weight=-1, neg_pos_ratio=3, debug=False) test_cfg = dict( nms=dict(type='nms', iou_thr=0.45), min_bbox_size=0, score_thr=0.02, max_per_img=200) # model training and testing settings # dataset settings dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) data = dict( imgs_per_gpu=8, workers_per_gpu=3, train=dict( type='RepeatDataset', times=5, dataset=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', img_scale=(512, 512), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0.5, with_mask=False, with_crowd=False, with_label=True, test_mode=False, extra_aug=dict( photo_metric_distortion=dict( brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18), expand=dict( mean=img_norm_cfg['mean'], to_rgb=img_norm_cfg['to_rgb'], ratio_range=(1, 4)), random_crop=dict( min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)), resize_keep_ratio=False)), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(512, 512), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0, with_mask=False, with_label=False, test_mode=True, resize_keep_ratio=False), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', img_scale=(512, 512), img_norm_cfg=img_norm_cfg, size_divisor=None, flip_ratio=0, with_mask=False, with_label=False, test_mode=True, resize_keep_ratio=False)) # optimizer optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4) optimizer_config = dict() # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[16, 22]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 24 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/ssd512_coco' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: mmdetection/mmdet/__init__.py ================================================ from .version import __version__, short_version __all__ = ['__version__', 'short_version'] ================================================ FILE: mmdetection/mmdet/apis/__init__.py ================================================ from .env import init_dist, get_root_logger, set_random_seed from .train import train_detector from .inference import init_detector, inference_detector, show_result __all__ = [ 'init_dist', 'get_root_logger', 'set_random_seed', 'train_detector', 'init_detector', 'inference_detector', 'show_result' ] ================================================ FILE: mmdetection/mmdet/apis/env.py ================================================ import logging import os import random import subprocess import numpy as np import torch import torch.distributed as dist import torch.multiprocessing as mp from mmcv.runner import get_dist_info def init_dist(launcher, backend='nccl', **kwargs): if mp.get_start_method(allow_none=True) is None: mp.set_start_method('spawn') if launcher == 'pytorch': _init_dist_pytorch(backend, **kwargs) elif launcher == 'mpi': _init_dist_mpi(backend, **kwargs) elif launcher == 'slurm': _init_dist_slurm(backend, **kwargs) else: raise ValueError('Invalid launcher type: {}'.format(launcher)) def _init_dist_pytorch(backend, **kwargs): # TODO: use local_rank instead of rank % num_gpus rank = int(os.environ['RANK']) num_gpus = torch.cuda.device_count() torch.cuda.set_device(rank % num_gpus) dist.init_process_group(backend=backend, **kwargs) def _init_dist_mpi(backend, **kwargs): raise NotImplementedError def _init_dist_slurm(backend, port=29500, **kwargs): proc_id = int(os.environ['SLURM_PROCID']) ntasks = int(os.environ['SLURM_NTASKS']) node_list = os.environ['SLURM_NODELIST'] num_gpus = torch.cuda.device_count() torch.cuda.set_device(proc_id % num_gpus) addr = subprocess.getoutput( 'scontrol show hostname {} | head -n1'.format(node_list)) os.environ['MASTER_PORT'] = str(port) os.environ['MASTER_ADDR'] = addr os.environ['WORLD_SIZE'] = str(ntasks) os.environ['RANK'] = str(proc_id) dist.init_process_group(backend=backend) def set_random_seed(seed): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) def get_root_logger(log_level=logging.INFO): logger = logging.getLogger() if not logger.hasHandlers(): logging.basicConfig( format='%(asctime)s - %(levelname)s - %(message)s', level=log_level) rank, _ = get_dist_info() if rank != 0: logger.setLevel('ERROR') return logger ================================================ FILE: mmdetection/mmdet/apis/inference.py ================================================ import warnings import mmcv import numpy as np import pycocotools.mask as maskUtils import torch from mmcv.runner import load_checkpoint from mmdet.core import get_classes from mmdet.datasets import to_tensor from mmdet.datasets.transforms import ImageTransform from mmdet.models import build_detector def init_detector(config, checkpoint=None, device='cuda:0'): """Initialize a detector from config file. Args: config (str or :obj:`mmcv.Config`): Config file path or the config object. checkpoint (str, optional): Checkpoint path. If left as None, the model will not load any weights. Returns: nn.Module: The constructed detector. """ if isinstance(config, str): config = mmcv.Config.fromfile(config) elif not isinstance(config, mmcv.Config): raise TypeError('config must be a filename or Config object, ' 'but got {}'.format(type(config))) config.model.pretrained = None model = build_detector(config.model, test_cfg=config.test_cfg) if checkpoint is not None: checkpoint = load_checkpoint(model, checkpoint) if 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['classes'] else: warnings.warn('Class names are not saved in the checkpoint\'s ' 'meta data, use COCO classes by default.') model.CLASSES = get_classes('coco') model.cfg = config # save the config in the model for convenience model.to(device) model.eval() return model def inference_detector(model, imgs): """Inference image(s) with the detector. Args: model (nn.Module): The loaded detector. imgs (str/ndarray or list[str/ndarray]): Either image files or loaded images. Returns: If imgs is a str, a generator will be returned, otherwise return the detection results directly. """ cfg = model.cfg img_transform = ImageTransform( size_divisor=cfg.data.test.size_divisor, **cfg.img_norm_cfg) device = next(model.parameters()).device # model device if not isinstance(imgs, list): return _inference_single(model, imgs, img_transform, device) else: return _inference_generator(model, imgs, img_transform, device) def _prepare_data(img, img_transform, cfg, device): ori_shape = img.shape img, img_shape, pad_shape, scale_factor = img_transform( img, scale=cfg.data.test.img_scale, keep_ratio=cfg.data.test.get('resize_keep_ratio', True)) img = to_tensor(img).to(device).unsqueeze(0) img_meta = [ dict( ori_shape=ori_shape, img_shape=img_shape, pad_shape=pad_shape, scale_factor=scale_factor, flip=False) ] return dict(img=[img], img_meta=[img_meta]) def _inference_single(model, img, img_transform, device): img = mmcv.imread(img) data = _prepare_data(img, img_transform, model.cfg, device) with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) return result def _inference_generator(model, imgs, img_transform, device): for img in imgs: yield _inference_single(model, img, img_transform, device) # TODO: merge this method with the one in BaseDetector def show_result(img, result, class_names, score_thr=0.3, out_file=None): """Visualize the detection results on the image. Args: img (str or np.ndarray): Image filename or loaded image. result (tuple[list] or list): The detection result, can be either (bbox, segm) or just bbox. class_names (list[str] or tuple[str]): A list of class names. score_thr (float): The threshold to visualize the bboxes and masks. out_file (str, optional): If specified, the visualization result will be written to the out file instead of shown in a window. """ assert isinstance(class_names, (tuple, list)) img = mmcv.imread(img) if isinstance(result, tuple): bbox_result, segm_result = result else: bbox_result, segm_result = result, None bboxes = np.vstack(bbox_result) # draw segmentation masks if segm_result is not None: segms = mmcv.concat_list(segm_result) inds = np.where(bboxes[:, -1] > score_thr)[0] for i in inds: color_mask = np.random.randint( 0, 256, (1, 3), dtype=np.uint8) mask = maskUtils.decode(segms[i]).astype(np.bool) img[mask] = img[mask] * 0.5 + color_mask * 0.5 # draw bounding boxes labels = [ np.full(bbox.shape[0], i, dtype=np.int32) for i, bbox in enumerate(bbox_result) ] labels = np.concatenate(labels) mmcv.imshow_det_bboxes( img.copy(), bboxes, labels, class_names=class_names, score_thr=score_thr, show=out_file is None, out_file=out_file) ================================================ FILE: mmdetection/mmdet/apis/train.py ================================================ from __future__ import division import re from collections import OrderedDict import torch from mmcv.runner import Runner, DistSamplerSeedHook, obj_from_dict from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmdet import datasets from mmdet.core import (DistOptimizerHook, DistEvalmAPHook, CocoDistEvalRecallHook, CocoDistEvalmAPHook) from mmdet.datasets import build_dataloader from mmdet.models import RPN from .env import get_root_logger def parse_losses(losses): log_vars = OrderedDict() for loss_name, loss_value in losses.items(): if isinstance(loss_value, torch.Tensor): log_vars[loss_name] = loss_value.mean() elif isinstance(loss_value, list): log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) else: raise TypeError( '{} is not a tensor or list of tensors'.format(loss_name)) loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key) log_vars['loss'] = loss for name in log_vars: log_vars[name] = log_vars[name].item() return loss, log_vars def batch_processor(model, data, train_mode): losses = model(**data) loss, log_vars = parse_losses(losses) outputs = dict( loss=loss, log_vars=log_vars, num_samples=len(data['img'].data)) return outputs def train_detector(model, dataset, cfg, distributed=False, validate=False, logger=None): if logger is None: logger = get_root_logger(cfg.log_level) # start training if distributed: _dist_train(model, dataset, cfg, validate=validate) else: _non_dist_train(model, dataset, cfg, validate=validate) def build_optimizer(model, optimizer_cfg): """Build optimizer from configs. Args: model (:obj:`nn.Module`): The model with parameters to be optimized. optimizer_cfg (dict): The config dict of the optimizer. Positional fields are: - type: class name of the optimizer. - lr: base learning rate. Optional fields are: - any arguments of the corresponding optimizer type, e.g., weight_decay, momentum, etc. - paramwise_options: a dict with 3 accepted fileds (bias_lr_mult, bias_decay_mult, norm_decay_mult). `bias_lr_mult` and `bias_decay_mult` will be multiplied to the lr and weight decay respectively for all bias parameters (except for the normalization layers), and `norm_decay_mult` will be multiplied to the weight decay for all weight and bias parameters of normalization layers. Returns: torch.optim.Optimizer: The initialized optimizer. """ if hasattr(model, 'module'): model = model.module optimizer_cfg = optimizer_cfg.copy() paramwise_options = optimizer_cfg.pop('paramwise_options', None) # if no paramwise option is specified, just use the global setting if paramwise_options is None: return obj_from_dict(optimizer_cfg, torch.optim, dict(params=model.parameters())) else: assert isinstance(paramwise_options, dict) # get base lr and weight decay base_lr = optimizer_cfg['lr'] base_wd = optimizer_cfg.get('weight_decay', None) # weight_decay must be explicitly specified if mult is specified if ('bias_decay_mult' in paramwise_options or 'norm_decay_mult' in paramwise_options): assert base_wd is not None # get param-wise options bias_lr_mult = paramwise_options.get('bias_lr_mult', 1.) bias_decay_mult = paramwise_options.get('bias_decay_mult', 1.) norm_decay_mult = paramwise_options.get('norm_decay_mult', 1.) # set param-wise lr and weight decay params = [] for name, param in model.named_parameters(): if not param.requires_grad: continue param_group = {'params': [param]} # for norm layers, overwrite the weight decay of weight and bias # TODO: obtain the norm layer prefixes dynamically if re.search(r'(bn|gn)(\d+)?.(weight|bias)', name): if base_wd is not None: param_group['weight_decay'] = base_wd * norm_decay_mult # for other layers, overwrite both lr and weight decay of bias elif name.endswith('.bias'): param_group['lr'] = base_lr * bias_lr_mult if base_wd is not None: param_group['weight_decay'] = base_wd * bias_decay_mult # otherwise use the global settings params.append(param_group) optimizer_cls = getattr(torch.optim, optimizer_cfg.pop('type')) return optimizer_cls(params, **optimizer_cfg) def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ build_dataloader( dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True) ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook(CocoDistEvalRecallHook(val_dataset_cfg)) else: dataset_type = getattr(datasets, val_dataset_cfg.type) if issubclass(dataset_type, datasets.CocoDataset): runner.register_hook(CocoDistEvalmAPHook(val_dataset_cfg)) else: runner.register_hook(DistEvalmAPHook(val_dataset_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs) def _non_dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ build_dataloader( dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs) ================================================ FILE: mmdetection/mmdet/core/__init__.py ================================================ from .anchor import * # noqa: F401, F403 from .bbox import * # noqa: F401, F403 from .mask import * # noqa: F401, F403 from .loss import * # noqa: F401, F403 from .evaluation import * # noqa: F401, F403 from .post_processing import * # noqa: F401, F403 from .utils import * # noqa: F401, F403 ================================================ FILE: mmdetection/mmdet/core/anchor/__init__.py ================================================ from .anchor_generator import AnchorGenerator from .anchor_target import anchor_target __all__ = ['AnchorGenerator', 'anchor_target'] ================================================ FILE: mmdetection/mmdet/core/anchor/anchor_generator.py ================================================ import torch class AnchorGenerator(object): def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None): self.base_size = base_size self.scales = torch.Tensor(scales) self.ratios = torch.Tensor(ratios) self.scale_major = scale_major self.ctr = ctr self.base_anchors = self.gen_base_anchors() @property def num_base_anchors(self): return self.base_anchors.size(0) def gen_base_anchors(self): w = self.base_size h = self.base_size if self.ctr is None: x_ctr = 0.5 * (w - 1) y_ctr = 0.5 * (h - 1) else: x_ctr, y_ctr = self.ctr h_ratios = torch.sqrt(self.ratios) w_ratios = 1 / h_ratios if self.scale_major: ws = (w * w_ratios[:, None] * self.scales[None, :]).view(-1) hs = (h * h_ratios[:, None] * self.scales[None, :]).view(-1) else: ws = (w * self.scales[:, None] * w_ratios[None, :]).view(-1) hs = (h * self.scales[:, None] * h_ratios[None, :]).view(-1) base_anchors = torch.stack( [ x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1) ], dim=-1).round() return base_anchors def _meshgrid(self, x, y, row_major=True): xx = x.repeat(len(y)) yy = y.view(-1, 1).repeat(1, len(x)).view(-1) if row_major: return xx, yy else: return yy, xx def grid_anchors(self, featmap_size, stride=16, device='cuda'): base_anchors = self.base_anchors.to(device) feat_h, feat_w = featmap_size shift_x = torch.arange(0, feat_w, device=device) * stride shift_y = torch.arange(0, feat_h, device=device) * stride shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) shifts = shifts.type_as(base_anchors) # first feat_w elements correspond to the first row of shifts # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get # shifted anchors (K, A, 4), reshape to (K*A, 4) all_anchors = base_anchors[None, :, :] + shifts[:, None, :] all_anchors = all_anchors.view(-1, 4) # first A rows correspond to A anchors of (0, 0) in feature map, # then (0, 1), (0, 2), ... return all_anchors def valid_flags(self, featmap_size, valid_size, device='cuda'): feat_h, feat_w = featmap_size valid_h, valid_w = valid_size assert valid_h <= feat_h and valid_w <= feat_w valid_x = torch.zeros(feat_w, dtype=torch.uint8, device=device) valid_y = torch.zeros(feat_h, dtype=torch.uint8, device=device) valid_x[:valid_w] = 1 valid_y[:valid_h] = 1 valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) valid = valid_xx & valid_yy valid = valid[:, None].expand( valid.size(0), self.num_base_anchors).contiguous().view(-1) return valid ================================================ FILE: mmdetection/mmdet/core/anchor/anchor_target.py ================================================ import torch from ..bbox import assign_and_sample, build_assigner, PseudoSampler, bbox2delta from ..utils import multi_apply def anchor_target(anchor_list, valid_flag_list, gt_bboxes_list, img_metas, target_means, target_stds, cfg, gt_bboxes_ignore_list=None, gt_labels_list=None, label_channels=1, sampling=True, unmap_outputs=True): """Compute regression and classification targets for anchors. Args: anchor_list (list[list]): Multi level anchors of each image. valid_flag_list (list[list]): Multi level valid flags of each image. gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image. img_metas (list[dict]): Meta info of each image. target_means (Iterable): Mean value of regression targets. target_stds (Iterable): Std value of regression targets. cfg (dict): RPN train configs. Returns: tuple """ num_imgs = len(img_metas) assert len(anchor_list) == len(valid_flag_list) == num_imgs # anchor number of multi levels num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] # concat all level anchors and flags to a single tensor for i in range(num_imgs): assert len(anchor_list[i]) == len(valid_flag_list[i]) anchor_list[i] = torch.cat(anchor_list[i]) valid_flag_list[i] = torch.cat(valid_flag_list[i]) # compute targets for each image if gt_bboxes_ignore_list is None: gt_bboxes_ignore_list = [None for _ in range(num_imgs)] if gt_labels_list is None: gt_labels_list = [None for _ in range(num_imgs)] (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, pos_inds_list, neg_inds_list) = multi_apply( anchor_target_single, anchor_list, valid_flag_list, gt_bboxes_list, gt_bboxes_ignore_list, gt_labels_list, img_metas, target_means=target_means, target_stds=target_stds, cfg=cfg, label_channels=label_channels, sampling=sampling, unmap_outputs=unmap_outputs) # no valid anchors if any([labels is None for labels in all_labels]): return None # sampled anchors of all images num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) # split targets to a list w.r.t. multiple levels labels_list = images_to_levels(all_labels, num_level_anchors) label_weights_list = images_to_levels(all_label_weights, num_level_anchors) bbox_targets_list = images_to_levels(all_bbox_targets, num_level_anchors) bbox_weights_list = images_to_levels(all_bbox_weights, num_level_anchors) return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) def images_to_levels(target, num_level_anchors): """Convert targets by image to targets by feature level. [target_img0, target_img1] -> [target_level0, target_level1, ...] """ target = torch.stack(target, 0) level_targets = [] start = 0 for n in num_level_anchors: end = start + n level_targets.append(target[:, start:end].squeeze(0)) start = end return level_targets def anchor_target_single(flat_anchors, valid_flags, gt_bboxes, gt_bboxes_ignore, gt_labels, img_meta, target_means, target_stds, cfg, label_channels=1, sampling=True, unmap_outputs=True): inside_flags = anchor_inside_flags(flat_anchors, valid_flags, img_meta['img_shape'][:2], cfg.allowed_border) if not inside_flags.any(): return (None, ) * 6 # assign gt and sample anchors anchors = flat_anchors[inside_flags, :] if sampling: assign_result, sampling_result = assign_and_sample( anchors, gt_bboxes, gt_bboxes_ignore, None, cfg) else: bbox_assigner = build_assigner(cfg.assigner) assign_result = bbox_assigner.assign(anchors, gt_bboxes, gt_bboxes_ignore, gt_labels) bbox_sampler = PseudoSampler() sampling_result = bbox_sampler.sample(assign_result, anchors, gt_bboxes) num_valid_anchors = anchors.shape[0] bbox_targets = torch.zeros_like(anchors) bbox_weights = torch.zeros_like(anchors) labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long) label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds if len(pos_inds) > 0: pos_bbox_targets = bbox2delta(sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes, target_means, target_stds) bbox_targets[pos_inds, :] = pos_bbox_targets bbox_weights[pos_inds, :] = 1.0 if gt_labels is None: labels[pos_inds] = 1 else: labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] if cfg.pos_weight <= 0: label_weights[pos_inds] = 1.0 else: label_weights[pos_inds] = cfg.pos_weight if len(neg_inds) > 0: label_weights[neg_inds] = 1.0 # map up to original set of anchors if unmap_outputs: num_total_anchors = flat_anchors.size(0) labels = unmap(labels, num_total_anchors, inside_flags) label_weights = unmap(label_weights, num_total_anchors, inside_flags) bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds) def anchor_inside_flags(flat_anchors, valid_flags, img_shape, allowed_border=0): img_h, img_w = img_shape[:2] if allowed_border >= 0: inside_flags = valid_flags & \ (flat_anchors[:, 0] >= -allowed_border) & \ (flat_anchors[:, 1] >= -allowed_border) & \ (flat_anchors[:, 2] < img_w + allowed_border) & \ (flat_anchors[:, 3] < img_h + allowed_border) else: inside_flags = valid_flags return inside_flags def unmap(data, count, inds, fill=0): """ Unmap a subset of item (data) back to the original set of items (of size count) """ if data.dim() == 1: ret = data.new_full((count, ), fill) ret[inds] = data else: new_size = (count, ) + data.size()[1:] ret = data.new_full(new_size, fill) ret[inds, :] = data return ret ================================================ FILE: mmdetection/mmdet/core/bbox/__init__.py ================================================ from .geometry import bbox_overlaps from .assigners import BaseAssigner, MaxIoUAssigner, AssignResult from .samplers import (BaseSampler, PseudoSampler, RandomSampler, InstanceBalancedPosSampler, IoUBalancedNegSampler, CombinedSampler, SamplingResult) from .assign_sampling import build_assigner, build_sampler, assign_and_sample from .transforms import (bbox2delta, delta2bbox, bbox_flip, bbox_mapping, bbox_mapping_back, bbox2roi, roi2bbox, bbox2result, distance2bbox) from .bbox_target import bbox_target __all__ = [ 'bbox_overlaps', 'BaseAssigner', 'MaxIoUAssigner', 'AssignResult', 'BaseSampler', 'PseudoSampler', 'RandomSampler', 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', 'SamplingResult', 'build_assigner', 'build_sampler', 'assign_and_sample', 'bbox2delta', 'delta2bbox', 'bbox_flip', 'bbox_mapping', 'bbox_mapping_back', 'bbox2roi', 'roi2bbox', 'bbox2result', 'distance2bbox', 'bbox_target' ] ================================================ FILE: mmdetection/mmdet/core/bbox/assign_sampling.py ================================================ import mmcv from . import assigners, samplers def build_assigner(cfg, **kwargs): if isinstance(cfg, assigners.BaseAssigner): return cfg elif isinstance(cfg, dict): return mmcv.runner.obj_from_dict( cfg, assigners, default_args=kwargs) else: raise TypeError('Invalid type {} for building a sampler'.format( type(cfg))) def build_sampler(cfg, **kwargs): if isinstance(cfg, samplers.BaseSampler): return cfg elif isinstance(cfg, dict): return mmcv.runner.obj_from_dict( cfg, samplers, default_args=kwargs) else: raise TypeError('Invalid type {} for building a sampler'.format( type(cfg))) def assign_and_sample(bboxes, gt_bboxes, gt_bboxes_ignore, gt_labels, cfg): bbox_assigner = build_assigner(cfg.assigner) bbox_sampler = build_sampler(cfg.sampler) assign_result = bbox_assigner.assign(bboxes, gt_bboxes, gt_bboxes_ignore, gt_labels) sampling_result = bbox_sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels) return assign_result, sampling_result ================================================ FILE: mmdetection/mmdet/core/bbox/assigners/__init__.py ================================================ from .base_assigner import BaseAssigner from .max_iou_assigner import MaxIoUAssigner from .assign_result import AssignResult __all__ = ['BaseAssigner', 'MaxIoUAssigner', 'AssignResult'] ================================================ FILE: mmdetection/mmdet/core/bbox/assigners/assign_result.py ================================================ import torch class AssignResult(object): def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): self.num_gts = num_gts self.gt_inds = gt_inds self.max_overlaps = max_overlaps self.labels = labels def add_gt_(self, gt_labels): self_inds = torch.arange( 1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device) self.gt_inds = torch.cat([self_inds, self.gt_inds]) self.max_overlaps = torch.cat( [self.max_overlaps.new_ones(self.num_gts), self.max_overlaps]) if self.labels is not None: self.labels = torch.cat([gt_labels, self.labels]) ================================================ FILE: mmdetection/mmdet/core/bbox/assigners/base_assigner.py ================================================ from abc import ABCMeta, abstractmethod class BaseAssigner(metaclass=ABCMeta): @abstractmethod def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): pass ================================================ FILE: mmdetection/mmdet/core/bbox/assigners/max_iou_assigner.py ================================================ import torch from .base_assigner import BaseAssigner from .assign_result import AssignResult from ..geometry import bbox_overlaps class MaxIoUAssigner(BaseAssigner): """Assign a corresponding gt bbox or background to each bbox. Each proposals will be assigned with `-1`, `0`, or a positive integer indicating the ground truth index. - -1: don't care - 0: negative sample, no assigned gt - positive integer: positive sample, index (1-based) of assigned gt Args: pos_iou_thr (float): IoU threshold for positive bboxes. neg_iou_thr (float or tuple): IoU threshold for negative bboxes. min_pos_iou (float): Minimum iou for a bbox to be considered as a positive bbox. Positive samples can have smaller IoU than pos_iou_thr due to the 4th step (assign max IoU sample to each gt). gt_max_assign_all (bool): Whether to assign all bboxes with the same highest overlap with some gt to that gt. ignore_iof_thr (float): IoF threshold for ignoring bboxes (if `gt_bboxes_ignore` is specified). Negative values mean not ignoring any bboxes. ignore_wrt_candidates (bool): Whether to compute the iof between `bboxes` and `gt_bboxes_ignore`, or the contrary. """ def __init__(self, pos_iou_thr, neg_iou_thr, min_pos_iou=.0, gt_max_assign_all=True, ignore_iof_thr=-1, ignore_wrt_candidates=True): self.pos_iou_thr = pos_iou_thr self.neg_iou_thr = neg_iou_thr self.min_pos_iou = min_pos_iou self.gt_max_assign_all = gt_max_assign_all self.ignore_iof_thr = ignore_iof_thr self.ignore_wrt_candidates = ignore_wrt_candidates def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): """Assign gt to bboxes. This method assign a gt bbox to every bbox (proposal/anchor), each bbox will be assigned with -1, 0, or a positive number. -1 means don't care, 0 means negative sample, positive number is the index (1-based) of assigned gt. The assignment is done in following steps, the order matters. 1. assign every bbox to -1 2. assign proposals whose iou with all gts < neg_iou_thr to 0 3. for each bbox, if the iou with its nearest gt >= pos_iou_thr, assign it to that bbox 4. for each gt bbox, assign its nearest proposals (may be more than one) to itself Args: bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4). gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4). gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are labelled as `ignored`, e.g., crowd boxes in COCO. gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ). Returns: :obj:`AssignResult`: The assign result. """ if bboxes.shape[0] == 0 or gt_bboxes.shape[0] == 0: raise ValueError('No gt or bboxes') bboxes = bboxes[:, :4] overlaps = bbox_overlaps(gt_bboxes, bboxes) if (self.ignore_iof_thr > 0) and (gt_bboxes_ignore is not None) and ( gt_bboxes_ignore.numel() > 0): if self.ignore_wrt_candidates: ignore_overlaps = bbox_overlaps( bboxes, gt_bboxes_ignore, mode='iof') ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) else: ignore_overlaps = bbox_overlaps( gt_bboxes_ignore, bboxes, mode='iof') ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) return assign_result def assign_wrt_overlaps(self, overlaps, gt_labels=None): """Assign w.r.t. the overlaps of bboxes with gts. Args: overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes, shape(k, n). gt_labels (Tensor, optional): Labels of k gt_bboxes, shape (k, ). Returns: :obj:`AssignResult`: The assign result. """ if overlaps.numel() == 0: raise ValueError('No gt or proposals') num_gts, num_bboxes = overlaps.size(0), overlaps.size(1) # 1. assign -1 by default assigned_gt_inds = overlaps.new_full( (num_bboxes, ), -1, dtype=torch.long) # for each anchor, which gt best overlaps with it # for each anchor, the max iou of all gts max_overlaps, argmax_overlaps = overlaps.max(dim=0) # for each gt, which anchor best overlaps with it # for each gt, the max iou of all proposals gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1) # 2. assign negative: below if isinstance(self.neg_iou_thr, float): assigned_gt_inds[(max_overlaps >= 0) & (max_overlaps < self.neg_iou_thr)] = 0 elif isinstance(self.neg_iou_thr, tuple): assert len(self.neg_iou_thr) == 2 assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0]) & (max_overlaps < self.neg_iou_thr[1])] = 0 # 3. assign positive: above positive IoU threshold pos_inds = max_overlaps >= self.pos_iou_thr assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1 # 4. assign fg: for each gt, proposals with highest IoU for i in range(num_gts): if gt_max_overlaps[i] >= self.min_pos_iou: if self.gt_max_assign_all: max_iou_inds = overlaps[i, :] == gt_max_overlaps[i] assigned_gt_inds[max_iou_inds] = i + 1 else: assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1 if gt_labels is not None: assigned_labels = assigned_gt_inds.new_zeros((num_bboxes, )) pos_inds = torch.nonzero(assigned_gt_inds > 0).squeeze() if pos_inds.numel() > 0: assigned_labels[pos_inds] = gt_labels[ assigned_gt_inds[pos_inds] - 1] else: assigned_labels = None return AssignResult( num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels) ================================================ FILE: mmdetection/mmdet/core/bbox/bbox_target.py ================================================ import torch from .transforms import bbox2delta from ..utils import multi_apply def bbox_target(pos_bboxes_list, neg_bboxes_list, pos_gt_bboxes_list, pos_gt_labels_list, cfg, reg_classes=1, target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], concat=True): labels, label_weights, bbox_targets, bbox_weights = multi_apply( bbox_target_single, pos_bboxes_list, neg_bboxes_list, pos_gt_bboxes_list, pos_gt_labels_list, cfg=cfg, reg_classes=reg_classes, target_means=target_means, target_stds=target_stds) if concat: labels = torch.cat(labels, 0) label_weights = torch.cat(label_weights, 0) bbox_targets = torch.cat(bbox_targets, 0) bbox_weights = torch.cat(bbox_weights, 0) return labels, label_weights, bbox_targets, bbox_weights def bbox_target_single(pos_bboxes, neg_bboxes, pos_gt_bboxes, pos_gt_labels, cfg, reg_classes=1, target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]): num_pos = pos_bboxes.size(0) num_neg = neg_bboxes.size(0) num_samples = num_pos + num_neg labels = pos_bboxes.new_zeros(num_samples, dtype=torch.long) label_weights = pos_bboxes.new_zeros(num_samples) bbox_targets = pos_bboxes.new_zeros(num_samples, 4) bbox_weights = pos_bboxes.new_zeros(num_samples, 4) if num_pos > 0: labels[:num_pos] = pos_gt_labels pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight label_weights[:num_pos] = pos_weight pos_bbox_targets = bbox2delta(pos_bboxes, pos_gt_bboxes, target_means, target_stds) bbox_targets[:num_pos, :] = pos_bbox_targets bbox_weights[:num_pos, :] = 1 if num_neg > 0: label_weights[-num_neg:] = 1.0 return labels, label_weights, bbox_targets, bbox_weights def expand_target(bbox_targets, bbox_weights, labels, num_classes): bbox_targets_expand = bbox_targets.new_zeros((bbox_targets.size(0), 4 * num_classes)) bbox_weights_expand = bbox_weights.new_zeros((bbox_weights.size(0), 4 * num_classes)) for i in torch.nonzero(labels > 0).squeeze(-1): start, end = labels[i] * 4, (labels[i] + 1) * 4 bbox_targets_expand[i, start:end] = bbox_targets[i, :] bbox_weights_expand[i, start:end] = bbox_weights[i, :] return bbox_targets_expand, bbox_weights_expand ================================================ FILE: mmdetection/mmdet/core/bbox/geometry.py ================================================ import torch def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False): """Calculate overlap between two set of bboxes. If ``is_aligned`` is ``False``, then calculate the ious between each bbox of bboxes1 and bboxes2, otherwise the ious between each aligned pair of bboxes1 and bboxes2. Args: bboxes1 (Tensor): shape (m, 4) bboxes2 (Tensor): shape (n, 4), if is_aligned is ``True``, then m and n must be equal. mode (str): "iou" (intersection over union) or iof (intersection over foreground). Returns: ious(Tensor): shape (m, n) if is_aligned == False else shape (m, 1) """ assert mode in ['iou', 'iof'] rows = bboxes1.size(0) cols = bboxes2.size(0) if is_aligned: assert rows == cols if rows * cols == 0: return bboxes1.new(rows, 1) if is_aligned else bboxes1.new(rows, cols) if is_aligned: lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2] rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2] wh = (rb - lt + 1).clamp(min=0) # [rows, 2] overlap = wh[:, 0] * wh[:, 1] area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * ( bboxes1[:, 3] - bboxes1[:, 1] + 1) if mode == 'iou': area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * ( bboxes2[:, 3] - bboxes2[:, 1] + 1) ious = overlap / (area1 + area2 - overlap) else: ious = overlap / area1 else: lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2] rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2] wh = (rb - lt + 1).clamp(min=0) # [rows, cols, 2] overlap = wh[:, :, 0] * wh[:, :, 1] area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * ( bboxes1[:, 3] - bboxes1[:, 1] + 1) if mode == 'iou': area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * ( bboxes2[:, 3] - bboxes2[:, 1] + 1) ious = overlap / (area1[:, None] + area2 - overlap) else: ious = overlap / (area1[:, None]) return ious ================================================ FILE: mmdetection/mmdet/core/bbox/samplers/__init__.py ================================================ from .base_sampler import BaseSampler from .pseudo_sampler import PseudoSampler from .random_sampler import RandomSampler from .instance_balanced_pos_sampler import InstanceBalancedPosSampler from .iou_balanced_neg_sampler import IoUBalancedNegSampler from .combined_sampler import CombinedSampler from .ohem_sampler import OHEMSampler from .sampling_result import SamplingResult __all__ = [ 'BaseSampler', 'PseudoSampler', 'RandomSampler', 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', 'OHEMSampler', 'SamplingResult' ] ================================================ FILE: mmdetection/mmdet/core/bbox/samplers/base_sampler.py ================================================ from abc import ABCMeta, abstractmethod import torch from .sampling_result import SamplingResult class BaseSampler(metaclass=ABCMeta): def __init__(self, num, pos_fraction, neg_pos_ub=-1, add_gt_as_proposals=True, **kwargs): self.num = num self.pos_fraction = pos_fraction self.neg_pos_ub = neg_pos_ub self.add_gt_as_proposals = add_gt_as_proposals self.pos_sampler = self self.neg_sampler = self @abstractmethod def _sample_pos(self, assign_result, num_expected, **kwargs): pass @abstractmethod def _sample_neg(self, assign_result, num_expected, **kwargs): pass def sample(self, assign_result, bboxes, gt_bboxes, gt_labels=None, **kwargs): """Sample positive and negative bboxes. This is a simple implementation of bbox sampling given candidates, assigning results and ground truth bboxes. Args: assign_result (:obj:`AssignResult`): Bbox assigning results. bboxes (Tensor): Boxes to be sampled from. gt_bboxes (Tensor): Ground truth bboxes. gt_labels (Tensor, optional): Class labels of ground truth bboxes. Returns: :obj:`SamplingResult`: Sampling result. """ bboxes = bboxes[:, :4] gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8) if self.add_gt_as_proposals: bboxes = torch.cat([gt_bboxes, bboxes], dim=0) assign_result.add_gt_(gt_labels) gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8) gt_flags = torch.cat([gt_ones, gt_flags]) num_expected_pos = int(self.num * self.pos_fraction) pos_inds = self.pos_sampler._sample_pos( assign_result, num_expected_pos, bboxes=bboxes, **kwargs) # We found that sampled indices have duplicated items occasionally. # (may be a bug of PyTorch) pos_inds = pos_inds.unique() num_sampled_pos = pos_inds.numel() num_expected_neg = self.num - num_sampled_pos if self.neg_pos_ub >= 0: _pos = max(1, num_sampled_pos) neg_upper_bound = int(self.neg_pos_ub * _pos) if num_expected_neg > neg_upper_bound: num_expected_neg = neg_upper_bound neg_inds = self.neg_sampler._sample_neg( assign_result, num_expected_neg, bboxes=bboxes, **kwargs) neg_inds = neg_inds.unique() return SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags) ================================================ FILE: mmdetection/mmdet/core/bbox/samplers/combined_sampler.py ================================================ from .base_sampler import BaseSampler from ..assign_sampling import build_sampler class CombinedSampler(BaseSampler): def __init__(self, pos_sampler, neg_sampler, **kwargs): super(CombinedSampler, self).__init__(**kwargs) self.pos_sampler = build_sampler(pos_sampler, **kwargs) self.neg_sampler = build_sampler(neg_sampler, **kwargs) def _sample_pos(self, **kwargs): raise NotImplementedError def _sample_neg(self, **kwargs): raise NotImplementedError ================================================ FILE: mmdetection/mmdet/core/bbox/samplers/instance_balanced_pos_sampler.py ================================================ import numpy as np import torch from .random_sampler import RandomSampler class InstanceBalancedPosSampler(RandomSampler): def _sample_pos(self, assign_result, num_expected, **kwargs): pos_inds = torch.nonzero(assign_result.gt_inds > 0) if pos_inds.numel() != 0: pos_inds = pos_inds.squeeze(1) if pos_inds.numel() <= num_expected: return pos_inds else: unique_gt_inds = assign_result.gt_inds[pos_inds].unique() num_gts = len(unique_gt_inds) num_per_gt = int(round(num_expected / float(num_gts)) + 1) sampled_inds = [] for i in unique_gt_inds: inds = torch.nonzero(assign_result.gt_inds == i.item()) if inds.numel() != 0: inds = inds.squeeze(1) else: continue if len(inds) > num_per_gt: inds = self.random_choice(inds, num_per_gt) sampled_inds.append(inds) sampled_inds = torch.cat(sampled_inds) if len(sampled_inds) < num_expected: num_extra = num_expected - len(sampled_inds) extra_inds = np.array( list(set(pos_inds.cpu()) - set(sampled_inds.cpu()))) if len(extra_inds) > num_extra: extra_inds = self.random_choice(extra_inds, num_extra) extra_inds = torch.from_numpy(extra_inds).to( assign_result.gt_inds.device).long() sampled_inds = torch.cat([sampled_inds, extra_inds]) elif len(sampled_inds) > num_expected: sampled_inds = self.random_choice(sampled_inds, num_expected) return sampled_inds ================================================ FILE: mmdetection/mmdet/core/bbox/samplers/iou_balanced_neg_sampler.py ================================================ import numpy as np import torch from .random_sampler import RandomSampler class IoUBalancedNegSampler(RandomSampler): def __init__(self, num, pos_fraction, hard_thr=0.1, hard_fraction=0.5, **kwargs): super(IoUBalancedNegSampler, self).__init__(num, pos_fraction, **kwargs) assert hard_thr > 0 assert 0 < hard_fraction < 1 self.hard_thr = hard_thr self.hard_fraction = hard_fraction def _sample_neg(self, assign_result, num_expected, **kwargs): neg_inds = torch.nonzero(assign_result.gt_inds == 0) if neg_inds.numel() != 0: neg_inds = neg_inds.squeeze(1) if len(neg_inds) <= num_expected: return neg_inds else: max_overlaps = assign_result.max_overlaps.cpu().numpy() # balance sampling for negative samples neg_set = set(neg_inds.cpu().numpy()) easy_set = set( np.where( np.logical_and(max_overlaps >= 0, max_overlaps < self.hard_thr))[0]) hard_set = set(np.where(max_overlaps >= self.hard_thr)[0]) easy_neg_inds = list(easy_set & neg_set) hard_neg_inds = list(hard_set & neg_set) num_expected_hard = int(num_expected * self.hard_fraction) if len(hard_neg_inds) > num_expected_hard: sampled_hard_inds = self.random_choice(hard_neg_inds, num_expected_hard) else: sampled_hard_inds = np.array(hard_neg_inds, dtype=np.int) num_expected_easy = num_expected - len(sampled_hard_inds) if len(easy_neg_inds) > num_expected_easy: sampled_easy_inds = self.random_choice(easy_neg_inds, num_expected_easy) else: sampled_easy_inds = np.array(easy_neg_inds, dtype=np.int) sampled_inds = np.concatenate((sampled_easy_inds, sampled_hard_inds)) if len(sampled_inds) < num_expected: num_extra = num_expected - len(sampled_inds) extra_inds = np.array(list(neg_set - set(sampled_inds))) if len(extra_inds) > num_extra: extra_inds = self.random_choice(extra_inds, num_extra) sampled_inds = np.concatenate((sampled_inds, extra_inds)) sampled_inds = torch.from_numpy(sampled_inds).long().to( assign_result.gt_inds.device) return sampled_inds ================================================ FILE: mmdetection/mmdet/core/bbox/samplers/ohem_sampler.py ================================================ import torch from .base_sampler import BaseSampler from ..transforms import bbox2roi class OHEMSampler(BaseSampler): def __init__(self, num, pos_fraction, context, neg_pos_ub=-1, add_gt_as_proposals=True, **kwargs): super(OHEMSampler, self).__init__(num, pos_fraction, neg_pos_ub, add_gt_as_proposals) if not hasattr(context, 'num_stages'): self.bbox_roi_extractor = context.bbox_roi_extractor self.bbox_head = context.bbox_head else: self.bbox_roi_extractor = context.bbox_roi_extractor[ context.current_stage] self.bbox_head = context.bbox_head[context.current_stage] def hard_mining(self, inds, num_expected, bboxes, labels, feats): with torch.no_grad(): rois = bbox2roi([bboxes]) bbox_feats = self.bbox_roi_extractor( feats[:self.bbox_roi_extractor.num_inputs], rois) cls_score, _ = self.bbox_head(bbox_feats) loss = self.bbox_head.loss( cls_score=cls_score, bbox_pred=None, labels=labels, label_weights=cls_score.new_ones(cls_score.size(0)), bbox_targets=None, bbox_weights=None, reduce=False)['loss_cls'] _, topk_loss_inds = loss.topk(num_expected) return inds[topk_loss_inds] def _sample_pos(self, assign_result, num_expected, bboxes=None, feats=None, **kwargs): # Sample some hard positive samples pos_inds = torch.nonzero(assign_result.gt_inds > 0) if pos_inds.numel() != 0: pos_inds = pos_inds.squeeze(1) if pos_inds.numel() <= num_expected: return pos_inds else: return self.hard_mining(pos_inds, num_expected, bboxes[pos_inds], assign_result.labels[pos_inds], feats) def _sample_neg(self, assign_result, num_expected, bboxes=None, feats=None, **kwargs): # Sample some hard negative samples neg_inds = torch.nonzero(assign_result.gt_inds == 0) if neg_inds.numel() != 0: neg_inds = neg_inds.squeeze(1) if len(neg_inds) <= num_expected: return neg_inds else: return self.hard_mining(neg_inds, num_expected, bboxes[neg_inds], assign_result.labels[neg_inds], feats) ================================================ FILE: mmdetection/mmdet/core/bbox/samplers/pseudo_sampler.py ================================================ import torch from .base_sampler import BaseSampler from .sampling_result import SamplingResult class PseudoSampler(BaseSampler): def __init__(self, **kwargs): pass def _sample_pos(self, **kwargs): raise NotImplementedError def _sample_neg(self, **kwargs): raise NotImplementedError def sample(self, assign_result, bboxes, gt_bboxes, **kwargs): pos_inds = torch.nonzero( assign_result.gt_inds > 0).squeeze(-1).unique() neg_inds = torch.nonzero( assign_result.gt_inds == 0).squeeze(-1).unique() gt_flags = bboxes.new_zeros(bboxes.shape[0], dtype=torch.uint8) sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags) return sampling_result ================================================ FILE: mmdetection/mmdet/core/bbox/samplers/random_sampler.py ================================================ import numpy as np import torch from .base_sampler import BaseSampler class RandomSampler(BaseSampler): def __init__(self, num, pos_fraction, neg_pos_ub=-1, add_gt_as_proposals=True, **kwargs): super(RandomSampler, self).__init__(num, pos_fraction, neg_pos_ub, add_gt_as_proposals) @staticmethod def random_choice(gallery, num): """Random select some elements from the gallery. It seems that Pytorch's implementation is slower than numpy so we use numpy to randperm the indices. """ assert len(gallery) >= num if isinstance(gallery, list): gallery = np.array(gallery) cands = np.arange(len(gallery)) np.random.shuffle(cands) rand_inds = cands[:num] if not isinstance(gallery, np.ndarray): rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device) return gallery[rand_inds] def _sample_pos(self, assign_result, num_expected, **kwargs): """Randomly sample some positive samples.""" pos_inds = torch.nonzero(assign_result.gt_inds > 0) if pos_inds.numel() != 0: pos_inds = pos_inds.squeeze(1) if pos_inds.numel() <= num_expected: return pos_inds else: return self.random_choice(pos_inds, num_expected) def _sample_neg(self, assign_result, num_expected, **kwargs): """Randomly sample some negative samples.""" neg_inds = torch.nonzero(assign_result.gt_inds == 0) if neg_inds.numel() != 0: neg_inds = neg_inds.squeeze(1) if len(neg_inds) <= num_expected: return neg_inds else: return self.random_choice(neg_inds, num_expected) ================================================ FILE: mmdetection/mmdet/core/bbox/samplers/sampling_result.py ================================================ import torch class SamplingResult(object): def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags): self.pos_inds = pos_inds self.neg_inds = neg_inds self.pos_bboxes = bboxes[pos_inds] self.neg_bboxes = bboxes[neg_inds] self.pos_is_gt = gt_flags[pos_inds] self.num_gts = gt_bboxes.shape[0] self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds, :] if assign_result.labels is not None: self.pos_gt_labels = assign_result.labels[pos_inds] else: self.pos_gt_labels = None @property def bboxes(self): return torch.cat([self.pos_bboxes, self.neg_bboxes]) ================================================ FILE: mmdetection/mmdet/core/bbox/transforms.py ================================================ import mmcv import numpy as np import torch def bbox2delta(proposals, gt, means=[0, 0, 0, 0], stds=[1, 1, 1, 1]): assert proposals.size() == gt.size() proposals = proposals.float() gt = gt.float() px = (proposals[..., 0] + proposals[..., 2]) * 0.5 py = (proposals[..., 1] + proposals[..., 3]) * 0.5 pw = proposals[..., 2] - proposals[..., 0] + 1.0 ph = proposals[..., 3] - proposals[..., 1] + 1.0 gx = (gt[..., 0] + gt[..., 2]) * 0.5 gy = (gt[..., 1] + gt[..., 3]) * 0.5 gw = gt[..., 2] - gt[..., 0] + 1.0 gh = gt[..., 3] - gt[..., 1] + 1.0 dx = (gx - px) / pw dy = (gy - py) / ph dw = torch.log(gw / pw) dh = torch.log(gh / ph) deltas = torch.stack([dx, dy, dw, dh], dim=-1) means = deltas.new_tensor(means).unsqueeze(0) stds = deltas.new_tensor(stds).unsqueeze(0) deltas = deltas.sub_(means).div_(stds) return deltas def delta2bbox(rois, deltas, means=[0, 0, 0, 0], stds=[1, 1, 1, 1], max_shape=None, wh_ratio_clip=16 / 1000): means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4) stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4) denorm_deltas = deltas * stds + means dx = denorm_deltas[:, 0::4] dy = denorm_deltas[:, 1::4] dw = denorm_deltas[:, 2::4] dh = denorm_deltas[:, 3::4] max_ratio = np.abs(np.log(wh_ratio_clip)) dw = dw.clamp(min=-max_ratio, max=max_ratio) dh = dh.clamp(min=-max_ratio, max=max_ratio) px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx) py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy) pw = (rois[:, 2] - rois[:, 0] + 1.0).unsqueeze(1).expand_as(dw) ph = (rois[:, 3] - rois[:, 1] + 1.0).unsqueeze(1).expand_as(dh) gw = pw * dw.exp() gh = ph * dh.exp() gx = torch.addcmul(px, 1, pw, dx) # gx = px + pw * dx gy = torch.addcmul(py, 1, ph, dy) # gy = py + ph * dy x1 = gx - gw * 0.5 + 0.5 y1 = gy - gh * 0.5 + 0.5 x2 = gx + gw * 0.5 - 0.5 y2 = gy + gh * 0.5 - 0.5 if max_shape is not None: x1 = x1.clamp(min=0, max=max_shape[1] - 1) y1 = y1.clamp(min=0, max=max_shape[0] - 1) x2 = x2.clamp(min=0, max=max_shape[1] - 1) y2 = y2.clamp(min=0, max=max_shape[0] - 1) bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas) return bboxes def bbox_flip(bboxes, img_shape): """Flip bboxes horizontally. Args: bboxes(Tensor or ndarray): Shape (..., 4*k) img_shape(tuple): Image shape. Returns: Same type as `bboxes`: Flipped bboxes. """ if isinstance(bboxes, torch.Tensor): assert bboxes.shape[-1] % 4 == 0 flipped = bboxes.clone() flipped[:, 0::4] = img_shape[1] - bboxes[:, 2::4] - 1 flipped[:, 2::4] = img_shape[1] - bboxes[:, 0::4] - 1 return flipped elif isinstance(bboxes, np.ndarray): return mmcv.bbox_flip(bboxes, img_shape) def bbox_mapping(bboxes, img_shape, scale_factor, flip): """Map bboxes from the original image scale to testing scale""" new_bboxes = bboxes * scale_factor if flip: new_bboxes = bbox_flip(new_bboxes, img_shape) return new_bboxes def bbox_mapping_back(bboxes, img_shape, scale_factor, flip): """Map bboxes from testing scale to original image scale""" new_bboxes = bbox_flip(bboxes, img_shape) if flip else bboxes new_bboxes = new_bboxes / scale_factor return new_bboxes def bbox2roi(bbox_list): """Convert a list of bboxes to roi format. Args: bbox_list (list[Tensor]): a list of bboxes corresponding to a batch of images. Returns: Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2] """ rois_list = [] for img_id, bboxes in enumerate(bbox_list): if bboxes.size(0) > 0: img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1) else: rois = bboxes.new_zeros((0, 5)) rois_list.append(rois) rois = torch.cat(rois_list, 0) return rois def roi2bbox(rois): bbox_list = [] img_ids = torch.unique(rois[:, 0].cpu(), sorted=True) for img_id in img_ids: inds = (rois[:, 0] == img_id.item()) bbox = rois[inds, 1:] bbox_list.append(bbox) return bbox_list def bbox2result(bboxes, labels, num_classes): """Convert detection results to a list of numpy arrays. Args: bboxes (Tensor): shape (n, 5) labels (Tensor): shape (n, ) num_classes (int): class number, including background class Returns: list(ndarray): bbox results of each class """ if bboxes.shape[0] == 0: return [ np.zeros((0, 5), dtype=np.float32) for i in range(num_classes - 1) ] else: bboxes = bboxes.cpu().numpy() labels = labels.cpu().numpy() return [bboxes[labels == i, :] for i in range(num_classes - 1)] def distance2bbox(points, distance, max_shape=None): """Decode distance prediction to bounding box. Args: points (Tensor): Shape (n, 2), [x, y]. distance (Tensor): Distance from the given point to 4 boundaries (left, top, right, bottom). max_shape (tuple): Shape of the image. Returns: Tensor: Decoded bboxes. """ x1 = points[:, 0] - distance[:, 0] y1 = points[:, 1] - distance[:, 1] x2 = points[:, 0] + distance[:, 2] y2 = points[:, 1] + distance[:, 3] if max_shape is not None: x1 = x1.clamp(min=0, max=max_shape[1] - 1) y1 = y1.clamp(min=0, max=max_shape[0] - 1) x2 = x2.clamp(min=0, max=max_shape[1] - 1) y2 = y2.clamp(min=0, max=max_shape[0] - 1) return torch.stack([x1, y1, x2, y2], -1) ================================================ FILE: mmdetection/mmdet/core/evaluation/__init__.py ================================================ from .class_names import (voc_classes, imagenet_det_classes, imagenet_vid_classes, coco_classes, dataset_aliases, get_classes) from .coco_utils import coco_eval, fast_eval_recall, results2json from .eval_hooks import (DistEvalHook, DistEvalmAPHook, CocoDistEvalRecallHook, CocoDistEvalmAPHook) from .mean_ap import average_precision, eval_map, print_map_summary from .recall import (eval_recalls, print_recall_summary, plot_num_recall, plot_iou_recall) __all__ = [ 'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes', 'coco_classes', 'dataset_aliases', 'get_classes', 'coco_eval', 'fast_eval_recall', 'results2json', 'DistEvalHook', 'DistEvalmAPHook', 'CocoDistEvalRecallHook', 'CocoDistEvalmAPHook', 'average_precision', 'eval_map', 'print_map_summary', 'eval_recalls', 'print_recall_summary', 'plot_num_recall', 'plot_iou_recall' ] ================================================ FILE: mmdetection/mmdet/core/evaluation/bbox_overlaps.py ================================================ import numpy as np def bbox_overlaps(bboxes1, bboxes2, mode='iou'): """Calculate the ious between each bbox of bboxes1 and bboxes2. Args: bboxes1(ndarray): shape (n, 4) bboxes2(ndarray): shape (k, 4) mode(str): iou (intersection over union) or iof (intersection over foreground) Returns: ious(ndarray): shape (n, k) """ assert mode in ['iou', 'iof'] bboxes1 = bboxes1.astype(np.float32) bboxes2 = bboxes2.astype(np.float32) rows = bboxes1.shape[0] cols = bboxes2.shape[0] ious = np.zeros((rows, cols), dtype=np.float32) if rows * cols == 0: return ious exchange = False if bboxes1.shape[0] > bboxes2.shape[0]: bboxes1, bboxes2 = bboxes2, bboxes1 ious = np.zeros((cols, rows), dtype=np.float32) exchange = True area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * ( bboxes1[:, 3] - bboxes1[:, 1] + 1) area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * ( bboxes2[:, 3] - bboxes2[:, 1] + 1) for i in range(bboxes1.shape[0]): x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) overlap = np.maximum(x_end - x_start + 1, 0) * np.maximum( y_end - y_start + 1, 0) if mode == 'iou': union = area1[i] + area2 - overlap else: union = area1[i] if not exchange else area2 ious[i, :] = overlap / union if exchange: ious = ious.T return ious ================================================ FILE: mmdetection/mmdet/core/evaluation/class_names.py ================================================ import mmcv def voc_classes(): return [ 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor' ] def imagenet_det_classes(): return [ 'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo', 'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam', 'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap', 'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder', 'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito', 'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle', 'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker', 'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew', 'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper', 'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly', 'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig', 'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog', 'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart', 'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger', 'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim', 'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse', 'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle', 'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard', 'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can', 'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace', 'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume', 'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza', 'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine', 'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse', 'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator', 'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler', 'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver', 'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile', 'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula', 'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer', 'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine', 'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie', 'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet', 'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin', 'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft', 'whale', 'wine_bottle', 'zebra' ] def imagenet_vid_classes(): return [ 'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car', 'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda', 'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit', 'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle', 'watercraft', 'whale', 'zebra' ] def coco_classes(): return [ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign', 'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard', 'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush' ] dataset_aliases = { 'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'], 'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'], 'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'], 'coco': ['coco', 'mscoco', 'ms_coco'] } def get_classes(dataset): """Get class names of a dataset.""" alias2name = {} for name, aliases in dataset_aliases.items(): for alias in aliases: alias2name[alias] = name if mmcv.is_str(dataset): if dataset in alias2name: labels = eval(alias2name[dataset] + '_classes()') else: raise ValueError('Unrecognized dataset: {}'.format(dataset)) else: raise TypeError('dataset must a str, but got {}'.format(type(dataset))) return labels ================================================ FILE: mmdetection/mmdet/core/evaluation/coco_utils.py ================================================ import mmcv import numpy as np from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval from .recall import eval_recalls def coco_eval(result_file, result_types, coco, max_dets=(100, 300, 1000)): for res_type in result_types: assert res_type in [ 'proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints' ] if mmcv.is_str(coco): coco = COCO(coco) assert isinstance(coco, COCO) if result_types == ['proposal_fast']: ar = fast_eval_recall(result_file, coco, np.array(max_dets)) for i, num in enumerate(max_dets): print('AR@{}\t= {:.4f}'.format(num, ar[i])) return assert result_file.endswith('.json') coco_dets = coco.loadRes(result_file) img_ids = coco.getImgIds() for res_type in result_types: iou_type = 'bbox' if res_type == 'proposal' else res_type cocoEval = COCOeval(coco, coco_dets, iou_type) cocoEval.params.imgIds = img_ids if res_type == 'proposal': cocoEval.params.useCats = 0 cocoEval.params.maxDets = list(max_dets) cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() def fast_eval_recall(results, coco, max_dets, iou_thrs=np.arange(0.5, 0.96, 0.05)): if mmcv.is_str(results): assert results.endswith('.pkl') results = mmcv.load(results) elif not isinstance(results, list): raise TypeError( 'results must be a list of numpy arrays or a filename, not {}'. format(type(results))) gt_bboxes = [] img_ids = coco.getImgIds() for i in range(len(img_ids)): ann_ids = coco.getAnnIds(imgIds=img_ids[i]) ann_info = coco.loadAnns(ann_ids) if len(ann_info) == 0: gt_bboxes.append(np.zeros((0, 4))) continue bboxes = [] for ann in ann_info: if ann.get('ignore', False) or ann['iscrowd']: continue x1, y1, w, h = ann['bbox'] bboxes.append([x1, y1, x1 + w - 1, y1 + h - 1]) bboxes = np.array(bboxes, dtype=np.float32) if bboxes.shape[0] == 0: bboxes = np.zeros((0, 4)) gt_bboxes.append(bboxes) recalls = eval_recalls( gt_bboxes, results, max_dets, iou_thrs, print_summary=False) ar = recalls.mean(axis=1) return ar def xyxy2xywh(bbox): _bbox = bbox.tolist() return [ _bbox[0], _bbox[1], _bbox[2] - _bbox[0] + 1, _bbox[3] - _bbox[1] + 1, ] def proposal2json(dataset, results): json_results = [] for idx in range(len(dataset)): img_id = dataset.img_ids[idx] bboxes = results[idx] for i in range(bboxes.shape[0]): data = dict() data['image_id'] = img_id data['bbox'] = xyxy2xywh(bboxes[i]) data['score'] = float(bboxes[i][4]) data['category_id'] = 1 json_results.append(data) return json_results def det2json(dataset, results): json_results = [] for idx in range(len(dataset)): img_id = dataset.img_ids[idx] result = results[idx] for label in range(len(result)): bboxes = result[label] for i in range(bboxes.shape[0]): data = dict() data['image_id'] = img_id data['bbox'] = xyxy2xywh(bboxes[i]) data['score'] = float(bboxes[i][4]) data['category_id'] = dataset.cat_ids[label] json_results.append(data) return json_results def segm2json(dataset, results): json_results = [] for idx in range(len(dataset)): img_id = dataset.img_ids[idx] det, seg = results[idx] for label in range(len(det)): bboxes = det[label] segms = seg[label] for i in range(bboxes.shape[0]): data = dict() data['image_id'] = img_id data['bbox'] = xyxy2xywh(bboxes[i]) data['score'] = float(bboxes[i][4]) data['category_id'] = dataset.cat_ids[label] segms[i]['counts'] = segms[i]['counts'].decode() data['segmentation'] = segms[i] json_results.append(data) return json_results def results2json(dataset, results, out_file): if isinstance(results[0], list): json_results = det2json(dataset, results) elif isinstance(results[0], tuple): json_results = segm2json(dataset, results) elif isinstance(results[0], np.ndarray): json_results = proposal2json(dataset, results) else: raise TypeError('invalid type of results') mmcv.dump(json_results, out_file) ================================================ FILE: mmdetection/mmdet/core/evaluation/eval_hooks.py ================================================ import os import os.path as osp import mmcv import numpy as np import torch import torch.distributed as dist from mmcv.runner import Hook, obj_from_dict from mmcv.parallel import scatter, collate from pycocotools.cocoeval import COCOeval from torch.utils.data import Dataset from .coco_utils import results2json, fast_eval_recall from .mean_ap import eval_map from mmdet import datasets class DistEvalHook(Hook): def __init__(self, dataset, interval=1): if isinstance(dataset, Dataset): self.dataset = dataset elif isinstance(dataset, dict): self.dataset = obj_from_dict(dataset, datasets, {'test_mode': True}) else: raise TypeError( 'dataset must be a Dataset object or a dict, not {}'.format( type(dataset))) self.interval = interval def after_train_epoch(self, runner): if not self.every_n_epochs(runner, self.interval): return runner.model.eval() results = [None for _ in range(len(self.dataset))] if runner.rank == 0: prog_bar = mmcv.ProgressBar(len(self.dataset)) for idx in range(runner.rank, len(self.dataset), runner.world_size): data = self.dataset[idx] data_gpu = scatter( collate([data], samples_per_gpu=1), [torch.cuda.current_device()])[0] # compute output with torch.no_grad(): result = runner.model( return_loss=False, rescale=True, **data_gpu) results[idx] = result batch_size = runner.world_size if runner.rank == 0: for _ in range(batch_size): prog_bar.update() if runner.rank == 0: print('\n') dist.barrier() for i in range(1, runner.world_size): tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i)) tmp_results = mmcv.load(tmp_file) for idx in range(i, len(results), runner.world_size): results[idx] = tmp_results[idx] os.remove(tmp_file) self.evaluate(runner, results) else: tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(runner.rank)) mmcv.dump(results, tmp_file) dist.barrier() dist.barrier() def evaluate(self): raise NotImplementedError class DistEvalmAPHook(DistEvalHook): def evaluate(self, runner, results): gt_bboxes = [] gt_labels = [] gt_ignore = [] if self.dataset.with_crowd else None for i in range(len(self.dataset)): ann = self.dataset.get_ann_info(i) bboxes = ann['bboxes'] labels = ann['labels'] if gt_ignore is not None: ignore = np.concatenate([ np.zeros(bboxes.shape[0], dtype=np.bool), np.ones(ann['bboxes_ignore'].shape[0], dtype=np.bool) ]) gt_ignore.append(ignore) bboxes = np.vstack([bboxes, ann['bboxes_ignore']]) labels = np.concatenate([labels, ann['labels_ignore']]) gt_bboxes.append(bboxes) gt_labels.append(labels) # If the dataset is VOC2007, then use 11 points mAP evaluation. if hasattr(self.dataset, 'year') and self.dataset.year == 2007: ds_name = 'voc07' else: ds_name = self.dataset.CLASSES bbox_results = [] for result in results: bbox_results.append(result[0]) mean_ap, eval_results = eval_map( bbox_results, gt_bboxes, gt_labels, gt_ignore=gt_ignore, scale_ranges=None, iou_thr=0.5, dataset=ds_name, print_summary=True) runner.log_buffer.output['mAP'] = mean_ap runner.log_buffer.ready = True class CocoDistEvalRecallHook(DistEvalHook): def __init__(self, dataset, proposal_nums=(100, 300, 1000), iou_thrs=np.arange(0.5, 0.96, 0.05)): super(CocoDistEvalRecallHook, self).__init__(dataset) self.proposal_nums = np.array(proposal_nums, dtype=np.int32) self.iou_thrs = np.array(iou_thrs, dtype=np.float32) def evaluate(self, runner, results): # the official coco evaluation is too slow, here we use our own # implementation instead, which may get slightly different results ar = fast_eval_recall(results, self.dataset.coco, self.proposal_nums, self.iou_thrs) for i, num in enumerate(self.proposal_nums): runner.log_buffer.output['AR@{}'.format(num)] = ar[i] runner.log_buffer.ready = True class CocoDistEvalmAPHook(DistEvalHook): def evaluate(self, runner, results): tmp_file = osp.join(runner.work_dir, 'temp_0.json') results2json(self.dataset, results, tmp_file) res_types = ['bbox', 'segm'] if runner.model.module.with_mask else ['bbox'] cocoGt = self.dataset.coco cocoDt = cocoGt.loadRes(tmp_file) imgIds = cocoGt.getImgIds() for res_type in res_types: iou_type = res_type cocoEval = COCOeval(cocoGt, cocoDt, iou_type) cocoEval.params.imgIds = imgIds cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() metrics = ['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'] for i in range(len(metrics)): key = '{}_{}'.format(res_type, metrics[i]) val = float('{:.3f}'.format(cocoEval.stats[i])) runner.log_buffer.output[key] = val runner.log_buffer.output['{}_mAP_copypaste'.format(res_type)] = ( '{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' '{ap[4]:.3f} {ap[5]:.3f}').format(ap=cocoEval.stats[:6]) runner.log_buffer.ready = True os.remove(tmp_file) ================================================ FILE: mmdetection/mmdet/core/evaluation/mean_ap.py ================================================ import mmcv import numpy as np from terminaltables import AsciiTable from .bbox_overlaps import bbox_overlaps from .class_names import get_classes def average_precision(recalls, precisions, mode='area'): """Calculate average precision (for single or multiple scales). Args: recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) mode (str): 'area' or '11points', 'area' means calculating the area under precision-recall curve, '11points' means calculating the average precision of recalls at [0, 0.1, ..., 1] Returns: float or ndarray: calculated average precision """ no_scale = False if recalls.ndim == 1: no_scale = True recalls = recalls[np.newaxis, :] precisions = precisions[np.newaxis, :] assert recalls.shape == precisions.shape and recalls.ndim == 2 num_scales = recalls.shape[0] ap = np.zeros(num_scales, dtype=np.float32) if mode == 'area': zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) ones = np.ones((num_scales, 1), dtype=recalls.dtype) mrec = np.hstack((zeros, recalls, ones)) mpre = np.hstack((zeros, precisions, zeros)) for i in range(mpre.shape[1] - 1, 0, -1): mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) for i in range(num_scales): ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] ap[i] = np.sum( (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) elif mode == '11points': for i in range(num_scales): for thr in np.arange(0, 1 + 1e-3, 0.1): precs = precisions[i, recalls[i, :] >= thr] prec = precs.max() if precs.size > 0 else 0 ap[i] += prec ap /= 11 else: raise ValueError( 'Unrecognized mode, only "area" and "11points" are supported') if no_scale: ap = ap[0] return ap def tpfp_imagenet(det_bboxes, gt_bboxes, gt_ignore, default_iou_thr, area_ranges=None): """Check if detected bboxes are true positive or false positive. Args: det_bbox (ndarray): the detected bbox gt_bboxes (ndarray): ground truth bboxes of this image gt_ignore (ndarray): indicate if gts are ignored for evaluation or not default_iou_thr (float): the iou thresholds for medium and large bboxes area_ranges (list or None): gt bbox area ranges Returns: tuple: two arrays (tp, fp) whose elements are 0 and 1 """ num_dets = det_bboxes.shape[0] num_gts = gt_bboxes.shape[0] if area_ranges is None: area_ranges = [(None, None)] num_scales = len(area_ranges) # tp and fp are of shape (num_scales, num_gts), each row is tp or fp # of a certain scale. tp = np.zeros((num_scales, num_dets), dtype=np.float32) fp = np.zeros((num_scales, num_dets), dtype=np.float32) if gt_bboxes.shape[0] == 0: if area_ranges == [(None, None)]: fp[...] = 1 else: det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0] + 1) * ( det_bboxes[:, 3] - det_bboxes[:, 1] + 1) for i, (min_area, max_area) in enumerate(area_ranges): fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 return tp, fp ious = bbox_overlaps(det_bboxes, gt_bboxes - 1) gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1 gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1 iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)), default_iou_thr) # sort all detections by scores in descending order sort_inds = np.argsort(-det_bboxes[:, -1]) for k, (min_area, max_area) in enumerate(area_ranges): gt_covered = np.zeros(num_gts, dtype=bool) # if no area range is specified, gt_area_ignore is all False if min_area is None: gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool) else: gt_areas = gt_w * gt_h gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) for i in sort_inds: max_iou = -1 matched_gt = -1 # find best overlapped available gt for j in range(num_gts): # different from PASCAL VOC: allow finding other gts if the # best overlaped ones are already matched by other det bboxes if gt_covered[j]: continue elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou: max_iou = ious[i, j] matched_gt = j # there are 4 cases for a det bbox: # 1. it matches a gt, tp = 1, fp = 0 # 2. it matches an ignored gt, tp = 0, fp = 0 # 3. it matches no gt and within area range, tp = 0, fp = 1 # 4. it matches no gt but is beyond area range, tp = 0, fp = 0 if matched_gt >= 0: gt_covered[matched_gt] = 1 if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]): tp[k, i] = 1 elif min_area is None: fp[k, i] = 1 else: bbox = det_bboxes[i, :4] area = (bbox[2] - bbox[0] + 1) * (bbox[3] - bbox[1] + 1) if area >= min_area and area < max_area: fp[k, i] = 1 return tp, fp def tpfp_default(det_bboxes, gt_bboxes, gt_ignore, iou_thr, area_ranges=None): """Check if detected bboxes are true positive or false positive. Args: det_bbox (ndarray): the detected bbox gt_bboxes (ndarray): ground truth bboxes of this image gt_ignore (ndarray): indicate if gts are ignored for evaluation or not iou_thr (float): the iou thresholds Returns: tuple: (tp, fp), two arrays whose elements are 0 and 1 """ num_dets = det_bboxes.shape[0] num_gts = gt_bboxes.shape[0] if area_ranges is None: area_ranges = [(None, None)] num_scales = len(area_ranges) # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of # a certain scale tp = np.zeros((num_scales, num_dets), dtype=np.float32) fp = np.zeros((num_scales, num_dets), dtype=np.float32) # if there is no gt bboxes in this image, then all det bboxes # within area range are false positives if gt_bboxes.shape[0] == 0: if area_ranges == [(None, None)]: fp[...] = 1 else: det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0] + 1) * ( det_bboxes[:, 3] - det_bboxes[:, 1] + 1) for i, (min_area, max_area) in enumerate(area_ranges): fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 return tp, fp ious = bbox_overlaps(det_bboxes, gt_bboxes) ious_max = ious.max(axis=1) ious_argmax = ious.argmax(axis=1) sort_inds = np.argsort(-det_bboxes[:, -1]) for k, (min_area, max_area) in enumerate(area_ranges): gt_covered = np.zeros(num_gts, dtype=bool) # if no area range is specified, gt_area_ignore is all False if min_area is None: gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool) else: gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) * ( gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1) gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) for i in sort_inds: if ious_max[i] >= iou_thr: matched_gt = ious_argmax[i] if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]): if not gt_covered[matched_gt]: gt_covered[matched_gt] = True tp[k, i] = 1 else: fp[k, i] = 1 # otherwise ignore this detected bbox, tp = 0, fp = 0 elif min_area is None: fp[k, i] = 1 else: bbox = det_bboxes[i, :4] area = (bbox[2] - bbox[0] + 1) * (bbox[3] - bbox[1] + 1) if area >= min_area and area < max_area: fp[k, i] = 1 return tp, fp def get_cls_results(det_results, gt_bboxes, gt_labels, gt_ignore, class_id): """Get det results and gt information of a certain class.""" cls_dets = [det[class_id] for det in det_results] # det bboxes of this class cls_gts = [] # gt bboxes of this class cls_gt_ignore = [] for j in range(len(gt_bboxes)): gt_bbox = gt_bboxes[j] cls_inds = (gt_labels[j] == class_id + 1) cls_gt = gt_bbox[cls_inds, :] if gt_bbox.shape[0] > 0 else gt_bbox cls_gts.append(cls_gt) if gt_ignore is None: cls_gt_ignore.append(np.zeros(cls_gt.shape[0], dtype=np.int32)) else: cls_gt_ignore.append(gt_ignore[j][cls_inds]) return cls_dets, cls_gts, cls_gt_ignore def eval_map(det_results, gt_bboxes, gt_labels, gt_ignore=None, scale_ranges=None, iou_thr=0.5, dataset=None, print_summary=True): """Evaluate mAP of a dataset. Args: det_results (list): a list of list, [[cls1_det, cls2_det, ...], ...] gt_bboxes (list): ground truth bboxes of each image, a list of K*4 array. gt_labels (list): ground truth labels of each image, a list of K array gt_ignore (list): gt ignore indicators of each image, a list of K array scale_ranges (list, optional): [(min1, max1), (min2, max2), ...] iou_thr (float): IoU threshold dataset (None or str or list): dataset name or dataset classes, there are minor differences in metrics for different datsets, e.g. "voc07", "imagenet_det", etc. print_summary (bool): whether to print the mAP summary Returns: tuple: (mAP, [dict, dict, ...]) """ assert len(det_results) == len(gt_bboxes) == len(gt_labels) if gt_ignore is not None: assert len(gt_ignore) == len(gt_labels) for i in range(len(gt_ignore)): assert len(gt_labels[i]) == len(gt_ignore[i]) area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges] if scale_ranges is not None else None) num_scales = len(scale_ranges) if scale_ranges is not None else 1 eval_results = [] num_classes = len(det_results[0]) # positive class num gt_labels = [ label if label.ndim == 1 else label[:, 0] for label in gt_labels ] for i in range(num_classes): # get gt and det bboxes of this class cls_dets, cls_gts, cls_gt_ignore = get_cls_results( det_results, gt_bboxes, gt_labels, gt_ignore, i) # calculate tp and fp for each image tpfp_func = (tpfp_imagenet if dataset in ['det', 'vid'] else tpfp_default) tpfp = [ tpfp_func(cls_dets[j], cls_gts[j], cls_gt_ignore[j], iou_thr, area_ranges) for j in range(len(cls_dets)) ] tp, fp = tuple(zip(*tpfp)) # calculate gt number of each scale, gts ignored or beyond scale # are not counted num_gts = np.zeros(num_scales, dtype=int) for j, bbox in enumerate(cls_gts): if area_ranges is None: num_gts[0] += np.sum(np.logical_not(cls_gt_ignore[j])) else: gt_areas = (bbox[:, 2] - bbox[:, 0] + 1) * ( bbox[:, 3] - bbox[:, 1] + 1) for k, (min_area, max_area) in enumerate(area_ranges): num_gts[k] += np.sum( np.logical_not(cls_gt_ignore[j]) & (gt_areas >= min_area) & (gt_areas < max_area)) # sort all det bboxes by score, also sort tp and fp cls_dets = np.vstack(cls_dets) num_dets = cls_dets.shape[0] sort_inds = np.argsort(-cls_dets[:, -1]) tp = np.hstack(tp)[:, sort_inds] fp = np.hstack(fp)[:, sort_inds] # calculate recall and precision with tp and fp tp = np.cumsum(tp, axis=1) fp = np.cumsum(fp, axis=1) eps = np.finfo(np.float32).eps recalls = tp / np.maximum(num_gts[:, np.newaxis], eps) precisions = tp / np.maximum((tp + fp), eps) # calculate AP if scale_ranges is None: recalls = recalls[0, :] precisions = precisions[0, :] num_gts = num_gts.item() mode = 'area' if dataset != 'voc07' else '11points' ap = average_precision(recalls, precisions, mode) eval_results.append({ 'num_gts': num_gts, 'num_dets': num_dets, 'recall': recalls, 'precision': precisions, 'ap': ap }) if scale_ranges is not None: # shape (num_classes, num_scales) all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results]) all_num_gts = np.vstack( [cls_result['num_gts'] for cls_result in eval_results]) mean_ap = [ all_ap[all_num_gts[:, i] > 0, i].mean() if np.any(all_num_gts[:, i] > 0) else 0.0 for i in range(num_scales) ] else: aps = [] for cls_result in eval_results: if cls_result['num_gts'] > 0: aps.append(cls_result['ap']) mean_ap = np.array(aps).mean().item() if aps else 0.0 if print_summary: print_map_summary(mean_ap, eval_results, dataset) return mean_ap, eval_results def print_map_summary(mean_ap, results, dataset=None): """Print mAP and results of each class. Args: mean_ap(float): calculated from `eval_map` results(list): calculated from `eval_map` dataset(None or str or list): dataset name or dataset classes. """ num_scales = len(results[0]['ap']) if isinstance(results[0]['ap'], np.ndarray) else 1 num_classes = len(results) recalls = np.zeros((num_scales, num_classes), dtype=np.float32) precisions = np.zeros((num_scales, num_classes), dtype=np.float32) aps = np.zeros((num_scales, num_classes), dtype=np.float32) num_gts = np.zeros((num_scales, num_classes), dtype=int) for i, cls_result in enumerate(results): if cls_result['recall'].size > 0: recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] precisions[:, i] = np.array( cls_result['precision'], ndmin=2)[:, -1] aps[:, i] = cls_result['ap'] num_gts[:, i] = cls_result['num_gts'] if dataset is None: label_names = [str(i) for i in range(1, num_classes + 1)] elif mmcv.is_str(dataset): label_names = get_classes(dataset) else: label_names = dataset if not isinstance(mean_ap, list): mean_ap = [mean_ap] header = ['class', 'gts', 'dets', 'recall', 'precision', 'ap'] for i in range(num_scales): table_data = [header] for j in range(num_classes): row_data = [ label_names[j], num_gts[i, j], results[j]['num_dets'], '{:.3f}'.format(recalls[i, j]), '{:.3f}'.format( precisions[i, j]), '{:.3f}'.format(aps[i, j]) ] table_data.append(row_data) table_data.append(['mAP', '', '', '', '', '{:.3f}'.format(mean_ap[i])]) table = AsciiTable(table_data) table.inner_footing_row_border = True print(table.table) ================================================ FILE: mmdetection/mmdet/core/evaluation/recall.py ================================================ import numpy as np from terminaltables import AsciiTable from .bbox_overlaps import bbox_overlaps def _recalls(all_ious, proposal_nums, thrs): img_num = all_ious.shape[0] total_gt_num = sum([ious.shape[0] for ious in all_ious]) _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32) for k, proposal_num in enumerate(proposal_nums): tmp_ious = np.zeros(0) for i in range(img_num): ious = all_ious[i][:, :proposal_num].copy() gt_ious = np.zeros((ious.shape[0])) if ious.size == 0: tmp_ious = np.hstack((tmp_ious, gt_ious)) continue for j in range(ious.shape[0]): gt_max_overlaps = ious.argmax(axis=1) max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps] gt_idx = max_ious.argmax() gt_ious[j] = max_ious[gt_idx] box_idx = gt_max_overlaps[gt_idx] ious[gt_idx, :] = -1 ious[:, box_idx] = -1 tmp_ious = np.hstack((tmp_ious, gt_ious)) _ious[k, :] = tmp_ious _ious = np.fliplr(np.sort(_ious, axis=1)) recalls = np.zeros((proposal_nums.size, thrs.size)) for i, thr in enumerate(thrs): recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num) return recalls def set_recall_param(proposal_nums, iou_thrs): """Check proposal_nums and iou_thrs and set correct format. """ if isinstance(proposal_nums, list): _proposal_nums = np.array(proposal_nums) elif isinstance(proposal_nums, int): _proposal_nums = np.array([proposal_nums]) else: _proposal_nums = proposal_nums if iou_thrs is None: _iou_thrs = np.array([0.5]) elif isinstance(iou_thrs, list): _iou_thrs = np.array(iou_thrs) elif isinstance(iou_thrs, float): _iou_thrs = np.array([iou_thrs]) else: _iou_thrs = iou_thrs return _proposal_nums, _iou_thrs def eval_recalls(gts, proposals, proposal_nums=None, iou_thrs=None, print_summary=True): """Calculate recalls. Args: gts(list or ndarray): a list of arrays of shape (n, 4) proposals(list or ndarray): a list of arrays of shape (k, 4) or (k, 5) proposal_nums(int or list of int or ndarray): top N proposals thrs(float or list or ndarray): iou thresholds Returns: ndarray: recalls of different ious and proposal nums """ img_num = len(gts) assert img_num == len(proposals) proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs) all_ious = [] for i in range(img_num): if proposals[i].ndim == 2 and proposals[i].shape[1] == 5: scores = proposals[i][:, 4] sort_idx = np.argsort(scores)[::-1] img_proposal = proposals[i][sort_idx, :] else: img_proposal = proposals[i] prop_num = min(img_proposal.shape[0], proposal_nums[-1]) if gts[i] is None or gts[i].shape[0] == 0: ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32) else: ious = bbox_overlaps(gts[i], img_proposal[:prop_num, :4]) all_ious.append(ious) all_ious = np.array(all_ious) recalls = _recalls(all_ious, proposal_nums, iou_thrs) if print_summary: print_recall_summary(recalls, proposal_nums, iou_thrs) return recalls def print_recall_summary(recalls, proposal_nums, iou_thrs, row_idxs=None, col_idxs=None): """Print recalls in a table. Args: recalls(ndarray): calculated from `bbox_recalls` proposal_nums(ndarray or list): top N proposals iou_thrs(ndarray or list): iou thresholds row_idxs(ndarray): which rows(proposal nums) to print col_idxs(ndarray): which cols(iou thresholds) to print """ proposal_nums = np.array(proposal_nums, dtype=np.int32) iou_thrs = np.array(iou_thrs) if row_idxs is None: row_idxs = np.arange(proposal_nums.size) if col_idxs is None: col_idxs = np.arange(iou_thrs.size) row_header = [''] + iou_thrs[col_idxs].tolist() table_data = [row_header] for i, num in enumerate(proposal_nums[row_idxs]): row = [ '{:.3f}'.format(val) for val in recalls[row_idxs[i], col_idxs].tolist() ] row.insert(0, num) table_data.append(row) table = AsciiTable(table_data) print(table.table) def plot_num_recall(recalls, proposal_nums): """Plot Proposal_num-Recalls curve. Args: recalls(ndarray or list): shape (k,) proposal_nums(ndarray or list): same shape as `recalls` """ if isinstance(proposal_nums, np.ndarray): _proposal_nums = proposal_nums.tolist() else: _proposal_nums = proposal_nums if isinstance(recalls, np.ndarray): _recalls = recalls.tolist() else: _recalls = recalls import matplotlib.pyplot as plt f = plt.figure() plt.plot([0] + _proposal_nums, [0] + _recalls) plt.xlabel('Proposal num') plt.ylabel('Recall') plt.axis([0, proposal_nums.max(), 0, 1]) f.show() def plot_iou_recall(recalls, iou_thrs): """Plot IoU-Recalls curve. Args: recalls(ndarray or list): shape (k,) iou_thrs(ndarray or list): same shape as `recalls` """ if isinstance(iou_thrs, np.ndarray): _iou_thrs = iou_thrs.tolist() else: _iou_thrs = iou_thrs if isinstance(recalls, np.ndarray): _recalls = recalls.tolist() else: _recalls = recalls import matplotlib.pyplot as plt f = plt.figure() plt.plot(_iou_thrs + [1.0], _recalls + [0.]) plt.xlabel('IoU') plt.ylabel('Recall') plt.axis([iou_thrs.min(), 1, 0, 1]) f.show() ================================================ FILE: mmdetection/mmdet/core/loss/__init__.py ================================================ from .losses import ( weighted_nll_loss, weighted_cross_entropy, weighted_binary_cross_entropy, sigmoid_focal_loss, py_sigmoid_focal_loss, weighted_sigmoid_focal_loss, mask_cross_entropy, smooth_l1_loss, weighted_smoothl1, accuracy, iou_loss) __all__ = [ 'weighted_nll_loss', 'weighted_cross_entropy', 'weighted_binary_cross_entropy', 'sigmoid_focal_loss', 'py_sigmoid_focal_loss', 'weighted_sigmoid_focal_loss', 'mask_cross_entropy', 'smooth_l1_loss', 'weighted_smoothl1', 'accuracy', 'iou_loss' ] ================================================ FILE: mmdetection/mmdet/core/loss/losses.py ================================================ # TODO merge naive and weighted loss. import torch import torch.nn.functional as F from ..bbox import bbox_overlaps from ...ops import sigmoid_focal_loss def weighted_nll_loss(pred, label, weight, avg_factor=None): if avg_factor is None: avg_factor = max(torch.sum(weight > 0).float().item(), 1.) raw = F.nll_loss(pred, label, reduction='none') return torch.sum(raw * weight)[None] / avg_factor def weighted_cross_entropy(pred, label, weight, avg_factor=None, reduce=True): if avg_factor is None: avg_factor = max(torch.sum(weight > 0).float().item(), 1.) raw = F.cross_entropy(pred, label, reduction='none') if reduce: return torch.sum(raw * weight)[None] / avg_factor else: return raw * weight / avg_factor def weighted_binary_cross_entropy(pred, label, weight, avg_factor=None): if pred.dim() != label.dim(): label, weight = _expand_binary_labels(label, weight, pred.size(-1)) if avg_factor is None: avg_factor = max(torch.sum(weight > 0).float().item(), 1.) return F.binary_cross_entropy_with_logits( pred, label.float(), weight.float(), reduction='sum')[None] / avg_factor def py_sigmoid_focal_loss(pred, target, weight, gamma=2.0, alpha=0.25, reduction='mean'): pred_sigmoid = pred.sigmoid() target = target.type_as(pred) pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) weight = (alpha * target + (1 - alpha) * (1 - target)) * weight weight = weight * pt.pow(gamma) loss = F.binary_cross_entropy_with_logits( pred, target, reduction='none') * weight reduction_enum = F._Reduction.get_enum(reduction) # none: 0, mean:1, sum: 2 if reduction_enum == 0: return loss elif reduction_enum == 1: return loss.mean() elif reduction_enum == 2: return loss.sum() def weighted_sigmoid_focal_loss(pred, target, weight, gamma=2.0, alpha=0.25, avg_factor=None, num_classes=80): if avg_factor is None: avg_factor = torch.sum(weight > 0).float().item() / num_classes + 1e-6 return torch.sum( sigmoid_focal_loss(pred, target, gamma, alpha, 'none') * weight.view( -1, 1))[None] / avg_factor def mask_cross_entropy(pred, target, label): num_rois = pred.size()[0] inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device) pred_slice = pred[inds, label].squeeze(1) return F.binary_cross_entropy_with_logits( pred_slice, target, reduction='mean')[None] def smooth_l1_loss(pred, target, beta=1.0, reduction='mean'): assert beta > 0 assert pred.size() == target.size() and target.numel() > 0 diff = torch.abs(pred - target) loss = torch.where(diff < beta, 0.5 * diff * diff / beta, diff - 0.5 * beta) reduction_enum = F._Reduction.get_enum(reduction) # none: 0, mean:1, sum: 2 if reduction_enum == 0: return loss elif reduction_enum == 1: return loss.sum() / pred.numel() elif reduction_enum == 2: return loss.sum() def weighted_smoothl1(pred, target, weight, beta=1.0, avg_factor=None): if avg_factor is None: avg_factor = torch.sum(weight > 0).float().item() / 4 + 1e-6 loss = smooth_l1_loss(pred, target, beta, reduction='none') return torch.sum(loss * weight)[None] / avg_factor def accuracy(pred, target, topk=1): if isinstance(topk, int): topk = (topk, ) return_single = True else: return_single = False maxk = max(topk) _, pred_label = pred.topk(maxk, 1, True, True) pred_label = pred_label.t() correct = pred_label.eq(target.view(1, -1).expand_as(pred_label)) res = [] for k in topk: correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) res.append(correct_k.mul_(100.0 / pred.size(0))) return res[0] if return_single else res def _expand_binary_labels(labels, label_weights, label_channels): bin_labels = labels.new_full((labels.size(0), label_channels), 0) inds = torch.nonzero(labels >= 1).squeeze() if inds.numel() > 0: bin_labels[inds, labels[inds] - 1] = 1 bin_label_weights = label_weights.view(-1, 1).expand( label_weights.size(0), label_channels) return bin_labels, bin_label_weights def iou_loss(pred_bboxes, target_bboxes, reduction='mean'): ious = bbox_overlaps(pred_bboxes, target_bboxes, is_aligned=True) loss = -ious.log() reduction_enum = F._Reduction.get_enum(reduction) if reduction_enum == 0: return loss elif reduction_enum == 1: return loss.mean() elif reduction_enum == 2: return loss.sum() ================================================ FILE: mmdetection/mmdet/core/mask/__init__.py ================================================ from .utils import split_combined_polys from .mask_target import mask_target __all__ = ['split_combined_polys', 'mask_target'] ================================================ FILE: mmdetection/mmdet/core/mask/mask_target.py ================================================ import torch import numpy as np import mmcv def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list, cfg): cfg_list = [cfg for _ in range(len(pos_proposals_list))] mask_targets = map(mask_target_single, pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list, cfg_list) mask_targets = torch.cat(list(mask_targets)) return mask_targets def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg): mask_size = cfg.mask_size num_pos = pos_proposals.size(0) mask_targets = [] if num_pos > 0: proposals_np = pos_proposals.cpu().numpy() pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy() for i in range(num_pos): gt_mask = gt_masks[pos_assigned_gt_inds[i]] bbox = proposals_np[i, :].astype(np.int32) x1, y1, x2, y2 = bbox w = np.maximum(x2 - x1 + 1, 1) h = np.maximum(y2 - y1 + 1, 1) # mask is uint8 both before and after resizing target = mmcv.imresize(gt_mask[y1:y1 + h, x1:x1 + w], (mask_size, mask_size)) mask_targets.append(target) mask_targets = torch.from_numpy(np.stack(mask_targets)).float().to( pos_proposals.device) else: mask_targets = pos_proposals.new_zeros((0, mask_size, mask_size)) return mask_targets ================================================ FILE: mmdetection/mmdet/core/mask/utils.py ================================================ import mmcv def split_combined_polys(polys, poly_lens, polys_per_mask): """Split the combined 1-D polys into masks. A mask is represented as a list of polys, and a poly is represented as a 1-D array. In dataset, all masks are concatenated into a single 1-D tensor. Here we need to split the tensor into original representations. Args: polys (list): a list (length = image num) of 1-D tensors poly_lens (list): a list (length = image num) of poly length polys_per_mask (list): a list (length = image num) of poly number of each mask Returns: list: a list (length = image num) of list (length = mask num) of list (length = poly num) of numpy array """ mask_polys_list = [] for img_id in range(len(polys)): polys_single = polys[img_id] polys_lens_single = poly_lens[img_id].tolist() polys_per_mask_single = polys_per_mask[img_id].tolist() split_polys = mmcv.slice_list(polys_single, polys_lens_single) mask_polys = mmcv.slice_list(split_polys, polys_per_mask_single) mask_polys_list.append(mask_polys) return mask_polys_list ================================================ FILE: mmdetection/mmdet/core/post_processing/__init__.py ================================================ from .bbox_nms import multiclass_nms from .merge_augs import (merge_aug_proposals, merge_aug_bboxes, merge_aug_scores, merge_aug_masks) __all__ = [ 'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes', 'merge_aug_scores', 'merge_aug_masks' ] ================================================ FILE: mmdetection/mmdet/core/post_processing/bbox_nms.py ================================================ import torch from mmdet.ops.nms import nms_wrapper def multiclass_nms(multi_bboxes, multi_scores, score_thr, nms_cfg, max_num=-1, score_factors=None): """NMS for multi-class bboxes. Args: multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) multi_scores (Tensor): shape (n, #class) score_thr (float): bbox threshold, bboxes with scores lower than it will not be considered. nms_thr (float): NMS IoU threshold max_num (int): if there are more than max_num bboxes after NMS, only top max_num will be kept. score_factors (Tensor): The factors multiplied to scores before applying NMS Returns: tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels are 0-based. """ num_classes = multi_scores.shape[1] bboxes, labels = [], [] nms_cfg_ = nms_cfg.copy() nms_type = nms_cfg_.pop('type', 'nms') nms_op = getattr(nms_wrapper, nms_type) for i in range(1, num_classes): cls_inds = multi_scores[:, i] > score_thr if not cls_inds.any(): continue # get bboxes and scores of this class if multi_bboxes.shape[1] == 4: _bboxes = multi_bboxes[cls_inds, :] else: _bboxes = multi_bboxes[cls_inds, i * 4:(i + 1) * 4] _scores = multi_scores[cls_inds, i] if score_factors is not None: _scores *= score_factors[cls_inds] cls_dets = torch.cat([_bboxes, _scores[:, None]], dim=1) cls_dets, _ = nms_op(cls_dets, **nms_cfg_) cls_labels = multi_bboxes.new_full((cls_dets.shape[0], ), i - 1, dtype=torch.long) bboxes.append(cls_dets) labels.append(cls_labels) if bboxes: bboxes = torch.cat(bboxes) labels = torch.cat(labels) if bboxes.shape[0] > max_num: _, inds = bboxes[:, -1].sort(descending=True) inds = inds[:max_num] bboxes = bboxes[inds] labels = labels[inds] else: bboxes = multi_bboxes.new_zeros((0, 5)) labels = multi_bboxes.new_zeros((0, ), dtype=torch.long) return bboxes, labels ================================================ FILE: mmdetection/mmdet/core/post_processing/merge_augs.py ================================================ import torch import numpy as np from mmdet.ops import nms from ..bbox import bbox_mapping_back def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg): """Merge augmented proposals (multiscale, flip, etc.) Args: aug_proposals (list[Tensor]): proposals from different testing schemes, shape (n, 5). Note that they are not rescaled to the original image size. img_metas (list[dict]): image info including "shape_scale" and "flip". rpn_test_cfg (dict): rpn test config. Returns: Tensor: shape (n, 4), proposals corresponding to original image scale. """ recovered_proposals = [] for proposals, img_info in zip(aug_proposals, img_metas): img_shape = img_info['img_shape'] scale_factor = img_info['scale_factor'] flip = img_info['flip'] _proposals = proposals.clone() _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape, scale_factor, flip) recovered_proposals.append(_proposals) aug_proposals = torch.cat(recovered_proposals, dim=0) merged_proposals, _ = nms(aug_proposals, rpn_test_cfg.nms_thr) scores = merged_proposals[:, 4] _, order = scores.sort(0, descending=True) num = min(rpn_test_cfg.max_num, merged_proposals.shape[0]) order = order[:num] merged_proposals = merged_proposals[order, :] return merged_proposals def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg): """Merge augmented detection bboxes and scores. Args: aug_bboxes (list[Tensor]): shape (n, 4*#class) aug_scores (list[Tensor] or None): shape (n, #class) img_shapes (list[Tensor]): shape (3, ). rcnn_test_cfg (dict): rcnn test config. Returns: tuple: (bboxes, scores) """ recovered_bboxes = [] for bboxes, img_info in zip(aug_bboxes, img_metas): img_shape = img_info[0]['img_shape'] scale_factor = img_info[0]['scale_factor'] flip = img_info[0]['flip'] bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip) recovered_bboxes.append(bboxes) bboxes = torch.stack(recovered_bboxes).mean(dim=0) if aug_scores is None: return bboxes else: scores = torch.stack(aug_scores).mean(dim=0) return bboxes, scores def merge_aug_scores(aug_scores): """Merge augmented bbox scores.""" if isinstance(aug_scores[0], torch.Tensor): return torch.mean(torch.stack(aug_scores), dim=0) else: return np.mean(aug_scores, axis=0) def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None): """Merge augmented mask prediction. Args: aug_masks (list[ndarray]): shape (n, #class, h, w) img_shapes (list[ndarray]): shape (3, ). rcnn_test_cfg (dict): rcnn test config. Returns: tuple: (bboxes, scores) """ recovered_masks = [ mask if not img_info[0]['flip'] else mask[..., ::-1] for mask, img_info in zip(aug_masks, img_metas) ] if weights is None: merged_masks = np.mean(recovered_masks, axis=0) else: merged_masks = np.average( np.array(recovered_masks), axis=0, weights=np.array(weights)) return merged_masks ================================================ FILE: mmdetection/mmdet/core/utils/__init__.py ================================================ from .dist_utils import allreduce_grads, DistOptimizerHook from .misc import tensor2imgs, unmap, multi_apply __all__ = [ 'allreduce_grads', 'DistOptimizerHook', 'tensor2imgs', 'unmap', 'multi_apply' ] ================================================ FILE: mmdetection/mmdet/core/utils/dist_utils.py ================================================ from collections import OrderedDict import torch.distributed as dist from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors, _take_tensors) from mmcv.runner import OptimizerHook def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): if bucket_size_mb > 0: bucket_size_bytes = bucket_size_mb * 1024 * 1024 buckets = _take_tensors(tensors, bucket_size_bytes) else: buckets = OrderedDict() for tensor in tensors: tp = tensor.type() if tp not in buckets: buckets[tp] = [] buckets[tp].append(tensor) buckets = buckets.values() for bucket in buckets: flat_tensors = _flatten_dense_tensors(bucket) dist.all_reduce(flat_tensors) flat_tensors.div_(world_size) for tensor, synced in zip( bucket, _unflatten_dense_tensors(flat_tensors, bucket)): tensor.copy_(synced) def allreduce_grads(model, coalesce=True, bucket_size_mb=-1): grads = [ param.grad.data for param in model.parameters() if param.requires_grad and param.grad is not None ] world_size = dist.get_world_size() if coalesce: _allreduce_coalesced(grads, world_size, bucket_size_mb) else: for tensor in grads: dist.all_reduce(tensor.div_(world_size)) class DistOptimizerHook(OptimizerHook): def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1): self.grad_clip = grad_clip self.coalesce = coalesce self.bucket_size_mb = bucket_size_mb def after_train_iter(self, runner): runner.optimizer.zero_grad() runner.outputs['loss'].backward() allreduce_grads(runner.model, self.coalesce, self.bucket_size_mb) if self.grad_clip is not None: self.clip_grads(runner.model.parameters()) runner.optimizer.step() ================================================ FILE: mmdetection/mmdet/core/utils/misc.py ================================================ from functools import partial import mmcv import numpy as np from six.moves import map, zip def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): num_imgs = tensor.size(0) mean = np.array(mean, dtype=np.float32) std = np.array(std, dtype=np.float32) imgs = [] for img_id in range(num_imgs): img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0) img = mmcv.imdenormalize( img, mean, std, to_bgr=to_rgb).astype(np.uint8) imgs.append(np.ascontiguousarray(img)) return imgs def multi_apply(func, *args, **kwargs): pfunc = partial(func, **kwargs) if kwargs else func map_results = map(pfunc, *args) return tuple(map(list, zip(*map_results))) def unmap(data, count, inds, fill=0): """ Unmap a subset of item (data) back to the original set of items (of size count) """ if data.dim() == 1: ret = data.new_full((count, ), fill) ret[inds] = data else: new_size = (count, ) + data.size()[1:] ret = data.new_full(new_size, fill) ret[inds, :] = data return ret ================================================ FILE: mmdetection/mmdet/datasets/__init__.py ================================================ from .custom import CustomDataset from .xml_style import XMLDataset from .coco import CocoDataset from .voc import VOCDataset from .loader import GroupSampler, DistributedGroupSampler, build_dataloader from .utils import to_tensor, random_scale, show_ann, get_dataset from .concat_dataset import ConcatDataset from .repeat_dataset import RepeatDataset from .extra_aug import ExtraAugmentation __all__ = [ 'CustomDataset', 'XMLDataset', 'CocoDataset', 'VOCDataset', 'GroupSampler', 'DistributedGroupSampler', 'build_dataloader', 'to_tensor', 'random_scale', 'show_ann', 'get_dataset', 'ConcatDataset', 'RepeatDataset', 'ExtraAugmentation' ] ================================================ FILE: mmdetection/mmdet/datasets/coco.py ================================================ import numpy as np from pycocotools.coco import COCO from .custom import CustomDataset class CocoDataset(CustomDataset): CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign', 'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard', 'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush') def load_annotations(self, ann_file): self.coco = COCO(ann_file) self.cat_ids = self.coco.getCatIds() self.cat2label = { cat_id: i + 1 for i, cat_id in enumerate(self.cat_ids) } self.img_ids = self.coco.getImgIds() img_infos = [] for i in self.img_ids: info = self.coco.loadImgs([i])[0] info['filename'] = info['file_name'] img_infos.append(info) return img_infos def get_ann_info(self, idx): img_id = self.img_infos[idx]['id'] ann_ids = self.coco.getAnnIds(imgIds=[img_id]) ann_info = self.coco.loadAnns(ann_ids) return self._parse_ann_info(ann_info, self.with_mask) def _filter_imgs(self, min_size=32): """Filter images too small or without ground truths.""" valid_inds = [] ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values()) for i, img_info in enumerate(self.img_infos): if self.img_ids[i] not in ids_with_ann: continue if min(img_info['width'], img_info['height']) >= min_size: valid_inds.append(i) return valid_inds def _parse_ann_info(self, ann_info, with_mask=True): """Parse bbox and mask annotation. Args: ann_info (list[dict]): Annotation info of an image. with_mask (bool): Whether to parse mask annotations. Returns: dict: A dict containing the following keys: bboxes, bboxes_ignore, labels, masks, mask_polys, poly_lens. """ gt_bboxes = [] gt_labels = [] gt_bboxes_ignore = [] # Two formats are provided. # 1. mask: a binary map of the same size of the image. # 2. polys: each mask consists of one or several polys, each poly is a # list of float. if with_mask: gt_masks = [] gt_mask_polys = [] gt_poly_lens = [] for i, ann in enumerate(ann_info): if ann.get('ignore', False): continue x1, y1, w, h = ann['bbox'] if ann['area'] <= 0 or w < 1 or h < 1: continue bbox = [x1, y1, x1 + w - 1, y1 + h - 1] if ann['iscrowd']: gt_bboxes_ignore.append(bbox) else: gt_bboxes.append(bbox) gt_labels.append(self.cat2label[ann['category_id']]) if with_mask: gt_masks.append(self.coco.annToMask(ann)) mask_polys = [ p for p in ann['segmentation'] if len(p) >= 6 ] # valid polygons have >= 3 points (6 coordinates) poly_lens = [len(p) for p in mask_polys] gt_mask_polys.append(mask_polys) gt_poly_lens.extend(poly_lens) if gt_bboxes: gt_bboxes = np.array(gt_bboxes, dtype=np.float32) gt_labels = np.array(gt_labels, dtype=np.int64) else: gt_bboxes = np.zeros((0, 4), dtype=np.float32) gt_labels = np.array([], dtype=np.int64) if gt_bboxes_ignore: gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) else: gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) ann = dict( bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore) if with_mask: ann['masks'] = gt_masks # poly format is not used in the current implementation ann['mask_polys'] = gt_mask_polys ann['poly_lens'] = gt_poly_lens return ann ================================================ FILE: mmdetection/mmdet/datasets/concat_dataset.py ================================================ import numpy as np from torch.utils.data.dataset import ConcatDataset as _ConcatDataset class ConcatDataset(_ConcatDataset): """A wrapper of concatenated dataset. Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but concat the group flag for image aspect ratio. Args: datasets (list[:obj:`Dataset`]): A list of datasets. """ def __init__(self, datasets): super(ConcatDataset, self).__init__(datasets) self.CLASSES = datasets[0].CLASSES if hasattr(datasets[0], 'flag'): flags = [] for i in range(0, len(datasets)): flags.append(datasets[i].flag) self.flag = np.concatenate(flags) ================================================ FILE: mmdetection/mmdet/datasets/custom.py ================================================ import os.path as osp import mmcv import numpy as np from mmcv.parallel import DataContainer as DC from torch.utils.data import Dataset from pycocotools.mask import decode from .transforms import (ImageTransform, BboxTransform, MaskTransform, SegMapTransform, Numpy2Tensor) from .utils import to_tensor, random_scale from .extra_aug import ExtraAugmentation class CustomDataset(Dataset): """Custom dataset for detection. Annotation format: [ { 'filename': 'a.jpg', 'width': 1280, 'height': 720, 'ann': { 'bboxes': (n, 4), 'labels': (n, ), 'bboxes_ignore': (k, 4), 'labels_ignore': (k, 4) (optional field) } }, ... ] The `ann` field is optional for testing. """ CLASSES = None def __init__(self, ann_file, img_prefix, img_scale, img_norm_cfg, multiscale_mode='value', size_divisor=None, proposal_file=None, num_max_proposals=1000, flip_ratio=0, with_mask=True, with_crowd=True, with_label=True, with_semantic_seg=False, seg_prefix=None, seg_scale_factor=1, extra_aug=None, resize_keep_ratio=True, test_mode=False): # prefix of images path self.img_prefix = img_prefix # load annotations (and proposals) self.img_infos = self.load_annotations(ann_file) if proposal_file is not None: self.proposals = self.load_proposals(proposal_file) else: self.proposals = None # filter images with no annotation during training if not test_mode: valid_inds = self._filter_imgs() self.img_infos = [self.img_infos[i] for i in valid_inds] if self.proposals is not None: self.proposals = [self.proposals[i] for i in valid_inds] # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...] self.img_scales = img_scale if isinstance(img_scale, list) else [img_scale] assert mmcv.is_list_of(self.img_scales, tuple) # normalization configs self.img_norm_cfg = img_norm_cfg # multi-scale mode (only applicable for multi-scale training) self.multiscale_mode = multiscale_mode assert multiscale_mode in ['value', 'range'] # max proposals per image self.num_max_proposals = num_max_proposals # flip ratio self.flip_ratio = flip_ratio assert flip_ratio >= 0 and flip_ratio <= 1 # padding border to ensure the image size can be divided by # size_divisor (used for FPN) self.size_divisor = size_divisor # with mask or not (reserved field, takes no effect) self.with_mask = with_mask # some datasets provide bbox annotations as ignore/crowd/difficult, # if `with_crowd` is True, then these info is returned. self.with_crowd = with_crowd # with label is False for RPN self.with_label = with_label # with semantic segmentation (stuff) annotation or not self.with_seg = with_semantic_seg # prefix of semantic segmentation map path self.seg_prefix = seg_prefix # rescale factor for segmentation maps self.seg_scale_factor = seg_scale_factor # in test mode or not self.test_mode = test_mode # set group flag for the sampler if not self.test_mode: self._set_group_flag() # transforms self.img_transform = ImageTransform( size_divisor=self.size_divisor, **self.img_norm_cfg) self.bbox_transform = BboxTransform() self.mask_transform = MaskTransform() self.seg_transform = SegMapTransform(self.size_divisor) self.numpy2tensor = Numpy2Tensor() # if use extra augmentation if extra_aug is not None: self.extra_aug = ExtraAugmentation(**extra_aug) else: self.extra_aug = None # image rescale if keep ratio self.resize_keep_ratio = resize_keep_ratio def __len__(self): return len(self.img_infos) def load_annotations(self, ann_file): return mmcv.load(ann_file) def load_proposals(self, proposal_file): return mmcv.load(proposal_file) def get_ann_info(self, idx): return self.img_infos[idx]['ann'] def _filter_imgs(self, min_size=32): """Filter images too small.""" valid_inds = [] for i, img_info in enumerate(self.img_infos): if min(img_info['width'], img_info['height']) >= min_size: valid_inds.append(i) return valid_inds def _set_group_flag(self): """Set flag according to image aspect ratio. Images with aspect ratio greater than 1 will be set as group 1, otherwise group 0. """ self.flag = np.zeros(len(self), dtype=np.uint8) for i in range(len(self)): img_info = self.img_infos[i] if img_info['width'] / img_info['height'] > 1: self.flag[i] = 1 def _rand_another(self, idx): pool = np.where(self.flag == self.flag[idx])[0] return np.random.choice(pool) def __getitem__(self, idx): if self.test_mode: return self.prepare_test_img(idx) while True: data = self.prepare_train_img(idx) if data is None: idx = self._rand_another(idx) continue return data def prepare_train_img(self, idx): img_info = self.img_infos[idx] # load image img = mmcv.imread(osp.join(self.img_prefix, img_info['filename'])) # load proposals if necessary if self.proposals is not None: proposals = self.proposals[idx][:self.num_max_proposals] # TODO: Handle empty proposals properly. Currently images with # no proposals are just ignored, but they can be used for # training in concept. if len(proposals) == 0: return None if not (proposals.shape[1] == 4 or proposals.shape[1] == 5): raise AssertionError( 'proposals should have shapes (n, 4) or (n, 5), ' 'but found {}'.format(proposals.shape)) if proposals.shape[1] == 5: scores = proposals[:, 4, None] proposals = proposals[:, :4] else: scores = None ann = self.get_ann_info(idx) gt_bboxes = ann['bboxes'] gt_labels = ann['labels'] if self.with_crowd: gt_bboxes_ignore = ann['bboxes_ignore'] if self.with_mask: gt_masks = decode(ann['masks']) gt_masks = [gt_masks[..., i] for i in range(gt_masks.shape[-1])] # extra augmentation if self.extra_aug is not None: img = self.extra_aug(img) # skip the image if there is no valid gt bbox if len(gt_bboxes) == 0: return None # apply transforms flip = True if np.random.rand() < self.flip_ratio else False # randomly sample a scale img_scale = random_scale(self.img_scales, self.multiscale_mode) img, img_shape, pad_shape, scale_factor = self.img_transform( img, img_scale, flip, keep_ratio=self.resize_keep_ratio) img = img.copy() if self.with_seg: gt_seg = mmcv.imread( osp.join(self.seg_prefix, img_info['file_name'].replace( 'jpg', 'png')), flag='unchanged') gt_seg = self.seg_transform(gt_seg.squeeze(), img_scale, flip) gt_seg = mmcv.imrescale( gt_seg, self.seg_scale_factor, interpolation='nearest') gt_seg = gt_seg[None, ...] if self.proposals is not None: proposals = self.bbox_transform(proposals, img_shape, scale_factor, flip) proposals = np.hstack( [proposals, scores]) if scores is not None else proposals gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor, flip) if self.with_crowd: gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape, scale_factor, flip) if self.with_mask: gt_masks = self.mask_transform(gt_masks, pad_shape, scale_factor, flip) ori_shape = (img_info['height'], img_info['width'], 3) img_meta = dict( ori_shape=ori_shape, img_shape=img_shape, pad_shape=pad_shape, scale_factor=scale_factor, flip=flip) data = dict( img=DC(to_tensor(img), stack=True), img_meta=DC(img_meta, cpu_only=True), gt_bboxes=DC(to_tensor(gt_bboxes))) if self.proposals is not None: data['proposals'] = DC(to_tensor(proposals)) if self.with_label: data['gt_labels'] = DC(to_tensor(gt_labels)) if self.with_crowd: data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore)) if self.with_mask: data['gt_masks'] = DC(gt_masks, cpu_only=True) if self.with_seg: data['gt_semantic_seg'] = DC(to_tensor(gt_seg), stack=True) return data def prepare_test_img(self, idx): """Prepare an image for testing (multi-scale and flipping)""" img_info = self.img_infos[idx] img = mmcv.imread(osp.join(self.img_prefix, img_info['filename'])) if self.proposals is not None: proposal = self.proposals[idx][:self.num_max_proposals] if not (proposal.shape[1] == 4 or proposal.shape[1] == 5): raise AssertionError( 'proposals should have shapes (n, 4) or (n, 5), ' 'but found {}'.format(proposal.shape)) else: proposal = None def prepare_single(img, scale, flip, proposal=None): _img, img_shape, pad_shape, scale_factor = self.img_transform( img, scale, flip, keep_ratio=self.resize_keep_ratio) _img = to_tensor(_img) _img_meta = dict( ori_shape=(img_info['height'], img_info['width'], 3), img_shape=img_shape, pad_shape=pad_shape, scale_factor=scale_factor, flip=flip) if proposal is not None: if proposal.shape[1] == 5: score = proposal[:, 4, None] proposal = proposal[:, :4] else: score = None _proposal = self.bbox_transform(proposal, img_shape, scale_factor, flip) _proposal = np.hstack( [_proposal, score]) if score is not None else _proposal _proposal = to_tensor(_proposal) else: _proposal = None return _img, _img_meta, _proposal imgs = [] img_metas = [] proposals = [] for scale in self.img_scales: _img, _img_meta, _proposal = prepare_single( img, scale, False, proposal) imgs.append(_img) img_metas.append(DC(_img_meta, cpu_only=True)) proposals.append(_proposal) if self.flip_ratio > 0: _img, _img_meta, _proposal = prepare_single( img, scale, True, proposal) imgs.append(_img) img_metas.append(DC(_img_meta, cpu_only=True)) proposals.append(_proposal) data = dict(img=imgs, img_meta=img_metas) if self.proposals is not None: data['proposals'] = proposals return data ================================================ FILE: mmdetection/mmdet/datasets/extra_aug.py ================================================ import numpy as np import albumentations as A from mmcv.runner import obj_from_dict from . import transforms class ExtraAugmentation(object): def __init__(self, **kwargs): self.transform = self.transform_from_dict(**kwargs) def transform_from_dict(self, **kwargs): if 'transforms' in kwargs: kwargs['transforms'] = [self.transform_from_dict(**transform) for transform in kwargs['transforms']] try: return obj_from_dict(kwargs, transforms) except AttributeError: return obj_from_dict(kwargs, A) def __call__(self, img): data = self.transform( image=img, ) return data['image'] ================================================ FILE: mmdetection/mmdet/datasets/loader/__init__.py ================================================ from .build_loader import build_dataloader from .sampler import GroupSampler, DistributedGroupSampler __all__ = [ 'GroupSampler', 'DistributedGroupSampler', 'build_dataloader' ] ================================================ FILE: mmdetection/mmdet/datasets/loader/build_loader.py ================================================ from functools import partial from mmcv.runner import get_dist_info from mmcv.parallel import collate from torch.utils.data import DataLoader from .sampler import GroupSampler, DistributedGroupSampler, DistributedSampler # https://github.com/pytorch/pytorch/issues/973 import resource rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) def build_dataloader(dataset, imgs_per_gpu, workers_per_gpu, num_gpus=1, dist=True, **kwargs): shuffle = kwargs.get('shuffle', True) if dist: rank, world_size = get_dist_info() if shuffle: sampler = DistributedGroupSampler(dataset, imgs_per_gpu, world_size, rank) else: sampler = DistributedSampler(dataset, world_size, rank, shuffle=False) batch_size = imgs_per_gpu num_workers = workers_per_gpu else: sampler = GroupSampler(dataset, imgs_per_gpu) if shuffle else None batch_size = num_gpus * imgs_per_gpu num_workers = num_gpus * workers_per_gpu data_loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler, num_workers=num_workers, collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu), pin_memory=False, **kwargs) return data_loader ================================================ FILE: mmdetection/mmdet/datasets/loader/sampler.py ================================================ from __future__ import division import math import torch import numpy as np from torch.distributed import get_world_size, get_rank from torch.utils.data import Sampler from torch.utils.data import DistributedSampler as _DistributedSampler class DistributedSampler(_DistributedSampler): def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): super().__init__(dataset, num_replicas=num_replicas, rank=rank) self.shuffle = shuffle def __iter__(self): # deterministically shuffle based on epoch if self.shuffle: g = torch.Generator() g.manual_seed(self.epoch) indices = torch.randperm(len(self.dataset), generator=g).tolist() else: indices = torch.arange(len(self.dataset)).tolist() # add extra samples to make it evenly divisible indices += indices[:(self.total_size - len(indices))] assert len(indices) == self.total_size # subsample indices = indices[self.rank:self.total_size:self.num_replicas] assert len(indices) == self.num_samples return iter(indices) class GroupSampler(Sampler): def __init__(self, dataset, samples_per_gpu=1): assert hasattr(dataset, 'flag') self.dataset = dataset self.samples_per_gpu = samples_per_gpu self.flag = dataset.flag.astype(np.int64) self.group_sizes = np.bincount(self.flag) self.num_samples = 0 for i, size in enumerate(self.group_sizes): self.num_samples += int(np.ceil( size / self.samples_per_gpu)) * self.samples_per_gpu def __iter__(self): indices = [] for i, size in enumerate(self.group_sizes): if size == 0: continue indice = np.where(self.flag == i)[0] assert len(indice) == size np.random.shuffle(indice) num_extra = int(np.ceil(size / self.samples_per_gpu) ) * self.samples_per_gpu - len(indice) indice = np.concatenate([indice, indice[:num_extra]]) indices.append(indice) indices = np.concatenate(indices) indices = [ indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu] for i in np.random.permutation( range(len(indices) // self.samples_per_gpu)) ] indices = np.concatenate(indices) indices = torch.from_numpy(indices).long() assert len(indices) == self.num_samples return iter(indices) def __len__(self): return self.num_samples class DistributedGroupSampler(Sampler): """Sampler that restricts data loading to a subset of the dataset. It is especially useful in conjunction with :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each process can pass a DistributedSampler instance as a DataLoader sampler, and load a subset of the original dataset that is exclusive to it. .. note:: Dataset is assumed to be of constant size. Arguments: dataset: Dataset used for sampling. num_replicas (optional): Number of processes participating in distributed training. rank (optional): Rank of the current process within num_replicas. """ def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None): if num_replicas is None: num_replicas = get_world_size() if rank is None: rank = get_rank() self.dataset = dataset self.samples_per_gpu = samples_per_gpu self.num_replicas = num_replicas self.rank = rank self.epoch = 0 assert hasattr(self.dataset, 'flag') self.flag = self.dataset.flag self.group_sizes = np.bincount(self.flag) self.num_samples = 0 for i, j in enumerate(self.group_sizes): self.num_samples += int( math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / self.num_replicas)) * self.samples_per_gpu self.total_size = self.num_samples * self.num_replicas def __iter__(self): # deterministically shuffle based on epoch g = torch.Generator() g.manual_seed(self.epoch) indices = [] for i, size in enumerate(self.group_sizes): if size > 0: indice = np.where(self.flag == i)[0] assert len(indice) == size indice = indice[list(torch.randperm(int(size), generator=g))].tolist() extra = int( math.ceil( size * 1.0 / self.samples_per_gpu / self.num_replicas) ) * self.samples_per_gpu * self.num_replicas - len(indice) indice += indice[:extra] indices += indice assert len(indices) == self.total_size indices = [ indices[j] for i in list( torch.randperm(len(indices) // self.samples_per_gpu, generator=g)) for j in range(i * self.samples_per_gpu, (i + 1) * self.samples_per_gpu) ] # subsample offset = self.num_samples * self.rank indices = indices[offset:offset + self.num_samples] assert len(indices) == self.num_samples return iter(indices) def __len__(self): return self.num_samples def set_epoch(self, epoch): self.epoch = epoch ================================================ FILE: mmdetection/mmdet/datasets/repeat_dataset.py ================================================ import numpy as np class RepeatDataset(object): def __init__(self, dataset, times): self.dataset = dataset self.times = times self.CLASSES = dataset.CLASSES if hasattr(self.dataset, 'flag'): self.flag = np.tile(self.dataset.flag, times) self._ori_len = len(self.dataset) def __getitem__(self, idx): return self.dataset[idx % self._ori_len] def __len__(self): return self.times * self._ori_len ================================================ FILE: mmdetection/mmdet/datasets/transforms.py ================================================ import random import albumentations as A import albumentations.augmentations.functional as F import mmcv import numpy as np import torch from numpy import random __all__ = [ 'ImageTransform', 'BboxTransform', 'MaskTransform', 'SegMapTransform', 'Numpy2Tensor' ] class ImageTransform(object): """Preprocess an image. 1. rescale the image to expected size 2. normalize the image 3. flip the image (if needed) 4. pad the image (if needed) 5. transpose to (c, h, w) """ def __init__(self, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True, size_divisor=None): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) self.to_rgb = to_rgb self.size_divisor = size_divisor def __call__(self, img, scale, flip=False, keep_ratio=True): if keep_ratio: img, scale_factor = mmcv.imrescale(img, scale, return_scale=True) else: img, w_scale, h_scale = mmcv.imresize( img, scale, return_scale=True) scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], dtype=np.float32) img_shape = img.shape img = mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) if flip: img = mmcv.imflip(img) if self.size_divisor is not None: img = mmcv.impad_to_multiple(img, self.size_divisor) pad_shape = img.shape else: pad_shape = img_shape img = img.transpose(2, 0, 1) return img, img_shape, pad_shape, scale_factor def bbox_flip(bboxes, img_shape): """Flip bboxes horizontally. Args: bboxes(ndarray): shape (..., 4*k) img_shape(tuple): (height, width) """ assert bboxes.shape[-1] % 4 == 0 w = img_shape[1] flipped = bboxes.copy() flipped[..., 0::4] = w - bboxes[..., 2::4] - 1 flipped[..., 2::4] = w - bboxes[..., 0::4] - 1 return flipped class BboxTransform(object): """Preprocess gt bboxes. 1. rescale bboxes according to image size 2. flip bboxes (if needed) 3. pad the first dimension to `max_num_gts` """ def __init__(self, max_num_gts=None): self.max_num_gts = max_num_gts def __call__(self, bboxes, img_shape, scale_factor, flip=False): gt_bboxes = bboxes * scale_factor if flip: gt_bboxes = bbox_flip(gt_bboxes, img_shape) gt_bboxes[:, 0::2] = np.clip(gt_bboxes[:, 0::2], 0, img_shape[1] - 1) gt_bboxes[:, 1::2] = np.clip(gt_bboxes[:, 1::2], 0, img_shape[0] - 1) if self.max_num_gts is None: return gt_bboxes else: num_gts = gt_bboxes.shape[0] padded_bboxes = np.zeros((self.max_num_gts, 4), dtype=np.float32) padded_bboxes[:num_gts, :] = gt_bboxes return padded_bboxes class MaskTransform(object): """Preprocess masks. 1. resize masks to expected size and stack to a single array 2. flip the masks (if needed) 3. pad the masks (if needed) """ def __call__(self, masks, pad_shape, scale_factor, flip=False): masks = [ mmcv.imrescale(mask, scale_factor, interpolation='nearest') for mask in masks ] if flip: masks = [mask[:, ::-1] for mask in masks] padded_masks = [ mmcv.impad(mask, pad_shape[:2], pad_val=0) for mask in masks ] padded_masks = np.stack(padded_masks, axis=0) return padded_masks class SegMapTransform(object): """Preprocess semantic segmentation maps. 1. rescale the segmentation map to expected size 3. flip the image (if needed) 4. pad the image (if needed) """ def __init__(self, size_divisor=None): self.size_divisor = size_divisor def __call__(self, img, scale, flip=False, keep_ratio=True): if keep_ratio: img = mmcv.imrescale(img, scale, interpolation='nearest') else: img = mmcv.imresize(img, scale, interpolation='nearest') if flip: img = mmcv.imflip(img) if self.size_divisor is not None: img = mmcv.impad_to_multiple(img, self.size_divisor) return img class Numpy2Tensor(object): def __init__(self): pass def __call__(self, *args): if len(args) == 1: return torch.from_numpy(args[0]) else: return tuple([torch.from_numpy(np.array(array)) for array in args]) class RandomCropNearBBox(A.DualTransform): """Crop bbox from image with random shift by x,y coordinates Args: max_part_shift (float): float value in (0.0, 1.0) range. Default 0.3 p (float): probability of applying the transform. Default: 1. Targets: image Image types: uint8, float32 """ def __init__(self, max_part_shift=0.3, always_apply=False, p=1.0): super(RandomCropNearBBox, self).__init__(always_apply, p) self.max_part_shift = max_part_shift def apply(self, img, x_min=0, x_max=0, y_min=0, y_max=0, **params): return F.clamping_crop(img, x_min, y_min, x_max, y_max) def get_params_dependent_on_targets(self, params): bbox = params['cropping_bbox'] h_max_shift = int((bbox[3] - bbox[1]) * self.max_part_shift) w_max_shift = int((bbox[2] - bbox[0]) * self.max_part_shift) x_min = bbox[0] - random.randint(-w_max_shift, w_max_shift) x_max = bbox[2] + random.randint(-w_max_shift, w_max_shift) y_min = bbox[1] - random.randint(-h_max_shift, h_max_shift) y_max = bbox[3] + random.randint(-h_max_shift, h_max_shift) return {'x_min': x_min, 'x_max': x_max, 'y_min': y_min, 'y_max': y_max } def apply_to_bbox(self, bbox, x_min=0, x_max=0, y_min=0, y_max=0, **params): h, w = params['rows'], params['cols'] if x_min < 0: x_min = 0 if y_min < 0: y_min = 0 if y_max >= h: y_max = h - 1 if x_max >= w: x_max = w - 1 return F.bbox_crop(bbox, x_min, y_min, x_max, y_max, **params) @property def targets_as_params(self): return ['cropping_bbox'] def get_transform_init_args_names(self): return 'max_part_shift', ================================================ FILE: mmdetection/mmdet/datasets/utils.py ================================================ import copy from collections import Sequence import mmcv from mmcv.runner import obj_from_dict import torch import matplotlib.pyplot as plt import numpy as np from .concat_dataset import ConcatDataset from .repeat_dataset import RepeatDataset from .. import datasets def to_tensor(data): """Convert objects of various python types to :obj:`torch.Tensor`. Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, :class:`Sequence`, :class:`int` and :class:`float`. """ if isinstance(data, torch.Tensor): return data elif isinstance(data, np.ndarray): return torch.from_numpy(data) elif isinstance(data, Sequence) and not mmcv.is_str(data): return torch.tensor(data) elif isinstance(data, int): return torch.LongTensor([data]) elif isinstance(data, float): return torch.FloatTensor([data]) else: raise TypeError('type {} cannot be converted to tensor.'.format( type(data))) def random_scale(img_scales, mode='range'): """Randomly select a scale from a list of scales or scale ranges. Args: img_scales (list[tuple]): Image scale or scale range. mode (str): "range" or "value". Returns: tuple: Sampled image scale. """ num_scales = len(img_scales) if num_scales == 1: # fixed scale is specified img_scale = img_scales[0] elif num_scales == 2: # randomly sample a scale if mode == 'range': img_scale_long = [max(s) for s in img_scales] img_scale_short = [min(s) for s in img_scales] long_edge = np.random.randint( min(img_scale_long), max(img_scale_long) + 1) short_edge = np.random.randint( min(img_scale_short), max(img_scale_short) + 1) img_scale = (long_edge, short_edge) elif mode == 'value': img_scale = img_scales[np.random.randint(num_scales)] else: if mode != 'value': raise ValueError( 'Only "value" mode supports more than 2 image scales') img_scale = img_scales[np.random.randint(num_scales)] return img_scale def show_ann(coco, img, ann_info): plt.imshow(mmcv.bgr2rgb(img)) plt.axis('off') coco.showAnns(ann_info) plt.show() def get_dataset(data_cfg): if data_cfg['type'] == 'RepeatDataset': return RepeatDataset( get_dataset(data_cfg['dataset']), data_cfg['times']) if isinstance(data_cfg['ann_file'], (list, tuple)): ann_files = data_cfg['ann_file'] num_dset = len(ann_files) else: ann_files = [data_cfg['ann_file']] num_dset = 1 if 'proposal_file' in data_cfg.keys(): if isinstance(data_cfg['proposal_file'], (list, tuple)): proposal_files = data_cfg['proposal_file'] else: proposal_files = [data_cfg['proposal_file']] else: proposal_files = [None] * num_dset assert len(proposal_files) == num_dset if isinstance(data_cfg['img_prefix'], (list, tuple)): img_prefixes = data_cfg['img_prefix'] else: img_prefixes = [data_cfg['img_prefix']] * num_dset assert len(img_prefixes) == num_dset dsets = [] for i in range(num_dset): data_info = copy.deepcopy(data_cfg) data_info['ann_file'] = ann_files[i] data_info['proposal_file'] = proposal_files[i] data_info['img_prefix'] = img_prefixes[i] dset = obj_from_dict(data_info, datasets) dsets.append(dset) if len(dsets) > 1: dset = ConcatDataset(dsets) else: dset = dsets[0] return dset def rle_decode(rle, h: int, w: int): s = rle.split() starts, lengths = map(np.asarray, (s[0::2], s[1::2])) starts -= 1 ends = starts + lengths img = np.zeros(h * w, dtype=np.uint8) for start, end in zip(starts, ends): img[start:end] = 1 return img.reshape((h, w)) ================================================ FILE: mmdetection/mmdet/datasets/voc.py ================================================ from .xml_style import XMLDataset class VOCDataset(XMLDataset): CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor') def __init__(self, **kwargs): super(VOCDataset, self).__init__(**kwargs) if 'VOC2007' in self.img_prefix: self.year = 2007 elif 'VOC2012' in self.img_prefix: self.year = 2012 else: raise ValueError('Cannot infer dataset year from img_prefix') ================================================ FILE: mmdetection/mmdet/datasets/xml_style.py ================================================ import os.path as osp import xml.etree.ElementTree as ET import mmcv import numpy as np from .custom import CustomDataset class XMLDataset(CustomDataset): def __init__(self, **kwargs): super(XMLDataset, self).__init__(**kwargs) self.cat2label = {cat: i + 1 for i, cat in enumerate(self.CLASSES)} def load_annotations(self, ann_file): img_infos = [] img_ids = mmcv.list_from_file(ann_file) for img_id in img_ids: filename = 'JPEGImages/{}.jpg'.format(img_id) xml_path = osp.join(self.img_prefix, 'Annotations', '{}.xml'.format(img_id)) tree = ET.parse(xml_path) root = tree.getroot() size = root.find('size') width = int(size.find('width').text) height = int(size.find('height').text) img_infos.append( dict(id=img_id, filename=filename, width=width, height=height)) return img_infos def get_ann_info(self, idx): img_id = self.img_infos[idx]['id'] xml_path = osp.join(self.img_prefix, 'Annotations', '{}.xml'.format(img_id)) tree = ET.parse(xml_path) root = tree.getroot() bboxes = [] labels = [] bboxes_ignore = [] labels_ignore = [] for obj in root.findall('object'): name = obj.find('name').text label = self.cat2label[name] difficult = int(obj.find('difficult').text) bnd_box = obj.find('bndbox') bbox = [ int(bnd_box.find('xmin').text), int(bnd_box.find('ymin').text), int(bnd_box.find('xmax').text), int(bnd_box.find('ymax').text) ] if difficult: bboxes_ignore.append(bbox) labels_ignore.append(label) else: bboxes.append(bbox) labels.append(label) if not bboxes: bboxes = np.zeros((0, 4)) labels = np.zeros((0, )) else: bboxes = np.array(bboxes, ndmin=2) - 1 labels = np.array(labels) if not bboxes_ignore: bboxes_ignore = np.zeros((0, 4)) labels_ignore = np.zeros((0, )) else: bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1 labels_ignore = np.array(labels_ignore) ann = dict( bboxes=bboxes.astype(np.float32), labels=labels.astype(np.int64), bboxes_ignore=bboxes_ignore.astype(np.float32), labels_ignore=labels_ignore.astype(np.int64)) return ann ================================================ FILE: mmdetection/mmdet/models/__init__.py ================================================ from .backbones import * # noqa: F401,F403 from .necks import * # noqa: F401,F403 from .roi_extractors import * # noqa: F401,F403 from .anchor_heads import * # noqa: F401,F403 from .shared_heads import * # noqa: F401,F403 from .bbox_heads import * # noqa: F401,F403 from .mask_heads import * # noqa: F401,F403 from .detectors import * # noqa: F401,F403 from .registry import (BACKBONES, NECKS, ROI_EXTRACTORS, SHARED_HEADS, HEADS, DETECTORS) from .builder import (build_backbone, build_neck, build_roi_extractor, build_shared_head, build_head, build_detector) __all__ = [ 'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'SHARED_HEADS', 'HEADS', 'DETECTORS', 'build_backbone', 'build_neck', 'build_roi_extractor', 'build_shared_head', 'build_head', 'build_detector' ] ================================================ FILE: mmdetection/mmdet/models/anchor_heads/__init__.py ================================================ from .anchor_head import AnchorHead from .fcos_head import FCOSHead from .retina_head import RetinaHead from .rpn_head import RPNHead from .ssd_head import SSDHead __all__ = ['AnchorHead', 'RPNHead', 'RetinaHead', 'SSDHead', 'FCOSHead'] ================================================ FILE: mmdetection/mmdet/models/anchor_heads/anchor_head.py ================================================ from __future__ import division import numpy as np import torch import torch.nn as nn from mmcv.cnn import normal_init from mmdet.core import (AnchorGenerator, anchor_target, delta2bbox, multi_apply, weighted_cross_entropy, weighted_smoothl1, weighted_binary_cross_entropy, weighted_sigmoid_focal_loss, multiclass_nms) from ..registry import HEADS @HEADS.register_module class AnchorHead(nn.Module): """Anchor-based head (RPN, RetinaNet, SSD, etc.). Args: in_channels (int): Number of channels in the input feature map. feat_channels (int): Number of channels of the feature map. anchor_scales (Iterable): Anchor scales. anchor_ratios (Iterable): Anchor aspect ratios. anchor_strides (Iterable): Anchor strides. anchor_base_sizes (Iterable): Anchor base sizes. target_means (Iterable): Mean values of regression targets. target_stds (Iterable): Std values of regression targets. use_sigmoid_cls (bool): Whether to use sigmoid loss for classification. (softmax by default) cls_focal_loss (bool): Whether to use focal loss for classification. """ # noqa: W605 def __init__(self, num_classes, in_channels, feat_channels=256, anchor_scales=[8, 16, 32], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], anchor_base_sizes=None, target_means=(.0, .0, .0, .0), target_stds=(1.0, 1.0, 1.0, 1.0), use_sigmoid_cls=False, cls_focal_loss=False): super(AnchorHead, self).__init__() self.in_channels = in_channels self.num_classes = num_classes self.feat_channels = feat_channels self.anchor_scales = anchor_scales self.anchor_ratios = anchor_ratios self.anchor_strides = anchor_strides self.anchor_base_sizes = list( anchor_strides) if anchor_base_sizes is None else anchor_base_sizes self.target_means = target_means self.target_stds = target_stds self.use_sigmoid_cls = use_sigmoid_cls self.cls_focal_loss = cls_focal_loss self.anchor_generators = [] for anchor_base in self.anchor_base_sizes: self.anchor_generators.append( AnchorGenerator(anchor_base, anchor_scales, anchor_ratios)) self.num_anchors = len(self.anchor_ratios) * len(self.anchor_scales) if self.use_sigmoid_cls: self.cls_out_channels = self.num_classes - 1 else: self.cls_out_channels = self.num_classes self._init_layers() def _init_layers(self): self.conv_cls = nn.Conv2d(self.feat_channels, self.num_anchors * self.cls_out_channels, 1) self.conv_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1) def init_weights(self): normal_init(self.conv_cls, std=0.01) normal_init(self.conv_reg, std=0.01) def forward_single(self, x): cls_score = self.conv_cls(x) bbox_pred = self.conv_reg(x) return cls_score, bbox_pred def forward(self, feats): return multi_apply(self.forward_single, feats) def get_anchors(self, featmap_sizes, img_metas): """Get anchors according to feature map sizes. Args: featmap_sizes (list[tuple]): Multi-level feature map sizes. img_metas (list[dict]): Image meta info. Returns: tuple: anchors of each image, valid flags of each image """ num_imgs = len(img_metas) num_levels = len(featmap_sizes) # since feature map sizes of all images are the same, we only compute # anchors for one time multi_level_anchors = [] for i in range(num_levels): anchors = self.anchor_generators[i].grid_anchors( featmap_sizes[i], self.anchor_strides[i]) multi_level_anchors.append(anchors) anchor_list = [multi_level_anchors for _ in range(num_imgs)] # for each image, we compute valid flags of multi level anchors valid_flag_list = [] for img_id, img_meta in enumerate(img_metas): multi_level_flags = [] for i in range(num_levels): anchor_stride = self.anchor_strides[i] feat_h, feat_w = featmap_sizes[i] h, w, _ = img_meta['pad_shape'] valid_feat_h = min(int(np.ceil(h / anchor_stride)), feat_h) valid_feat_w = min(int(np.ceil(w / anchor_stride)), feat_w) flags = self.anchor_generators[i].valid_flags( (feat_h, feat_w), (valid_feat_h, valid_feat_w)) multi_level_flags.append(flags) valid_flag_list.append(multi_level_flags) return anchor_list, valid_flag_list def loss_single(self, cls_score, bbox_pred, labels, label_weights, bbox_targets, bbox_weights, num_total_samples, cfg): # classification loss labels = labels.reshape(-1) label_weights = label_weights.reshape(-1) cls_score = cls_score.permute(0, 2, 3, 1).reshape( -1, self.cls_out_channels) if self.use_sigmoid_cls: if self.cls_focal_loss: cls_criterion = weighted_sigmoid_focal_loss else: cls_criterion = weighted_binary_cross_entropy else: if self.cls_focal_loss: raise NotImplementedError else: cls_criterion = weighted_cross_entropy if self.cls_focal_loss: loss_cls = cls_criterion( cls_score, labels, label_weights, gamma=cfg.gamma, alpha=cfg.alpha, avg_factor=num_total_samples) else: loss_cls = cls_criterion( cls_score, labels, label_weights, avg_factor=num_total_samples) # regression loss bbox_targets = bbox_targets.reshape(-1, 4) bbox_weights = bbox_weights.reshape(-1, 4) bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) loss_reg = weighted_smoothl1( bbox_pred, bbox_targets, bbox_weights, beta=cfg.smoothl1_beta, avg_factor=num_total_samples) return loss_cls, loss_reg def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas, cfg, gt_bboxes_ignore=None): featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] assert len(featmap_sizes) == len(self.anchor_generators) anchor_list, valid_flag_list = self.get_anchors( featmap_sizes, img_metas) sampling = False if self.cls_focal_loss else True label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 cls_reg_targets = anchor_target( anchor_list, valid_flag_list, gt_bboxes, img_metas, self.target_means, self.target_stds, cfg, gt_bboxes_ignore_list=gt_bboxes_ignore, gt_labels_list=gt_labels, label_channels=label_channels, sampling=sampling) if cls_reg_targets is None: return None (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets num_total_samples = (num_total_pos if self.cls_focal_loss else num_total_pos + num_total_neg) losses_cls, losses_reg = multi_apply( self.loss_single, cls_scores, bbox_preds, labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_samples=num_total_samples, cfg=cfg) return dict(loss_cls=losses_cls, loss_reg=losses_reg) def get_bboxes(self, cls_scores, bbox_preds, img_metas, cfg, rescale=False): assert len(cls_scores) == len(bbox_preds) num_levels = len(cls_scores) mlvl_anchors = [ self.anchor_generators[i].grid_anchors(cls_scores[i].size()[-2:], self.anchor_strides[i]) for i in range(num_levels) ] result_list = [] for img_id in range(len(img_metas)): cls_score_list = [ cls_scores[i][img_id].detach() for i in range(num_levels) ] bbox_pred_list = [ bbox_preds[i][img_id].detach() for i in range(num_levels) ] img_shape = img_metas[img_id]['img_shape'] scale_factor = img_metas[img_id]['scale_factor'] proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, mlvl_anchors, img_shape, scale_factor, cfg, rescale) result_list.append(proposals) return result_list def get_bboxes_single(self, cls_scores, bbox_preds, mlvl_anchors, img_shape, scale_factor, cfg, rescale=False): assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) mlvl_bboxes = [] mlvl_scores = [] for cls_score, bbox_pred, anchors in zip(cls_scores, bbox_preds, mlvl_anchors): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] cls_score = cls_score.permute(1, 2, 0).reshape( -1, self.cls_out_channels) if self.use_sigmoid_cls: scores = cls_score.sigmoid() else: scores = cls_score.softmax(-1) bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) nms_pre = cfg.get('nms_pre', -1) if nms_pre > 0 and scores.shape[0] > nms_pre: if self.use_sigmoid_cls: max_scores, _ = scores.max(dim=1) else: max_scores, _ = scores[:, 1:].max(dim=1) _, topk_inds = max_scores.topk(nms_pre) anchors = anchors[topk_inds, :] bbox_pred = bbox_pred[topk_inds, :] scores = scores[topk_inds, :] bboxes = delta2bbox(anchors, bbox_pred, self.target_means, self.target_stds, img_shape) mlvl_bboxes.append(bboxes) mlvl_scores.append(scores) mlvl_bboxes = torch.cat(mlvl_bboxes) if rescale: mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) mlvl_scores = torch.cat(mlvl_scores) if self.use_sigmoid_cls: padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) mlvl_scores = torch.cat([padding, mlvl_scores], dim=1) det_bboxes, det_labels = multiclass_nms( mlvl_bboxes, mlvl_scores, cfg.score_thr, cfg.nms, cfg.max_per_img) return det_bboxes, det_labels ================================================ FILE: mmdetection/mmdet/models/anchor_heads/fcos_head.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import normal_init from mmdet.core import (sigmoid_focal_loss, iou_loss, multi_apply, multiclass_nms, distance2bbox) from ..registry import HEADS from ..utils import bias_init_with_prob, Scale, ConvModule INF = 1e8 @HEADS.register_module class FCOSHead(nn.Module): def __init__(self, num_classes, in_channels, feat_channels=256, stacked_convs=4, strides=(4, 8, 16, 32, 64), regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512), (512, INF)), conv_cfg=None, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)): super(FCOSHead, self).__init__() self.num_classes = num_classes self.cls_out_channels = num_classes - 1 self.in_channels = in_channels self.feat_channels = feat_channels self.stacked_convs = stacked_convs self.strides = strides self.regress_ranges = regress_ranges self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self._init_layers() def _init_layers(self): self.cls_convs = nn.ModuleList() self.reg_convs = nn.ModuleList() for i in range(self.stacked_convs): chn = self.in_channels if i == 0 else self.feat_channels self.cls_convs.append( ConvModule( chn, self.feat_channels, 3, stride=1, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, bias=self.norm_cfg is None)) self.reg_convs.append( ConvModule( chn, self.feat_channels, 3, stride=1, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, bias=self.norm_cfg is None)) self.fcos_cls = nn.Conv2d( self.feat_channels, self.cls_out_channels, 3, padding=1) self.fcos_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) self.fcos_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1) self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) def init_weights(self): for m in self.cls_convs: normal_init(m.conv, std=0.01) for m in self.reg_convs: normal_init(m.conv, std=0.01) bias_cls = bias_init_with_prob(0.01) normal_init(self.fcos_cls, std=0.01, bias=bias_cls) normal_init(self.fcos_reg, std=0.01) normal_init(self.fcos_centerness, std=0.01) def forward(self, feats): return multi_apply(self.forward_single, feats, self.scales) def forward_single(self, x, scale): cls_feat = x reg_feat = x for cls_layer in self.cls_convs: cls_feat = cls_layer(cls_feat) cls_score = self.fcos_cls(cls_feat) centerness = self.fcos_centerness(cls_feat) for reg_layer in self.reg_convs: reg_feat = reg_layer(reg_feat) # scale the bbox_pred of different level bbox_pred = scale(self.fcos_reg(reg_feat)).exp() return cls_score, bbox_pred, centerness def loss(self, cls_scores, bbox_preds, centernesses, gt_bboxes, gt_labels, img_metas, cfg, gt_bboxes_ignore=None): assert len(cls_scores) == len(bbox_preds) == len(centernesses) featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, bbox_preds[0].device) labels, bbox_targets = self.fcos_target(all_level_points, gt_bboxes, gt_labels) num_imgs = cls_scores[0].size(0) # flatten cls_scores, bbox_preds and centerness flatten_cls_scores = [ cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) for cls_score in cls_scores ] flatten_bbox_preds = [ bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) for bbox_pred in bbox_preds ] flatten_centerness = [ centerness.permute(0, 2, 3, 1).reshape(-1) for centerness in centernesses ] flatten_cls_scores = torch.cat(flatten_cls_scores) flatten_bbox_preds = torch.cat(flatten_bbox_preds) flatten_centerness = torch.cat(flatten_centerness) flatten_labels = torch.cat(labels) flatten_bbox_targets = torch.cat(bbox_targets) # repeat points to align with bbox_preds flatten_points = torch.cat( [points.repeat(num_imgs, 1) for points in all_level_points]) pos_inds = flatten_labels.nonzero().reshape(-1) num_pos = len(pos_inds) loss_cls = sigmoid_focal_loss( flatten_cls_scores, flatten_labels, cfg.gamma, cfg.alpha, 'none').sum()[None] / (num_pos + num_imgs) # avoid num_pos is 0 pos_bbox_preds = flatten_bbox_preds[pos_inds] pos_bbox_targets = flatten_bbox_targets[pos_inds] pos_centerness = flatten_centerness[pos_inds] pos_centerness_targets = self.centerness_target(pos_bbox_targets) if num_pos > 0: pos_points = flatten_points[pos_inds] pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds) pos_decoded_target_preds = distance2bbox(pos_points, pos_bbox_targets) # centerness weighted iou loss loss_reg = ((iou_loss( pos_decoded_bbox_preds, pos_decoded_target_preds, reduction='none') * pos_centerness_targets).sum() / pos_centerness_targets.sum())[None] loss_centerness = F.binary_cross_entropy_with_logits( pos_centerness, pos_centerness_targets, reduction='mean')[None] else: loss_reg = pos_bbox_preds.sum()[None] loss_centerness = pos_centerness.sum()[None] return dict( loss_cls=loss_cls, loss_reg=loss_reg, loss_centerness=loss_centerness) def get_bboxes(self, cls_scores, bbox_preds, centernesses, img_metas, cfg, rescale=None): assert len(cls_scores) == len(bbox_preds) num_levels = len(cls_scores) featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, bbox_preds[0].device) result_list = [] for img_id in range(len(img_metas)): cls_score_list = [ cls_scores[i][img_id].detach() for i in range(num_levels) ] bbox_pred_list = [ bbox_preds[i][img_id].detach() for i in range(num_levels) ] centerness_pred_list = [ centernesses[i][img_id].detach() for i in range(num_levels) ] img_shape = img_metas[img_id]['img_shape'] scale_factor = img_metas[img_id]['scale_factor'] det_bboxes = self.get_bboxes_single( cls_score_list, bbox_pred_list, centerness_pred_list, mlvl_points, img_shape, scale_factor, cfg, rescale) result_list.append(det_bboxes) return result_list def get_bboxes_single(self, cls_scores, bbox_preds, centernesses, mlvl_points, img_shape, scale_factor, cfg, rescale=False): assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) mlvl_bboxes = [] mlvl_scores = [] mlvl_centerness = [] for cls_score, bbox_pred, centerness, points in zip( cls_scores, bbox_preds, centernesses, mlvl_points): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] scores = cls_score.permute(1, 2, 0).reshape( -1, self.cls_out_channels).sigmoid() centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) nms_pre = cfg.get('nms_pre', -1) if nms_pre > 0 and scores.shape[0] > nms_pre: max_scores, _ = (scores * centerness[:, None]).max(dim=1) _, topk_inds = max_scores.topk(nms_pre) points = points[topk_inds, :] bbox_pred = bbox_pred[topk_inds, :] scores = scores[topk_inds, :] centerness = centerness[topk_inds] bboxes = distance2bbox(points, bbox_pred, max_shape=img_shape) mlvl_bboxes.append(bboxes) mlvl_scores.append(scores) mlvl_centerness.append(centerness) mlvl_bboxes = torch.cat(mlvl_bboxes) if rescale: mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) mlvl_scores = torch.cat(mlvl_scores) padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) mlvl_scores = torch.cat([padding, mlvl_scores], dim=1) mlvl_centerness = torch.cat(mlvl_centerness) det_bboxes, det_labels = multiclass_nms( mlvl_bboxes, mlvl_scores, cfg.score_thr, cfg.nms, cfg.max_per_img, score_factors=mlvl_centerness) return det_bboxes, det_labels def get_points(self, featmap_sizes, dtype, device): """Get points according to feature map sizes. Args: featmap_sizes (list[tuple]): Multi-level feature map sizes. dtype (torch.dtype): Type of points. device (torch.device): Device of points. Returns: tuple: points of each image. """ mlvl_points = [] for i in range(len(featmap_sizes)): mlvl_points.append( self.get_points_single(featmap_sizes[i], self.strides[i], dtype, device)) return mlvl_points def get_points_single(self, featmap_size, stride, dtype, device): h, w = featmap_size x_range = torch.arange( 0, w * stride, stride, dtype=dtype, device=device) y_range = torch.arange( 0, h * stride, stride, dtype=dtype, device=device) y, x = torch.meshgrid(y_range, x_range) points = torch.stack( (x.reshape(-1), y.reshape(-1)), dim=-1) + stride // 2 return points def fcos_target(self, points, gt_bboxes_list, gt_labels_list): assert len(points) == len(self.regress_ranges) num_levels = len(points) # expand regress ranges to align with points expanded_regress_ranges = [ points[i].new_tensor(self.regress_ranges[i])[None].expand_as( points[i]) for i in range(num_levels) ] # concat all levels points and regress ranges concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) concat_points = torch.cat(points, dim=0) # get labels and bbox_targets of each image labels_list, bbox_targets_list = multi_apply( self.fcos_target_single, gt_bboxes_list, gt_labels_list, points=concat_points, regress_ranges=concat_regress_ranges) # split to per img, per level num_points = [center.size(0) for center in points] labels_list = [labels.split(num_points, 0) for labels in labels_list] bbox_targets_list = [ bbox_targets.split(num_points, 0) for bbox_targets in bbox_targets_list ] # concat per level image concat_lvl_labels = [] concat_lvl_bbox_targets = [] for i in range(num_levels): concat_lvl_labels.append( torch.cat([labels[i] for labels in labels_list])) concat_lvl_bbox_targets.append( torch.cat( [bbox_targets[i] for bbox_targets in bbox_targets_list])) return concat_lvl_labels, concat_lvl_bbox_targets def fcos_target_single(self, gt_bboxes, gt_labels, points, regress_ranges): num_points = points.size(0) num_gts = gt_labels.size(0) areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) * ( gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1) # TODO: figure out why these two are different # areas = areas[None].expand(num_points, num_gts) areas = areas[None].repeat(num_points, 1) regress_ranges = regress_ranges[:, None, :].expand( num_points, num_gts, 2) gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) xs, ys = points[:, 0], points[:, 1] xs = xs[:, None].expand(num_points, num_gts) ys = ys[:, None].expand(num_points, num_gts) left = xs - gt_bboxes[..., 0] right = gt_bboxes[..., 2] - xs top = ys - gt_bboxes[..., 1] bottom = gt_bboxes[..., 3] - ys bbox_targets = torch.stack((left, top, right, bottom), -1) # condition1: inside a gt bbox inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0 # condition2: limit the regression range for each location max_regress_distance = bbox_targets.max(-1)[0] inside_regress_range = ( max_regress_distance >= regress_ranges[..., 0]) & ( max_regress_distance <= regress_ranges[..., 1]) # if there are still more than one objects for a location, # we choose the one with minimal area areas[inside_gt_bbox_mask == 0] = INF areas[inside_regress_range == 0] = INF min_area, min_area_inds = areas.min(dim=1) labels = gt_labels[min_area_inds] labels[min_area == INF] = 0 bbox_targets = bbox_targets[range(num_points), min_area_inds] return labels, bbox_targets def centerness_target(self, pos_bbox_targets): # only calculate pos centerness targets, otherwise there may be nan left_right = pos_bbox_targets[:, [0, 2]] top_bottom = pos_bbox_targets[:, [1, 3]] centerness_targets = ( left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * ( top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]) return torch.sqrt(centerness_targets) ================================================ FILE: mmdetection/mmdet/models/anchor_heads/retina_head.py ================================================ import numpy as np import torch.nn as nn from mmcv.cnn import normal_init from .anchor_head import AnchorHead from ..registry import HEADS from ..utils import bias_init_with_prob, ConvModule @HEADS.register_module class RetinaHead(AnchorHead): def __init__(self, num_classes, in_channels, stacked_convs=4, octave_base_scale=4, scales_per_octave=3, conv_cfg=None, norm_cfg=None, **kwargs): self.stacked_convs = stacked_convs self.octave_base_scale = octave_base_scale self.scales_per_octave = scales_per_octave self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg octave_scales = np.array( [2**(i / scales_per_octave) for i in range(scales_per_octave)]) anchor_scales = octave_scales * octave_base_scale super(RetinaHead, self).__init__( num_classes, in_channels, anchor_scales=anchor_scales, use_sigmoid_cls=True, cls_focal_loss=True, **kwargs) def _init_layers(self): self.relu = nn.ReLU(inplace=True) self.cls_convs = nn.ModuleList() self.reg_convs = nn.ModuleList() for i in range(self.stacked_convs): chn = self.in_channels if i == 0 else self.feat_channels self.cls_convs.append( ConvModule( chn, self.feat_channels, 3, stride=1, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg)) self.reg_convs.append( ConvModule( chn, self.feat_channels, 3, stride=1, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg)) self.retina_cls = nn.Conv2d( self.feat_channels, self.num_anchors * self.cls_out_channels, 3, padding=1) self.retina_reg = nn.Conv2d( self.feat_channels, self.num_anchors * 4, 3, padding=1) def init_weights(self): for m in self.cls_convs: normal_init(m.conv, std=0.01) for m in self.reg_convs: normal_init(m.conv, std=0.01) bias_cls = bias_init_with_prob(0.01) normal_init(self.retina_cls, std=0.01, bias=bias_cls) normal_init(self.retina_reg, std=0.01) def forward_single(self, x): cls_feat = x reg_feat = x for cls_conv in self.cls_convs: cls_feat = cls_conv(cls_feat) for reg_conv in self.reg_convs: reg_feat = reg_conv(reg_feat) cls_score = self.retina_cls(cls_feat) bbox_pred = self.retina_reg(reg_feat) return cls_score, bbox_pred ================================================ FILE: mmdetection/mmdet/models/anchor_heads/rpn_head.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import normal_init from mmdet.core import delta2bbox from mmdet.ops import nms from .anchor_head import AnchorHead from ..registry import HEADS @HEADS.register_module class RPNHead(AnchorHead): def __init__(self, in_channels, **kwargs): super(RPNHead, self).__init__(2, in_channels, **kwargs) def _init_layers(self): self.rpn_conv = nn.Conv2d( self.in_channels, self.feat_channels, 3, padding=1) self.rpn_cls = nn.Conv2d(self.feat_channels, self.num_anchors * self.cls_out_channels, 1) self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1) def init_weights(self): normal_init(self.rpn_conv, std=0.01) normal_init(self.rpn_cls, std=0.01) normal_init(self.rpn_reg, std=0.01) def forward_single(self, x): x = self.rpn_conv(x) x = F.relu(x, inplace=True) rpn_cls_score = self.rpn_cls(x) rpn_bbox_pred = self.rpn_reg(x) return rpn_cls_score, rpn_bbox_pred def loss(self, cls_scores, bbox_preds, gt_bboxes, img_metas, cfg, gt_bboxes_ignore=None): losses = super(RPNHead, self).loss( cls_scores, bbox_preds, gt_bboxes, None, img_metas, cfg, gt_bboxes_ignore=gt_bboxes_ignore) return dict( loss_rpn_cls=losses['loss_cls'], loss_rpn_reg=losses['loss_reg']) def get_bboxes_single(self, cls_scores, bbox_preds, mlvl_anchors, img_shape, scale_factor, cfg, rescale=False): mlvl_proposals = [] for idx in range(len(cls_scores)): rpn_cls_score = cls_scores[idx] rpn_bbox_pred = bbox_preds[idx] assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] anchors = mlvl_anchors[idx] rpn_cls_score = rpn_cls_score.permute(1, 2, 0) if self.use_sigmoid_cls: rpn_cls_score = rpn_cls_score.reshape(-1) scores = rpn_cls_score.sigmoid() else: rpn_cls_score = rpn_cls_score.reshape(-1, 2) scores = rpn_cls_score.softmax(dim=1)[:, 1] rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4) if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre: _, topk_inds = scores.topk(cfg.nms_pre) rpn_bbox_pred = rpn_bbox_pred[topk_inds, :] anchors = anchors[topk_inds, :] scores = scores[topk_inds] proposals = delta2bbox(anchors, rpn_bbox_pred, self.target_means, self.target_stds, img_shape) if cfg.min_bbox_size > 0: w = proposals[:, 2] - proposals[:, 0] + 1 h = proposals[:, 3] - proposals[:, 1] + 1 valid_inds = torch.nonzero((w >= cfg.min_bbox_size) & (h >= cfg.min_bbox_size)).squeeze() proposals = proposals[valid_inds, :] scores = scores[valid_inds] proposals = torch.cat([proposals, scores.unsqueeze(-1)], dim=-1) proposals, _ = nms(proposals, cfg.nms_thr) proposals = proposals[:cfg.nms_post, :] mlvl_proposals.append(proposals) proposals = torch.cat(mlvl_proposals, 0) if cfg.nms_across_levels: proposals, _ = nms(proposals, cfg.nms_thr) proposals = proposals[:cfg.max_num, :] else: scores = proposals[:, 4] num = min(cfg.max_num, proposals.shape[0]) _, topk_inds = scores.topk(num) proposals = proposals[topk_inds, :] return proposals ================================================ FILE: mmdetection/mmdet/models/anchor_heads/ssd_head.py ================================================ import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import xavier_init from mmdet.core import (AnchorGenerator, anchor_target, weighted_smoothl1, multi_apply) from .anchor_head import AnchorHead from ..registry import HEADS @HEADS.register_module class SSDHead(AnchorHead): def __init__(self, input_size=300, num_classes=81, in_channels=(512, 1024, 512, 256, 256, 256), anchor_strides=(8, 16, 32, 64, 100, 300), basesize_ratio_range=(0.1, 0.9), anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]), target_means=(.0, .0, .0, .0), target_stds=(1.0, 1.0, 1.0, 1.0)): super(AnchorHead, self).__init__() self.input_size = input_size self.num_classes = num_classes self.in_channels = in_channels self.cls_out_channels = num_classes num_anchors = [len(ratios) * 2 + 2 for ratios in anchor_ratios] reg_convs = [] cls_convs = [] for i in range(len(in_channels)): reg_convs.append( nn.Conv2d( in_channels[i], num_anchors[i] * 4, kernel_size=3, padding=1)) cls_convs.append( nn.Conv2d( in_channels[i], num_anchors[i] * num_classes, kernel_size=3, padding=1)) self.reg_convs = nn.ModuleList(reg_convs) self.cls_convs = nn.ModuleList(cls_convs) min_ratio, max_ratio = basesize_ratio_range min_ratio = int(min_ratio * 100) max_ratio = int(max_ratio * 100) step = int(np.floor(max_ratio - min_ratio) / (len(in_channels) - 2)) min_sizes = [] max_sizes = [] for r in range(int(min_ratio), int(max_ratio) + 1, step): min_sizes.append(int(input_size * r / 100)) max_sizes.append(int(input_size * (r + step) / 100)) if input_size == 300: if basesize_ratio_range[0] == 0.15: # SSD300 COCO min_sizes.insert(0, int(input_size * 7 / 100)) max_sizes.insert(0, int(input_size * 15 / 100)) elif basesize_ratio_range[0] == 0.2: # SSD300 VOC min_sizes.insert(0, int(input_size * 10 / 100)) max_sizes.insert(0, int(input_size * 20 / 100)) elif input_size == 512: if basesize_ratio_range[0] == 0.1: # SSD512 COCO min_sizes.insert(0, int(input_size * 4 / 100)) max_sizes.insert(0, int(input_size * 10 / 100)) elif basesize_ratio_range[0] == 0.15: # SSD512 VOC min_sizes.insert(0, int(input_size * 7 / 100)) max_sizes.insert(0, int(input_size * 15 / 100)) self.anchor_generators = [] self.anchor_strides = anchor_strides for k in range(len(anchor_strides)): base_size = min_sizes[k] stride = anchor_strides[k] ctr = ((stride - 1) / 2., (stride - 1) / 2.) scales = [1., np.sqrt(max_sizes[k] / min_sizes[k])] ratios = [1.] for r in anchor_ratios[k]: ratios += [1 / r, r] # 4 or 6 ratio anchor_generator = AnchorGenerator( base_size, scales, ratios, scale_major=False, ctr=ctr) indices = list(range(len(ratios))) indices.insert(1, len(indices)) anchor_generator.base_anchors = torch.index_select( anchor_generator.base_anchors, 0, torch.LongTensor(indices)) self.anchor_generators.append(anchor_generator) self.target_means = target_means self.target_stds = target_stds self.use_sigmoid_cls = False self.cls_focal_loss = False def init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): xavier_init(m, distribution='uniform', bias=0) def forward(self, feats): cls_scores = [] bbox_preds = [] for feat, reg_conv, cls_conv in zip(feats, self.reg_convs, self.cls_convs): cls_scores.append(cls_conv(feat)) bbox_preds.append(reg_conv(feat)) return cls_scores, bbox_preds def loss_single(self, cls_score, bbox_pred, labels, label_weights, bbox_targets, bbox_weights, num_total_samples, cfg): loss_cls_all = F.cross_entropy( cls_score, labels, reduction='none') * label_weights pos_inds = (labels > 0).nonzero().view(-1) neg_inds = (labels == 0).nonzero().view(-1) num_pos_samples = pos_inds.size(0) num_neg_samples = cfg.neg_pos_ratio * num_pos_samples if num_neg_samples > neg_inds.size(0): num_neg_samples = neg_inds.size(0) topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples) loss_cls_pos = loss_cls_all[pos_inds].sum() loss_cls_neg = topk_loss_cls_neg.sum() loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples loss_reg = weighted_smoothl1( bbox_pred, bbox_targets, bbox_weights, beta=cfg.smoothl1_beta, avg_factor=num_total_samples) return loss_cls[None], loss_reg def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas, cfg, gt_bboxes_ignore=None): featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] assert len(featmap_sizes) == len(self.anchor_generators) anchor_list, valid_flag_list = self.get_anchors( featmap_sizes, img_metas) cls_reg_targets = anchor_target( anchor_list, valid_flag_list, gt_bboxes, img_metas, self.target_means, self.target_stds, cfg, gt_bboxes_ignore_list=gt_bboxes_ignore, gt_labels_list=gt_labels, label_channels=1, sampling=False, unmap_outputs=False) if cls_reg_targets is None: return None (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets num_images = len(img_metas) all_cls_scores = torch.cat([ s.permute(0, 2, 3, 1).reshape( num_images, -1, self.cls_out_channels) for s in cls_scores ], 1) all_labels = torch.cat(labels_list, -1).view(num_images, -1) all_label_weights = torch.cat(label_weights_list, -1).view( num_images, -1) all_bbox_preds = torch.cat([ b.permute(0, 2, 3, 1).reshape(num_images, -1, 4) for b in bbox_preds ], -2) all_bbox_targets = torch.cat(bbox_targets_list, -2).view( num_images, -1, 4) all_bbox_weights = torch.cat(bbox_weights_list, -2).view( num_images, -1, 4) losses_cls, losses_reg = multi_apply( self.loss_single, all_cls_scores, all_bbox_preds, all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, num_total_samples=num_total_pos, cfg=cfg) return dict(loss_cls=losses_cls, loss_reg=losses_reg) ================================================ FILE: mmdetection/mmdet/models/backbones/__init__.py ================================================ from .resnet import ResNet, make_res_layer from .resnext import ResNeXt from .ssd_vgg import SSDVGG __all__ = ['ResNet', 'make_res_layer', 'ResNeXt', 'SSDVGG'] ================================================ FILE: mmdetection/mmdet/models/backbones/resnet.py ================================================ import logging import torch.nn as nn import torch.utils.checkpoint as cp from torch.nn.modules.batchnorm import _BatchNorm from mmcv.cnn import constant_init, kaiming_init from mmcv.runner import load_checkpoint from mmdet.ops import DeformConv, ModulatedDeformConv from ..registry import BACKBONES from ..utils import build_conv_layer, build_norm_layer class BasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, style='pytorch', with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), dcn=None): super(BasicBlock, self).__init__() assert dcn is None, "Not implemented yet." self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) self.conv1 = build_conv_layer( conv_cfg, inplanes, planes, 3, stride=stride, padding=dilation, dilation=dilation, bias=False) self.add_module(self.norm1_name, norm1) self.conv2 = build_conv_layer( conv_cfg, planes, planes, 3, padding=1, bias=False) self.add_module(self.norm2_name, norm2) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride self.dilation = dilation assert not with_cp @property def norm1(self): return getattr(self, self.norm1_name) @property def norm2(self): return getattr(self, self.norm2_name) def forward(self, x): identity = x out = self.conv1(x) out = self.norm1(out) out = self.relu(out) out = self.conv2(out) out = self.norm2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, style='pytorch', with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), dcn=None): """Bottleneck block for ResNet. If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is "caffe", the stride-two layer is the first 1x1 conv layer. """ super(Bottleneck, self).__init__() assert style in ['pytorch', 'caffe'] assert dcn is None or isinstance(dcn, dict) self.inplanes = inplanes self.planes = planes self.stride = stride self.dilation = dilation self.style = style self.with_cp = with_cp self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.dcn = dcn self.with_dcn = dcn is not None if self.style == 'pytorch': self.conv1_stride = 1 self.conv2_stride = stride else: self.conv1_stride = stride self.conv2_stride = 1 self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) self.norm3_name, norm3 = build_norm_layer( norm_cfg, planes * self.expansion, postfix=3) self.conv1 = build_conv_layer( conv_cfg, inplanes, planes, kernel_size=1, stride=self.conv1_stride, bias=False) self.add_module(self.norm1_name, norm1) fallback_on_stride = False self.with_modulated_dcn = False if self.with_dcn: fallback_on_stride = dcn.get('fallback_on_stride', False) self.with_modulated_dcn = dcn.get('modulated', False) if not self.with_dcn or fallback_on_stride: self.conv2 = build_conv_layer( conv_cfg, planes, planes, kernel_size=3, stride=self.conv2_stride, padding=dilation, dilation=dilation, bias=False) else: assert conv_cfg is None, 'conv_cfg must be None for DCN' deformable_groups = dcn.get('deformable_groups', 1) if not self.with_modulated_dcn: conv_op = DeformConv offset_channels = 18 else: conv_op = ModulatedDeformConv offset_channels = 27 self.conv2_offset = nn.Conv2d( planes, deformable_groups * offset_channels, kernel_size=3, stride=self.conv2_stride, padding=dilation, dilation=dilation) self.conv2 = conv_op( planes, planes, kernel_size=3, stride=self.conv2_stride, padding=dilation, dilation=dilation, deformable_groups=deformable_groups, bias=False) self.add_module(self.norm2_name, norm2) self.conv3 = build_conv_layer( conv_cfg, planes, planes * self.expansion, kernel_size=1, bias=False) self.add_module(self.norm3_name, norm3) self.relu = nn.ReLU(inplace=True) self.downsample = downsample @property def norm1(self): return getattr(self, self.norm1_name) @property def norm2(self): return getattr(self, self.norm2_name) @property def norm3(self): return getattr(self, self.norm3_name) def forward(self, x): def _inner_forward(x): identity = x out = self.conv1(x) out = self.norm1(out) out = self.relu(out) if not self.with_dcn: out = self.conv2(out) elif self.with_modulated_dcn: offset_mask = self.conv2_offset(out) offset = offset_mask[:, :18, :, :] mask = offset_mask[:, -9:, :, :].sigmoid() out = self.conv2(out, offset, mask) else: offset = self.conv2_offset(out) out = self.conv2(out, offset) out = self.norm2(out) out = self.relu(out) out = self.conv3(out) out = self.norm3(out) if self.downsample is not None: identity = self.downsample(x) out += identity return out if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) out = self.relu(out) return out def make_res_layer(block, inplanes, planes, blocks, stride=1, dilation=1, style='pytorch', with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), dcn=None): downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = nn.Sequential( build_conv_layer( conv_cfg, inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), build_norm_layer(norm_cfg, planes * block.expansion)[1], ) layers = [] layers.append( block( inplanes, planes, stride, dilation, downsample, style=style, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, dcn=dcn)) inplanes = planes * block.expansion for i in range(1, blocks): layers.append( block( inplanes, planes, 1, dilation, style=style, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, dcn=dcn)) return nn.Sequential(*layers) @BACKBONES.register_module class ResNet(nn.Module): """ResNet backbone. Args: depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. num_stages (int): Resnet stages, normally 4. strides (Sequence[int]): Strides of the first block of each stage. dilations (Sequence[int]): Dilation of each stage. out_indices (Sequence[int]): Output from which stages. style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two layer is the 3x3 conv layer, otherwise the stride-two layer is the first 1x1 conv layer. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. norm_cfg (dict): dictionary to construct and config norm layer. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. zero_init_residual (bool): whether to use zero init for last norm layer in resblocks to let them behave as identity. """ arch_settings = { 18: (BasicBlock, (2, 2, 2, 2)), 34: (BasicBlock, (3, 4, 6, 3)), 50: (Bottleneck, (3, 4, 6, 3)), 101: (Bottleneck, (3, 4, 23, 3)), 152: (Bottleneck, (3, 8, 36, 3)) } def __init__(self, depth, num_stages=4, strides=(1, 2, 2, 2), dilations=(1, 1, 1, 1), out_indices=(0, 1, 2, 3), style='pytorch', frozen_stages=-1, conv_cfg=None, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, dcn=None, stage_with_dcn=(False, False, False, False), with_cp=False, zero_init_residual=True): super(ResNet, self).__init__() if depth not in self.arch_settings: raise KeyError('invalid depth {} for resnet'.format(depth)) self.depth = depth self.num_stages = num_stages assert num_stages >= 1 and num_stages <= 4 self.strides = strides self.dilations = dilations assert len(strides) == len(dilations) == num_stages self.out_indices = out_indices assert max(out_indices) < num_stages self.style = style self.frozen_stages = frozen_stages self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.with_cp = with_cp self.norm_eval = norm_eval self.dcn = dcn self.stage_with_dcn = stage_with_dcn if dcn is not None: assert len(stage_with_dcn) == num_stages self.zero_init_residual = zero_init_residual self.block, stage_blocks = self.arch_settings[depth] self.stage_blocks = stage_blocks[:num_stages] self.inplanes = 64 self._make_stem_layer() self.res_layers = [] for i, num_blocks in enumerate(self.stage_blocks): stride = strides[i] dilation = dilations[i] dcn = self.dcn if self.stage_with_dcn[i] else None planes = 64 * 2**i res_layer = make_res_layer( self.block, self.inplanes, planes, num_blocks, stride=stride, dilation=dilation, style=self.style, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, dcn=dcn) self.inplanes = planes * self.block.expansion layer_name = 'layer{}'.format(i + 1) self.add_module(layer_name, res_layer) self.res_layers.append(layer_name) self._freeze_stages() self.feat_dim = self.block.expansion * 64 * 2**( len(self.stage_blocks) - 1) @property def norm1(self): return getattr(self, self.norm1_name) def _make_stem_layer(self): self.conv1 = build_conv_layer( self.conv_cfg, 3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) self.add_module(self.norm1_name, norm1) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) def _freeze_stages(self): if self.frozen_stages >= 0: self.norm1.eval() for m in [self.conv1, self.norm1]: for param in m.parameters(): param.requires_grad = False for i in range(1, self.frozen_stages + 1): m = getattr(self, 'layer{}'.format(i)) m.eval() for param in m.parameters(): param.requires_grad = False def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = logging.getLogger() load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, (_BatchNorm, nn.GroupNorm)): constant_init(m, 1) if self.dcn is not None: for m in self.modules(): if isinstance(m, Bottleneck) and hasattr( m, 'conv2_offset'): constant_init(m.conv2_offset, 0) if self.zero_init_residual: for m in self.modules(): if isinstance(m, Bottleneck): constant_init(m.norm3, 0) elif isinstance(m, BasicBlock): constant_init(m.norm2, 0) else: raise TypeError('pretrained must be a str or None') def forward(self, x): x = self.conv1(x) x = self.norm1(x) x = self.relu(x) x = self.maxpool(x) outs = [] for i, layer_name in enumerate(self.res_layers): res_layer = getattr(self, layer_name) x = res_layer(x) if i in self.out_indices: outs.append(x) return tuple(outs) def train(self, mode=True): super(ResNet, self).train(mode) self._freeze_stages() if mode and self.norm_eval: for m in self.modules(): # trick: eval have effect on BatchNorm only if isinstance(m, _BatchNorm): m.eval() ================================================ FILE: mmdetection/mmdet/models/backbones/resnext.py ================================================ import math import torch.nn as nn from mmdet.ops import DeformConv, ModulatedDeformConv from .resnet import Bottleneck as _Bottleneck from .resnet import ResNet from ..registry import BACKBONES from ..utils import build_conv_layer, build_norm_layer class Bottleneck(_Bottleneck): def __init__(self, *args, groups=1, base_width=4, **kwargs): """Bottleneck block for ResNeXt. If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is "caffe", the stride-two layer is the first 1x1 conv layer. """ super(Bottleneck, self).__init__(*args, **kwargs) if groups == 1: width = self.planes else: width = math.floor(self.planes * (base_width / 64)) * groups self.norm1_name, norm1 = build_norm_layer( self.norm_cfg, width, postfix=1) self.norm2_name, norm2 = build_norm_layer( self.norm_cfg, width, postfix=2) self.norm3_name, norm3 = build_norm_layer( self.norm_cfg, self.planes * self.expansion, postfix=3) self.conv1 = build_conv_layer( self.conv_cfg, self.inplanes, width, kernel_size=1, stride=self.conv1_stride, bias=False) self.add_module(self.norm1_name, norm1) fallback_on_stride = False self.with_modulated_dcn = False if self.with_dcn: fallback_on_stride = self.dcn.get('fallback_on_stride', False) self.with_modulated_dcn = self.dcn.get('modulated', False) if not self.with_dcn or fallback_on_stride: self.conv2 = build_conv_layer( self.conv_cfg, width, width, kernel_size=3, stride=self.conv2_stride, padding=self.dilation, dilation=self.dilation, groups=groups, bias=False) else: assert self.conv_cfg is None, 'conv_cfg must be None for DCN' groups = self.dcn.get('groups', 1) deformable_groups = self.dcn.get('deformable_groups', 1) if not self.with_modulated_dcn: conv_op = DeformConv offset_channels = 18 else: conv_op = ModulatedDeformConv offset_channels = 27 self.conv2_offset = nn.Conv2d( width, deformable_groups * offset_channels, kernel_size=3, stride=self.conv2_stride, padding=self.dilation, dilation=self.dilation) self.conv2 = conv_op( width, width, kernel_size=3, stride=self.conv2_stride, padding=self.dilation, dilation=self.dilation, groups=groups, deformable_groups=deformable_groups, bias=False) self.add_module(self.norm2_name, norm2) self.conv3 = build_conv_layer( self.conv_cfg, width, self.planes * self.expansion, kernel_size=1, bias=False) self.add_module(self.norm3_name, norm3) def make_res_layer(block, inplanes, planes, blocks, stride=1, dilation=1, groups=1, base_width=4, style='pytorch', with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), dcn=None): downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = nn.Sequential( build_conv_layer( conv_cfg, inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), build_norm_layer(norm_cfg, planes * block.expansion)[1], ) layers = [] layers.append( block( inplanes, planes, stride=stride, dilation=dilation, downsample=downsample, groups=groups, base_width=base_width, style=style, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, dcn=dcn)) inplanes = planes * block.expansion for i in range(1, blocks): layers.append( block( inplanes, planes, stride=1, dilation=dilation, groups=groups, base_width=base_width, style=style, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, dcn=dcn)) return nn.Sequential(*layers) @BACKBONES.register_module class ResNeXt(ResNet): """ResNeXt backbone. Args: depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. num_stages (int): Resnet stages, normally 4. groups (int): Group of resnext. base_width (int): Base width of resnext. strides (Sequence[int]): Strides of the first block of each stage. dilations (Sequence[int]): Dilation of each stage. out_indices (Sequence[int]): Output from which stages. style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two layer is the 3x3 conv layer, otherwise the stride-two layer is the first 1x1 conv layer. frozen_stages (int): Stages to be frozen (all param fixed). -1 means not freezing any parameters. norm_cfg (dict): dictionary to construct and config norm layer. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. zero_init_residual (bool): whether to use zero init for last norm layer in resblocks to let them behave as identity. """ arch_settings = { 50: (Bottleneck, (3, 4, 6, 3)), 101: (Bottleneck, (3, 4, 23, 3)), 152: (Bottleneck, (3, 8, 36, 3)) } def __init__(self, groups=1, base_width=4, **kwargs): super(ResNeXt, self).__init__(**kwargs) self.groups = groups self.base_width = base_width self.inplanes = 64 self.res_layers = [] for i, num_blocks in enumerate(self.stage_blocks): stride = self.strides[i] dilation = self.dilations[i] dcn = self.dcn if self.stage_with_dcn[i] else None planes = 64 * 2**i res_layer = make_res_layer( self.block, self.inplanes, planes, num_blocks, stride=stride, dilation=dilation, groups=self.groups, base_width=self.base_width, style=self.style, with_cp=self.with_cp, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, dcn=dcn) self.inplanes = planes * self.block.expansion layer_name = 'layer{}'.format(i + 1) self.add_module(layer_name, res_layer) self.res_layers.append(layer_name) self._freeze_stages() ================================================ FILE: mmdetection/mmdet/models/backbones/ssd_vgg.py ================================================ import logging import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import (VGG, xavier_init, constant_init, kaiming_init, normal_init) from mmcv.runner import load_checkpoint from ..registry import BACKBONES @BACKBONES.register_module class SSDVGG(VGG): extra_setting = { 300: (256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256), 512: (256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128), } def __init__(self, input_size, depth, with_last_pool=False, ceil_mode=True, out_indices=(3, 4), out_feature_indices=(22, 34), l2_norm_scale=20.): super(SSDVGG, self).__init__( depth, with_last_pool=with_last_pool, ceil_mode=ceil_mode, out_indices=out_indices) assert input_size in (300, 512) self.input_size = input_size self.features.add_module( str(len(self.features)), nn.MaxPool2d(kernel_size=3, stride=1, padding=1)) self.features.add_module( str(len(self.features)), nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)) self.features.add_module( str(len(self.features)), nn.ReLU(inplace=True)) self.features.add_module( str(len(self.features)), nn.Conv2d(1024, 1024, kernel_size=1)) self.features.add_module( str(len(self.features)), nn.ReLU(inplace=True)) self.out_feature_indices = out_feature_indices self.inplanes = 1024 self.extra = self._make_extra_layers(self.extra_setting[input_size]) self.l2_norm = L2Norm( self.features[out_feature_indices[0] - 1].out_channels, l2_norm_scale) def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = logging.getLogger() load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.features.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, nn.BatchNorm2d): constant_init(m, 1) elif isinstance(m, nn.Linear): normal_init(m, std=0.01) else: raise TypeError('pretrained must be a str or None') for m in self.extra.modules(): if isinstance(m, nn.Conv2d): xavier_init(m, distribution='uniform') constant_init(self.l2_norm, self.l2_norm.scale) def forward(self, x): outs = [] for i, layer in enumerate(self.features): x = layer(x) if i in self.out_feature_indices: outs.append(x) for i, layer in enumerate(self.extra): x = F.relu(layer(x), inplace=True) if i % 2 == 1: outs.append(x) outs[0] = self.l2_norm(outs[0]) if len(outs) == 1: return outs[0] else: return tuple(outs) def _make_extra_layers(self, outplanes): layers = [] kernel_sizes = (1, 3) num_layers = 0 outplane = None for i in range(len(outplanes)): if self.inplanes == 'S': self.inplanes = outplane continue k = kernel_sizes[num_layers % 2] if outplanes[i] == 'S': outplane = outplanes[i + 1] conv = nn.Conv2d( self.inplanes, outplane, k, stride=2, padding=1) else: outplane = outplanes[i] conv = nn.Conv2d( self.inplanes, outplane, k, stride=1, padding=0) layers.append(conv) self.inplanes = outplanes[i] num_layers += 1 if self.input_size == 512: layers.append(nn.Conv2d(self.inplanes, 256, 4, padding=1)) return nn.Sequential(*layers) class L2Norm(nn.Module): def __init__(self, n_dims, scale=20., eps=1e-10): super(L2Norm, self).__init__() self.n_dims = n_dims self.weight = nn.Parameter(torch.Tensor(self.n_dims)) self.eps = eps self.scale = scale def forward(self, x): norm = x.pow(2).sum(1, keepdim=True).sqrt() + self.eps return self.weight[None, :, None, None].expand_as(x) * x / norm ================================================ FILE: mmdetection/mmdet/models/bbox_heads/__init__.py ================================================ from .bbox_head import BBoxHead from .convfc_bbox_head import ConvFCBBoxHead, SharedFCBBoxHead __all__ = ['BBoxHead', 'ConvFCBBoxHead', 'SharedFCBBoxHead'] ================================================ FILE: mmdetection/mmdet/models/bbox_heads/bbox_head.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from mmdet.core import (delta2bbox, multiclass_nms, bbox_target, weighted_cross_entropy, weighted_smoothl1, accuracy) from ..registry import HEADS @HEADS.register_module class BBoxHead(nn.Module): """Simplest RoI head, with only two fc layers for classification and regression respectively""" def __init__(self, with_avg_pool=False, with_cls=True, with_reg=True, roi_feat_size=7, in_channels=256, num_classes=81, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=False): super(BBoxHead, self).__init__() assert with_cls or with_reg self.with_avg_pool = with_avg_pool self.with_cls = with_cls self.with_reg = with_reg self.roi_feat_size = roi_feat_size self.in_channels = in_channels self.num_classes = num_classes self.target_means = target_means self.target_stds = target_stds self.reg_class_agnostic = reg_class_agnostic in_channels = self.in_channels if self.with_avg_pool: self.avg_pool = nn.AvgPool2d(roi_feat_size) else: in_channels *= (self.roi_feat_size * self.roi_feat_size) if self.with_cls: self.fc_cls = nn.Linear(in_channels, num_classes) if self.with_reg: out_dim_reg = 4 if reg_class_agnostic else 4 * num_classes self.fc_reg = nn.Linear(in_channels, out_dim_reg) self.debug_imgs = None def init_weights(self): if self.with_cls: nn.init.normal_(self.fc_cls.weight, 0, 0.01) nn.init.constant_(self.fc_cls.bias, 0) if self.with_reg: nn.init.normal_(self.fc_reg.weight, 0, 0.001) nn.init.constant_(self.fc_reg.bias, 0) def forward(self, x): if self.with_avg_pool: x = self.avg_pool(x) x = x.view(x.size(0), -1) cls_score = self.fc_cls(x) if self.with_cls else None bbox_pred = self.fc_reg(x) if self.with_reg else None return cls_score, bbox_pred def get_target(self, sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg): pos_proposals = [res.pos_bboxes for res in sampling_results] neg_proposals = [res.neg_bboxes for res in sampling_results] pos_gt_bboxes = [res.pos_gt_bboxes for res in sampling_results] pos_gt_labels = [res.pos_gt_labels for res in sampling_results] reg_classes = 1 if self.reg_class_agnostic else self.num_classes cls_reg_targets = bbox_target( pos_proposals, neg_proposals, pos_gt_bboxes, pos_gt_labels, rcnn_train_cfg, reg_classes, target_means=self.target_means, target_stds=self.target_stds) return cls_reg_targets def loss(self, cls_score, bbox_pred, labels, label_weights, bbox_targets, bbox_weights, reduce=True): losses = dict() if cls_score is not None: losses['loss_cls'] = weighted_cross_entropy( cls_score, labels, label_weights, reduce=reduce) losses['acc'] = accuracy(cls_score, labels) if bbox_pred is not None: pos_inds = labels > 0 if self.reg_class_agnostic: pos_bbox_pred = bbox_pred.view(bbox_pred.size(0), 4)[pos_inds] else: pos_bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4)[pos_inds, labels[pos_inds]] losses['loss_reg'] = weighted_smoothl1( pos_bbox_pred, bbox_targets[pos_inds], bbox_weights[pos_inds], avg_factor=bbox_targets.size(0)) return losses def get_det_bboxes(self, rois, cls_score, bbox_pred, img_shape, scale_factor, rescale=False, cfg=None): if isinstance(cls_score, list): cls_score = sum(cls_score) / float(len(cls_score)) scores = F.softmax(cls_score, dim=1) if cls_score is not None else None if bbox_pred is not None: bboxes = delta2bbox(rois[:, 1:], bbox_pred, self.target_means, self.target_stds, img_shape) else: bboxes = rois[:, 1:] # TODO: add clip here if rescale: bboxes /= scale_factor if cfg is None: return bboxes, scores else: det_bboxes, det_labels = multiclass_nms( bboxes, scores, cfg.score_thr, cfg.nms, cfg.max_per_img) return det_bboxes, det_labels def refine_bboxes(self, rois, labels, bbox_preds, pos_is_gts, img_metas): """Refine bboxes during training. Args: rois (Tensor): Shape (n*bs, 5), where n is image number per GPU, and bs is the sampled RoIs per image. labels (Tensor): Shape (n*bs, ). bbox_preds (Tensor): Shape (n*bs, 4) or (n*bs, 4*#class). pos_is_gts (list[Tensor]): Flags indicating if each positive bbox is a gt bbox. img_metas (list[dict]): Meta info of each image. Returns: list[Tensor]: Refined bboxes of each image in a mini-batch. """ img_ids = rois[:, 0].long().unique(sorted=True) assert img_ids.numel() == len(img_metas) bboxes_list = [] for i in range(len(img_metas)): inds = torch.nonzero(rois[:, 0] == i).squeeze() num_rois = inds.numel() bboxes_ = rois[inds, 1:] label_ = labels[inds] bbox_pred_ = bbox_preds[inds] img_meta_ = img_metas[i] pos_is_gts_ = pos_is_gts[i] bboxes = self.regress_by_class(bboxes_, label_, bbox_pred_, img_meta_) # filter gt bboxes pos_keep = 1 - pos_is_gts_ keep_inds = pos_is_gts_.new_ones(num_rois) keep_inds[:len(pos_is_gts_)] = pos_keep bboxes_list.append(bboxes[keep_inds]) return bboxes_list def regress_by_class(self, rois, label, bbox_pred, img_meta): """Regress the bbox for the predicted class. Used in Cascade R-CNN. Args: rois (Tensor): shape (n, 4) or (n, 5) label (Tensor): shape (n, ) bbox_pred (Tensor): shape (n, 4*(#class+1)) or (n, 4) img_meta (dict): Image meta info. Returns: Tensor: Regressed bboxes, the same shape as input rois. """ assert rois.size(1) == 4 or rois.size(1) == 5 if not self.reg_class_agnostic: label = label * 4 inds = torch.stack((label, label + 1, label + 2, label + 3), 1) bbox_pred = torch.gather(bbox_pred, 1, inds) assert bbox_pred.size(1) == 4 if rois.size(1) == 4: new_rois = delta2bbox(rois, bbox_pred, self.target_means, self.target_stds, img_meta['img_shape']) else: bboxes = delta2bbox(rois[:, 1:], bbox_pred, self.target_means, self.target_stds, img_meta['img_shape']) new_rois = torch.cat((rois[:, [0]], bboxes), dim=1) return new_rois ================================================ FILE: mmdetection/mmdet/models/bbox_heads/convfc_bbox_head.py ================================================ import torch.nn as nn from .bbox_head import BBoxHead from ..registry import HEADS from ..utils import ConvModule @HEADS.register_module class ConvFCBBoxHead(BBoxHead): """More general bbox head, with shared conv and fc layers and two optional separated branches. /-> cls convs -> cls fcs -> cls shared convs -> shared fcs \-> reg convs -> reg fcs -> reg """ # noqa: W605 def __init__(self, num_shared_convs=0, num_shared_fcs=0, num_cls_convs=0, num_cls_fcs=0, num_reg_convs=0, num_reg_fcs=0, conv_out_channels=256, fc_out_channels=1024, conv_cfg=None, norm_cfg=None, *args, **kwargs): super(ConvFCBBoxHead, self).__init__(*args, **kwargs) assert (num_shared_convs + num_shared_fcs + num_cls_convs + num_cls_fcs + num_reg_convs + num_reg_fcs > 0) if num_cls_convs > 0 or num_reg_convs > 0: assert num_shared_fcs == 0 if not self.with_cls: assert num_cls_convs == 0 and num_cls_fcs == 0 if not self.with_reg: assert num_reg_convs == 0 and num_reg_fcs == 0 self.num_shared_convs = num_shared_convs self.num_shared_fcs = num_shared_fcs self.num_cls_convs = num_cls_convs self.num_cls_fcs = num_cls_fcs self.num_reg_convs = num_reg_convs self.num_reg_fcs = num_reg_fcs self.conv_out_channels = conv_out_channels self.fc_out_channels = fc_out_channels self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg # add shared convs and fcs self.shared_convs, self.shared_fcs, last_layer_dim = \ self._add_conv_fc_branch( self.num_shared_convs, self.num_shared_fcs, self.in_channels, True) self.shared_out_channels = last_layer_dim # add cls specific branch self.cls_convs, self.cls_fcs, self.cls_last_dim = \ self._add_conv_fc_branch( self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels) # add reg specific branch self.reg_convs, self.reg_fcs, self.reg_last_dim = \ self._add_conv_fc_branch( self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels) if self.num_shared_fcs == 0 and not self.with_avg_pool: if self.num_cls_fcs == 0: self.cls_last_dim *= (self.roi_feat_size * self.roi_feat_size) if self.num_reg_fcs == 0: self.reg_last_dim *= (self.roi_feat_size * self.roi_feat_size) self.relu = nn.ReLU(inplace=True) # reconstruct fc_cls and fc_reg since input channels are changed if self.with_cls: self.fc_cls = nn.Linear(self.cls_last_dim, self.num_classes) if self.with_reg: out_dim_reg = (4 if self.reg_class_agnostic else 4 * self.num_classes) self.fc_reg = nn.Linear(self.reg_last_dim, out_dim_reg) def _add_conv_fc_branch(self, num_branch_convs, num_branch_fcs, in_channels, is_shared=False): """Add shared or separable branch convs -> avg pool (optional) -> fcs """ last_layer_dim = in_channels # add branch specific conv layers branch_convs = nn.ModuleList() if num_branch_convs > 0: for i in range(num_branch_convs): conv_in_channels = (last_layer_dim if i == 0 else self.conv_out_channels) branch_convs.append( ConvModule( conv_in_channels, self.conv_out_channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg)) last_layer_dim = self.conv_out_channels # add branch specific fc layers branch_fcs = nn.ModuleList() if num_branch_fcs > 0: # for shared branch, only consider self.with_avg_pool # for separated branches, also consider self.num_shared_fcs if (is_shared or self.num_shared_fcs == 0) and not self.with_avg_pool: last_layer_dim *= (self.roi_feat_size * self.roi_feat_size) for i in range(num_branch_fcs): fc_in_channels = (last_layer_dim if i == 0 else self.fc_out_channels) branch_fcs.append( nn.Linear(fc_in_channels, self.fc_out_channels)) last_layer_dim = self.fc_out_channels return branch_convs, branch_fcs, last_layer_dim def init_weights(self): super(ConvFCBBoxHead, self).init_weights() for module_list in [self.shared_fcs, self.cls_fcs, self.reg_fcs]: for m in module_list.modules(): if isinstance(m, nn.Linear): nn.init.xavier_uniform_(m.weight) nn.init.constant_(m.bias, 0) def forward(self, x): # shared part if self.num_shared_convs > 0: for conv in self.shared_convs: x = conv(x) if self.num_shared_fcs > 0: if self.with_avg_pool: x = self.avg_pool(x) x = x.view(x.size(0), -1) for fc in self.shared_fcs: x = self.relu(fc(x)) # separate branches x_cls = x x_reg = x for conv in self.cls_convs: x_cls = conv(x_cls) if x_cls.dim() > 2: if self.with_avg_pool: x_cls = self.avg_pool(x_cls) x_cls = x_cls.view(x_cls.size(0), -1) for fc in self.cls_fcs: x_cls = self.relu(fc(x_cls)) for conv in self.reg_convs: x_reg = conv(x_reg) if x_reg.dim() > 2: if self.with_avg_pool: x_reg = self.avg_pool(x_reg) x_reg = x_reg.view(x_reg.size(0), -1) for fc in self.reg_fcs: x_reg = self.relu(fc(x_reg)) cls_score = self.fc_cls(x_cls) if self.with_cls else None bbox_pred = self.fc_reg(x_reg) if self.with_reg else None return cls_score, bbox_pred @HEADS.register_module class SharedFCBBoxHead(ConvFCBBoxHead): def __init__(self, num_fcs=2, fc_out_channels=1024, *args, **kwargs): assert num_fcs >= 1 super(SharedFCBBoxHead, self).__init__( num_shared_convs=0, num_shared_fcs=num_fcs, num_cls_convs=0, num_cls_fcs=0, num_reg_convs=0, num_reg_fcs=0, fc_out_channels=fc_out_channels, *args, **kwargs) ================================================ FILE: mmdetection/mmdet/models/builder.py ================================================ import mmcv from torch import nn from .registry import (BACKBONES, NECKS, ROI_EXTRACTORS, SHARED_HEADS, HEADS, DETECTORS) def _build_module(cfg, registry, default_args): assert isinstance(cfg, dict) and 'type' in cfg assert isinstance(default_args, dict) or default_args is None args = cfg.copy() obj_type = args.pop('type') if mmcv.is_str(obj_type): if obj_type not in registry.module_dict: raise KeyError('{} is not in the {} registry'.format( obj_type, registry.name)) obj_type = registry.module_dict[obj_type] elif not isinstance(obj_type, type): raise TypeError('type must be a str or valid type, but got {}'.format( type(obj_type))) if default_args is not None: for name, value in default_args.items(): args.setdefault(name, value) return obj_type(**args) def build(cfg, registry, default_args=None): if isinstance(cfg, list): modules = [_build_module(cfg_, registry, default_args) for cfg_ in cfg] return nn.Sequential(*modules) else: return _build_module(cfg, registry, default_args) def build_backbone(cfg): return build(cfg, BACKBONES) def build_neck(cfg): return build(cfg, NECKS) def build_roi_extractor(cfg): return build(cfg, ROI_EXTRACTORS) def build_shared_head(cfg): return build(cfg, SHARED_HEADS) def build_head(cfg): return build(cfg, HEADS) def build_detector(cfg, train_cfg=None, test_cfg=None): return build(cfg, DETECTORS, dict(train_cfg=train_cfg, test_cfg=test_cfg)) ================================================ FILE: mmdetection/mmdet/models/detectors/__init__.py ================================================ from .base import BaseDetector from .single_stage import SingleStageDetector from .two_stage import TwoStageDetector from .rpn import RPN from .fast_rcnn import FastRCNN from .faster_rcnn import FasterRCNN from .mask_rcnn import MaskRCNN from .cascade_rcnn import CascadeRCNN from .htc import HybridTaskCascade from .retinanet import RetinaNet from .fcos import FCOS __all__ = [ 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN', 'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade', 'RetinaNet', 'FCOS' ] ================================================ FILE: mmdetection/mmdet/models/detectors/base.py ================================================ import logging from abc import ABCMeta, abstractmethod import mmcv import numpy as np import torch.nn as nn import pycocotools.mask as maskUtils from mmdet.core import tensor2imgs, get_classes class BaseDetector(nn.Module): """Base class for detectors""" __metaclass__ = ABCMeta def __init__(self): super(BaseDetector, self).__init__() @property def with_neck(self): return hasattr(self, 'neck') and self.neck is not None @property def with_shared_head(self): return hasattr(self, 'shared_head') and self.shared_head is not None @property def with_bbox(self): return hasattr(self, 'bbox_head') and self.bbox_head is not None @property def with_mask(self): return hasattr(self, 'mask_head') and self.mask_head is not None @abstractmethod def extract_feat(self, imgs): pass def extract_feats(self, imgs): assert isinstance(imgs, list) for img in imgs: yield self.extract_feat(img) @abstractmethod def forward_train(self, imgs, img_metas, **kwargs): pass @abstractmethod def simple_test(self, img, img_meta, **kwargs): pass @abstractmethod def aug_test(self, imgs, img_metas, **kwargs): pass def init_weights(self, pretrained=None): if pretrained is not None: logger = logging.getLogger() logger.info('load model from: {}'.format(pretrained)) def forward_test(self, imgs, img_metas, **kwargs): for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: if not isinstance(var, list): raise TypeError('{} must be a list, but got {}'.format( name, type(var))) num_augs = len(imgs) if num_augs != len(img_metas): raise ValueError( 'num of augmentations ({}) != num of image meta ({})'.format( len(imgs), len(img_metas))) # TODO: remove the restriction of imgs_per_gpu == 1 when prepared imgs_per_gpu = imgs[0].size(0) assert imgs_per_gpu == 1 if num_augs == 1: return self.simple_test(imgs[0], img_metas[0], **kwargs) else: return self.aug_test(imgs, img_metas, **kwargs) def forward(self, img, img_meta, return_loss=True, **kwargs): if return_loss: return self.forward_train(img, img_meta, **kwargs) else: return self.forward_test(img, img_meta, **kwargs) def show_result(self, data, result, img_norm_cfg, dataset=None, score_thr=0.3): if isinstance(result, tuple): bbox_result, segm_result = result else: bbox_result, segm_result = result, None img_tensor = data['img'][0] img_metas = data['img_meta'][0].data[0] imgs = tensor2imgs(img_tensor, **img_norm_cfg) assert len(imgs) == len(img_metas) if dataset is None: class_names = self.CLASSES elif isinstance(dataset, str): class_names = get_classes(dataset) elif isinstance(dataset, (list, tuple)): class_names = dataset else: raise TypeError( 'dataset must be a valid dataset name or a sequence' ' of class names, not {}'.format(type(dataset))) for img, img_meta in zip(imgs, img_metas): h, w, _ = img_meta['img_shape'] img_show = img[:h, :w, :] bboxes = np.vstack(bbox_result) # draw segmentation masks if segm_result is not None: segms = mmcv.concat_list(segm_result) inds = np.where(bboxes[:, -1] > score_thr)[0] for i in inds: color_mask = np.random.randint( 0, 256, (1, 3), dtype=np.uint8) mask = maskUtils.decode(segms[i]).astype(np.bool) img_show[mask] = img_show[mask] * 0.5 + color_mask * 0.5 # draw bounding boxes labels = [ np.full(bbox.shape[0], i, dtype=np.int32) for i, bbox in enumerate(bbox_result) ] labels = np.concatenate(labels) mmcv.imshow_det_bboxes( img_show, bboxes, labels, class_names=class_names, score_thr=score_thr) ================================================ FILE: mmdetection/mmdet/models/detectors/cascade_rcnn.py ================================================ from __future__ import division import torch import torch.nn as nn from .base import BaseDetector from .test_mixins import RPNTestMixin from .. import builder from ..registry import DETECTORS from mmdet.core import (build_assigner, bbox2roi, bbox2result, build_sampler, merge_aug_masks) @DETECTORS.register_module class CascadeRCNN(BaseDetector, RPNTestMixin): def __init__(self, num_stages, backbone, neck=None, shared_head=None, rpn_head=None, bbox_roi_extractor=None, bbox_head=None, mask_roi_extractor=None, mask_head=None, train_cfg=None, test_cfg=None, pretrained=None): assert bbox_roi_extractor is not None assert bbox_head is not None super(CascadeRCNN, self).__init__() self.num_stages = num_stages self.backbone = builder.build_backbone(backbone) if neck is not None: self.neck = builder.build_neck(neck) if rpn_head is not None: self.rpn_head = builder.build_head(rpn_head) if shared_head is not None: self.shared_head = builder.build_shared_head(shared_head) if bbox_head is not None: self.bbox_roi_extractor = nn.ModuleList() self.bbox_head = nn.ModuleList() if not isinstance(bbox_roi_extractor, list): bbox_roi_extractor = [ bbox_roi_extractor for _ in range(num_stages) ] if not isinstance(bbox_head, list): bbox_head = [bbox_head for _ in range(num_stages)] assert len(bbox_roi_extractor) == len(bbox_head) == self.num_stages for roi_extractor, head in zip(bbox_roi_extractor, bbox_head): self.bbox_roi_extractor.append( builder.build_roi_extractor(roi_extractor)) self.bbox_head.append(builder.build_head(head)) if mask_head is not None: self.mask_head = nn.ModuleList() if not isinstance(mask_head, list): mask_head = [mask_head for _ in range(num_stages)] assert len(mask_head) == self.num_stages for head in mask_head: self.mask_head.append(builder.build_head(head)) if mask_roi_extractor is not None: self.share_roi_extractor = False self.mask_roi_extractor = nn.ModuleList() if not isinstance(mask_roi_extractor, list): mask_roi_extractor = [ mask_roi_extractor for _ in range(num_stages) ] assert len(mask_roi_extractor) == self.num_stages for roi_extractor in mask_roi_extractor: self.mask_roi_extractor.append( builder.build_roi_extractor(roi_extractor)) else: self.share_roi_extractor = True self.mask_roi_extractor = self.bbox_roi_extractor self.train_cfg = train_cfg self.test_cfg = test_cfg self.init_weights(pretrained=pretrained) @property def with_rpn(self): return hasattr(self, 'rpn_head') and self.rpn_head is not None def init_weights(self, pretrained=None): super(CascadeRCNN, self).init_weights(pretrained) self.backbone.init_weights(pretrained=pretrained) if self.with_neck: if isinstance(self.neck, nn.Sequential): for m in self.neck: m.init_weights() else: self.neck.init_weights() if self.with_rpn: self.rpn_head.init_weights() if self.with_shared_head: self.shared_head.init_weights(pretrained=pretrained) for i in range(self.num_stages): if self.with_bbox: self.bbox_roi_extractor[i].init_weights() self.bbox_head[i].init_weights() if self.with_mask: if not self.share_roi_extractor: self.mask_roi_extractor[i].init_weights() self.mask_head[i].init_weights() def extract_feat(self, img): x = self.backbone(img) if self.with_neck: x = self.neck(x) return x def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): x = self.extract_feat(img) losses = dict() if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals for i in range(self.num_stages): self.current_stage = i rcnn_train_cfg = self.train_cfg.rcnn[i] lw = self.train_cfg.stage_loss_weights[i] # assign gts and sample proposals sampling_results = [] if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(rcnn_train_cfg.assigner) bbox_sampler = build_sampler( rcnn_train_cfg.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] for j in range(num_imgs): assign_result = bbox_assigner.assign( proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j], gt_labels[j]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[j], gt_bboxes[j], gt_labels[j], feats=[lvl_feat[j][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss bbox_roi_extractor = self.bbox_roi_extractor[i] bbox_head = self.bbox_head[i] rois = bbox2roi([res.bboxes for res in sampling_results]) bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = bbox_head(bbox_feats) bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg) loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets) for name, value in loss_bbox.items(): losses['s{}.{}'.format( i, name)] = (value * lw if 'loss' in name else value) # mask head forward and loss if self.with_mask: if not self.share_roi_extractor: mask_roi_extractor = self.mask_roi_extractor[i] pos_rois = bbox2roi( [res.pos_bboxes for res in sampling_results]) mask_feats = mask_roi_extractor( x[:mask_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: # reuse positive bbox feats pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones( res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros( res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] mask_head = self.mask_head[i] mask_pred = mask_head(mask_feats) mask_targets = mask_head.get_target(sampling_results, gt_masks, rcnn_train_cfg) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels) for name, value in loss_mask.items(): losses['s{}.{}'.format( i, name)] = (value * lw if 'loss' in name else value) # refine bboxes if i < self.num_stages - 1: pos_is_gts = [res.pos_is_gt for res in sampling_results] roi_labels = bbox_targets[0] # bbox_targets is a tuple with torch.no_grad(): proposal_list = bbox_head.refine_bboxes( rois, roi_labels, bbox_pred, pos_is_gts, img_meta) return losses def simple_test(self, img, img_meta, proposals=None, rescale=False): x = self.extract_feat(img) proposal_list = self.simple_test_rpn( x, img_meta, self.test_cfg.rpn) if proposals is None else proposals img_shape = img_meta[0]['img_shape'] ori_shape = img_meta[0]['ori_shape'] scale_factor = img_meta[0]['scale_factor'] # "ms" in variable names means multi-stage ms_bbox_result = {} ms_segm_result = {} ms_scores = [] rcnn_test_cfg = self.test_cfg.rcnn rois = bbox2roi(proposal_list) for i in range(self.num_stages): bbox_roi_extractor = self.bbox_roi_extractor[i] bbox_head = self.bbox_head[i] bbox_feats = bbox_roi_extractor( x[:len(bbox_roi_extractor.featmap_strides)], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = bbox_head(bbox_feats) ms_scores.append(cls_score) if self.test_cfg.keep_all_stages: det_bboxes, det_labels = bbox_head.get_det_bboxes( rois, cls_score, bbox_pred, img_shape, scale_factor, rescale=rescale, cfg=rcnn_test_cfg) bbox_result = bbox2result(det_bboxes, det_labels, bbox_head.num_classes) ms_bbox_result['stage{}'.format(i)] = bbox_result if self.with_mask: mask_roi_extractor = self.mask_roi_extractor[i] mask_head = self.mask_head[i] if det_bboxes.shape[0] == 0: segm_result = [ [] for _ in range(mask_head.num_classes - 1) ] else: _bboxes = (det_bboxes[:, :4] * scale_factor if rescale else det_bboxes) mask_rois = bbox2roi([_bboxes]) mask_feats = mask_roi_extractor( x[:len(mask_roi_extractor.featmap_strides)], mask_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats, i) mask_pred = mask_head(mask_feats) segm_result = mask_head.get_seg_masks( mask_pred, _bboxes, det_labels, rcnn_test_cfg, ori_shape, scale_factor, rescale) ms_segm_result['stage{}'.format(i)] = segm_result if i < self.num_stages - 1: bbox_label = cls_score.argmax(dim=1) rois = bbox_head.regress_by_class(rois, bbox_label, bbox_pred, img_meta[0]) cls_score = sum(ms_scores) / self.num_stages det_bboxes, det_labels = self.bbox_head[-1].get_det_bboxes( rois, cls_score, bbox_pred, img_shape, scale_factor, rescale=rescale, cfg=rcnn_test_cfg) bbox_result = bbox2result(det_bboxes, det_labels, self.bbox_head[-1].num_classes) ms_bbox_result['ensemble'] = bbox_result if self.with_mask: if det_bboxes.shape[0] == 0: segm_result = [ [] for _ in range(self.mask_head[-1].num_classes - 1) ] else: _bboxes = (det_bboxes[:, :4] * scale_factor if rescale else det_bboxes) mask_rois = bbox2roi([_bboxes]) aug_masks = [] for i in range(self.num_stages): mask_roi_extractor = self.mask_roi_extractor[i] mask_feats = mask_roi_extractor( x[:len(mask_roi_extractor.featmap_strides)], mask_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) mask_pred = self.mask_head[i](mask_feats) aug_masks.append(mask_pred.sigmoid().cpu().numpy()) merged_masks = merge_aug_masks(aug_masks, [img_meta] * self.num_stages, self.test_cfg.rcnn) segm_result = self.mask_head[-1].get_seg_masks( merged_masks, _bboxes, det_labels, rcnn_test_cfg, ori_shape, scale_factor, rescale) ms_segm_result['ensemble'] = segm_result if not self.test_cfg.keep_all_stages: if self.with_mask: results = (ms_bbox_result['ensemble'], ms_segm_result['ensemble']) else: results = ms_bbox_result['ensemble'] else: if self.with_mask: results = { stage: (ms_bbox_result[stage], ms_segm_result[stage]) for stage in ms_bbox_result } else: results = ms_bbox_result return results def aug_test(self, img, img_meta, proposals=None, rescale=False): raise NotImplementedError def show_result(self, data, result, img_norm_cfg, **kwargs): if self.with_mask: ms_bbox_result, ms_segm_result = result if isinstance(ms_bbox_result, dict): result = (ms_bbox_result['ensemble'], ms_segm_result['ensemble']) else: if isinstance(result, dict): result = result['ensemble'] super(CascadeRCNN, self).show_result(data, result, img_norm_cfg, **kwargs) ================================================ FILE: mmdetection/mmdet/models/detectors/ensemble_htc.py ================================================ from torch import nn from mmdet.core import (bbox2result, bbox_mapping) from mmdet.core import (bbox2roi, merge_aug_masks, merge_aug_bboxes, multiclass_nms, merge_aug_proposals) from mmdet.models.detectors import BaseDetector class EnsembleHTC(BaseDetector): def __init__(self, models): super().__init__() self.models = nn.ModuleList(models) def simple_test(self, img, img_meta, **kwargs): pass def forward_train(self, imgs, img_metas, **kwargs): pass def extract_feat(self, imgs): pass def aug_test(self, imgs, img_metas, **kwargs): """Test with augmentations. If rescale is False, then returned bboxes and masks will fit the scale of imgs[0]. """ rpn_test_cfg = self.models[0].test_cfg.rpn imgs_per_gpu = len(img_metas[0]) aug_proposals = [[] for _ in range(imgs_per_gpu)] for model in self.models: # recompute feats to save memory for x, img_meta in zip(model.extract_feats(imgs), img_metas): proposal_list = model.simple_test_rpn(x, img_meta, rpn_test_cfg) for i, proposals in enumerate(proposal_list): aug_proposals[i].append(proposals) # after merging, proposals will be rescaled to the original image size proposal_list = [ merge_aug_proposals(proposals, img_meta, rpn_test_cfg) for proposals, img_meta in zip(aug_proposals, img_metas) ] rcnn_test_cfg = self.models[0].test_cfg.rcnn aug_bboxes = [] aug_scores = [] aug_img_metas = [] for model in self.models: for x, img_meta in zip(model.extract_feats(imgs), img_metas): # only one image in the batch img_shape = img_meta[0]['img_shape'] scale_factor = img_meta[0]['scale_factor'] flip = img_meta[0]['flip'] proposals = bbox_mapping(proposal_list[0][:, :4], img_shape, scale_factor, flip) # "ms" in variable names means multi-stage ms_scores = [] rois = bbox2roi([proposals]) for i in range(model.num_stages): bbox_head = model.bbox_head[i] cls_score, bbox_pred = model._bbox_forward_test(i, x, rois) ms_scores.append(cls_score) if i < model.num_stages - 1: bbox_label = cls_score.argmax(dim=1) rois = bbox_head.regress_by_class(rois, bbox_label, bbox_pred, img_meta[0]) cls_score = sum(ms_scores) / float(len(ms_scores)) bboxes, scores = model.bbox_head[-1].get_det_bboxes( rois, cls_score, bbox_pred, img_shape, scale_factor, rescale=False, cfg=None) aug_bboxes.append(bboxes) aug_scores.append(scores) aug_img_metas.append(img_meta) # after merging, bboxes will be rescaled to the original image size merged_bboxes, merged_scores = merge_aug_bboxes( aug_bboxes, aug_scores, aug_img_metas, rcnn_test_cfg) det_bboxes, det_labels = multiclass_nms( merged_bboxes, merged_scores, rcnn_test_cfg.score_thr, rcnn_test_cfg.nms, rcnn_test_cfg.max_per_img) bbox_result = bbox2result(det_bboxes, det_labels, self.models[0].bbox_head[-1].num_classes) if self.models[0].with_mask: if det_bboxes.shape[0] == 0: segm_result = [[] for _ in range(self.models[0].mask_head[-1].num_classes - 1)] else: aug_masks = [] aug_img_metas = [] for model in self.models: for x, img_meta in zip(model.extract_feats(imgs), img_metas): img_shape = img_meta[0]['img_shape'] scale_factor = img_meta[0]['scale_factor'] flip = img_meta[0]['flip'] _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, scale_factor, flip) mask_rois = bbox2roi([_bboxes]) mask_roi_extractor = model.mask_roi_extractor[-1] mask_feats = mask_roi_extractor( x[:len(mask_roi_extractor.featmap_strides)], mask_rois) last_feat = None for i in range(model.num_stages): mask_head = model.mask_head[i] if model.mask_info_flow: mask_pred, last_feat = mask_head(mask_feats, last_feat) else: mask_pred = mask_head(mask_feats) aug_masks.append(mask_pred.sigmoid().cpu().numpy()) aug_img_metas.append(img_meta) merged_masks = merge_aug_masks(aug_masks, aug_img_metas, rcnn_test_cfg) ori_shape = img_metas[0][0]['ori_shape'] segm_result = self.models[0].mask_head[-1].get_seg_masks( merged_masks, det_bboxes, det_labels, rcnn_test_cfg, ori_shape, scale_factor=1.0, rescale=False) return bbox_result, segm_result else: return bbox_result ================================================ FILE: mmdetection/mmdet/models/detectors/fast_rcnn.py ================================================ from .two_stage import TwoStageDetector from ..registry import DETECTORS @DETECTORS.register_module class FastRCNN(TwoStageDetector): def __init__(self, backbone, bbox_roi_extractor, bbox_head, train_cfg, test_cfg, neck=None, shared_head=None, mask_roi_extractor=None, mask_head=None, pretrained=None): super(FastRCNN, self).__init__( backbone=backbone, neck=neck, shared_head=shared_head, bbox_roi_extractor=bbox_roi_extractor, bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, mask_roi_extractor=mask_roi_extractor, mask_head=mask_head, pretrained=pretrained) def forward_test(self, imgs, img_metas, proposals, **kwargs): for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: if not isinstance(var, list): raise TypeError('{} must be a list, but got {}'.format( name, type(var))) num_augs = len(imgs) if num_augs != len(img_metas): raise ValueError( 'num of augmentations ({}) != num of image meta ({})'.format( len(imgs), len(img_metas))) # TODO: remove the restriction of imgs_per_gpu == 1 when prepared imgs_per_gpu = imgs[0].size(0) assert imgs_per_gpu == 1 if num_augs == 1: return self.simple_test(imgs[0], img_metas[0], proposals[0], **kwargs) else: return self.aug_test(imgs, img_metas, proposals, **kwargs) ================================================ FILE: mmdetection/mmdet/models/detectors/faster_rcnn.py ================================================ from .two_stage import TwoStageDetector from ..registry import DETECTORS @DETECTORS.register_module class FasterRCNN(TwoStageDetector): def __init__(self, backbone, rpn_head, bbox_roi_extractor, bbox_head, train_cfg, test_cfg, neck=None, shared_head=None, pretrained=None): super(FasterRCNN, self).__init__( backbone=backbone, neck=neck, shared_head=shared_head, rpn_head=rpn_head, bbox_roi_extractor=bbox_roi_extractor, bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained) ================================================ FILE: mmdetection/mmdet/models/detectors/fcos.py ================================================ from .single_stage import SingleStageDetector from ..registry import DETECTORS @DETECTORS.register_module class FCOS(SingleStageDetector): def __init__(self, backbone, neck, bbox_head, train_cfg=None, test_cfg=None, pretrained=None): super(FCOS, self).__init__(backbone, neck, bbox_head, train_cfg, test_cfg, pretrained) ================================================ FILE: mmdetection/mmdet/models/detectors/htc.py ================================================ import torch import torch.nn.functional as F from mmdet.core import (bbox2result, build_assigner, build_sampler, bbox_mapping) from mmdet.core import (bbox2roi, merge_aug_masks, merge_aug_bboxes, multiclass_nms) from .cascade_rcnn import CascadeRCNN from .test_mixins import RPNTestMixin from .. import builder from ..registry import DETECTORS @DETECTORS.register_module class HybridTaskCascade(CascadeRCNN, RPNTestMixin): def __init__(self, num_stages, backbone, semantic_roi_extractor=None, semantic_head=None, semantic_fusion=('bbox', 'mask'), interleaved=True, mask_info_flow=True, **kwargs): super(HybridTaskCascade, self).__init__(num_stages, backbone, **kwargs) assert self.with_bbox and self.with_mask assert not self.with_shared_head # shared head not supported if semantic_head is not None: self.semantic_roi_extractor = builder.build_roi_extractor( semantic_roi_extractor) self.semantic_head = builder.build_head(semantic_head) self.semantic_fusion = semantic_fusion self.interleaved = interleaved self.mask_info_flow = mask_info_flow @property def with_semantic(self): if hasattr(self, 'semantic_head') and self.semantic_head is not None: return True else: return False def _bbox_forward_train(self, stage, x, sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg, semantic_feat=None): rois = bbox2roi([res.bboxes for res in sampling_results]) bbox_roi_extractor = self.bbox_roi_extractor[stage] bbox_head = self.bbox_head[stage] bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], rois) # semantic feature fusion # element-wise sum for original features and pooled semantic features if self.with_semantic and 'bbox' in self.semantic_fusion: bbox_semantic_feat = self.semantic_roi_extractor([semantic_feat], rois) if bbox_semantic_feat.shape[-2:] != bbox_feats.shape[-2:]: bbox_semantic_feat = F.adaptive_avg_pool2d( bbox_semantic_feat, bbox_feats.shape[-2:]) bbox_feats += bbox_semantic_feat cls_score, bbox_pred = bbox_head(bbox_feats) bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg) loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets) return loss_bbox, rois, bbox_targets, bbox_pred def _mask_forward_train(self, stage, x, sampling_results, gt_masks, rcnn_train_cfg, semantic_feat=None): mask_roi_extractor = self.mask_roi_extractor[stage] mask_head = self.mask_head[stage] pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs], pos_rois) # semantic feature fusion # element-wise sum for original features and pooled semantic features if self.with_semantic and 'mask' in self.semantic_fusion: mask_semantic_feat = self.semantic_roi_extractor([semantic_feat], pos_rois) if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]: mask_semantic_feat = F.adaptive_avg_pool2d( mask_semantic_feat, mask_feats.shape[-2:]) mask_feats += mask_semantic_feat # mask information flow # forward all previous mask heads to obtain last_feat, and fuse it # with the normal mask feature if self.mask_info_flow: last_feat = None for i in range(stage): last_feat = self.mask_head[i]( mask_feats, last_feat, return_logits=False) mask_pred = mask_head(mask_feats, last_feat, return_feat=False) else: mask_pred = mask_head(mask_feats) mask_targets = mask_head.get_target(sampling_results, gt_masks, rcnn_train_cfg) pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels) return loss_mask def _bbox_forward_test(self, stage, x, rois, semantic_feat=None): bbox_roi_extractor = self.bbox_roi_extractor[stage] bbox_head = self.bbox_head[stage] bbox_feats = bbox_roi_extractor( x[:len(bbox_roi_extractor.featmap_strides)], rois) if self.with_semantic and 'bbox' in self.semantic_fusion: bbox_semantic_feat = self.semantic_roi_extractor([semantic_feat], rois) if bbox_semantic_feat.shape[-2:] != bbox_feats.shape[-2:]: bbox_semantic_feat = F.adaptive_avg_pool2d( bbox_semantic_feat, bbox_feats.shape[-2:]) bbox_feats += bbox_semantic_feat cls_score, bbox_pred = bbox_head(bbox_feats) return cls_score, bbox_pred def _mask_forward_test(self, stage, x, bboxes, semantic_feat=None): mask_roi_extractor = self.mask_roi_extractor[stage] mask_head = self.mask_head[stage] mask_rois = bbox2roi([bboxes]) mask_feats = mask_roi_extractor( x[:len(mask_roi_extractor.featmap_strides)], mask_rois) if self.with_semantic and 'mask' in self.semantic_fusion: mask_semantic_feat = self.semantic_roi_extractor([semantic_feat], mask_rois) if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]: mask_semantic_feat = F.adaptive_avg_pool2d( mask_semantic_feat, mask_feats.shape[-2:]) mask_feats += mask_semantic_feat if self.mask_info_flow: last_feat = None last_pred = None for i in range(stage): mask_pred, last_feat = self.mask_head[i](mask_feats, last_feat) if last_pred is not None: mask_pred = mask_pred + last_pred last_pred = mask_pred mask_pred = mask_head(mask_feats, last_feat, return_feat=False) if last_pred is not None: mask_pred = mask_pred + last_pred else: mask_pred = mask_head(mask_feats) return mask_pred def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, gt_semantic_seg=None, proposals=None): x = self.extract_feat(img) losses = dict() # RPN part, the same as normal two-stage detectors if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # semantic segmentation part # 2 outputs: segmentation prediction and embedded features if self.with_semantic: semantic_pred, semantic_feat = self.semantic_head(x) loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_seg) losses['loss_semantic_seg'] = loss_seg else: semantic_feat = None for i in range(self.num_stages): self.current_stage = i rcnn_train_cfg = self.train_cfg.rcnn[i] lw = self.train_cfg.stage_loss_weights[i] # assign gts and sample proposals sampling_results = [] bbox_assigner = build_assigner(rcnn_train_cfg.assigner) bbox_sampler = build_sampler(rcnn_train_cfg.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] for j in range(num_imgs): assign_result = bbox_assigner.assign( proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j], gt_labels[j]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[j], gt_bboxes[j], gt_labels[j], feats=[lvl_feat[j][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss loss_bbox, rois, bbox_targets, bbox_pred = \ self._bbox_forward_train( i, x, sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg, semantic_feat) roi_labels = bbox_targets[0] for name, value in loss_bbox.items(): losses['s{}.{}'.format( i, name)] = (value * lw if 'loss' in name else value) # mask head forward and loss if self.with_mask: # interleaved execution: use regressed bboxes by the box branch # to train the mask branch if self.interleaved: pos_is_gts = [res.pos_is_gt for res in sampling_results] with torch.no_grad(): proposal_list = self.bbox_head[i].refine_bboxes( rois, roi_labels, bbox_pred, pos_is_gts, img_meta) # re-assign and sample 512 RoIs from 512 RoIs sampling_results = [] for j in range(num_imgs): assign_result = bbox_assigner.assign( proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j], gt_labels[j]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[j], gt_bboxes[j], gt_labels[j], feats=[lvl_feat[j][None] for lvl_feat in x]) sampling_results.append(sampling_result) loss_mask = self._mask_forward_train(i, x, sampling_results, gt_masks, rcnn_train_cfg, semantic_feat) for name, value in loss_mask.items(): losses['s{}.{}'.format( i, name)] = (value * lw if 'loss' in name else value) # refine bboxes (same as Cascade R-CNN) if i < self.num_stages - 1 and not self.interleaved: pos_is_gts = [res.pos_is_gt for res in sampling_results] with torch.no_grad(): proposal_list = self.bbox_head[i].refine_bboxes( rois, roi_labels, bbox_pred, pos_is_gts, img_meta) return losses def simple_test(self, img, img_meta, proposals=None, rescale=False): x = self.extract_feat(img) proposal_list = self.simple_test_rpn( x, img_meta, self.test_cfg.rpn) if proposals is None else proposals if self.with_semantic: _, semantic_feat = self.semantic_head(x) else: semantic_feat = None img_shape = img_meta[0]['img_shape'] ori_shape = img_meta[0]['ori_shape'] scale_factor = img_meta[0]['scale_factor'] # "ms" in variable names means multi-stage ms_bbox_result = {} ms_segm_result = {} ms_scores = [] rcnn_test_cfg = self.test_cfg.rcnn rois = bbox2roi(proposal_list) for i in range(self.num_stages): bbox_head = self.bbox_head[i] cls_score, bbox_pred = self._bbox_forward_test( i, x, rois, semantic_feat=semantic_feat) ms_scores.append(cls_score) if self.test_cfg.keep_all_stages: det_bboxes, det_labels = bbox_head.get_det_bboxes( rois, cls_score, bbox_pred, img_shape, scale_factor, rescale=rescale, nms_cfg=rcnn_test_cfg) bbox_result = bbox2result(det_bboxes, det_labels, bbox_head.num_classes) ms_bbox_result['stage{}'.format(i)] = bbox_result if self.with_mask: mask_head = self.mask_head[i] if det_bboxes.shape[0] == 0: segm_result = [ [] for _ in range(mask_head.num_classes - 1) ] else: _bboxes = (det_bboxes[:, :4] * scale_factor if rescale else det_bboxes) mask_pred = self._mask_forward_test( i, x, _bboxes, semantic_feat=semantic_feat) segm_result = mask_head.get_seg_masks( mask_pred, _bboxes, det_labels, rcnn_test_cfg, ori_shape, scale_factor, rescale) ms_segm_result['stage{}'.format(i)] = segm_result if i < self.num_stages - 1: bbox_label = cls_score.argmax(dim=1) rois = bbox_head.regress_by_class(rois, bbox_label, bbox_pred, img_meta[0]) cls_score = sum(ms_scores) / float(len(ms_scores)) det_bboxes, det_labels = self.bbox_head[-1].get_det_bboxes( rois, cls_score, bbox_pred, img_shape, scale_factor, rescale=rescale, cfg=rcnn_test_cfg) bbox_result = bbox2result(det_bboxes, det_labels, self.bbox_head[-1].num_classes) ms_bbox_result['ensemble'] = bbox_result if self.with_mask: if det_bboxes.shape[0] == 0: segm_result = [ [] for _ in range(self.mask_head[-1].num_classes - 1) ] else: _bboxes = (det_bboxes[:, :4] * scale_factor if rescale else det_bboxes) mask_rois = bbox2roi([_bboxes]) aug_masks = [] mask_roi_extractor = self.mask_roi_extractor[-1] mask_feats = mask_roi_extractor( x[:len(mask_roi_extractor.featmap_strides)], mask_rois) if self.with_semantic and 'mask' in self.semantic_fusion: mask_semantic_feat = self.semantic_roi_extractor( [semantic_feat], mask_rois) mask_feats += mask_semantic_feat last_feat = None for i in range(self.num_stages): mask_head = self.mask_head[i] if self.mask_info_flow: mask_pred, last_feat = mask_head(mask_feats, last_feat) else: mask_pred = mask_head(mask_feats) aug_masks.append(mask_pred.sigmoid().cpu().numpy()) merged_masks = merge_aug_masks(aug_masks, [img_meta] * self.num_stages, self.test_cfg.rcnn) segm_result = self.mask_head[-1].get_seg_masks( merged_masks, _bboxes, det_labels, rcnn_test_cfg, ori_shape, scale_factor, rescale) ms_segm_result['ensemble'] = segm_result if not self.test_cfg.keep_all_stages: if self.with_mask: results = (ms_bbox_result['ensemble'], ms_segm_result['ensemble']) else: results = ms_bbox_result['ensemble'] else: if self.with_mask: results = { stage: (ms_bbox_result[stage], ms_segm_result[stage]) for stage in ms_bbox_result } else: results = ms_bbox_result return results def aug_test(self, imgs, img_metas, proposals=None, rescale=False): """Test with augmentations. If rescale is False, then returned bboxes and masks will fit the scale of imgs[0]. """ # recompute feats to save memory proposal_list = self.aug_test_rpn( self.extract_feats(imgs), img_metas, self.test_cfg.rpn) rcnn_test_cfg = self.test_cfg.rcnn aug_bboxes = [] aug_scores = [] for x, img_meta in zip(self.extract_feats(imgs), img_metas): # only one image in the batch img_shape = img_meta[0]['img_shape'] scale_factor = img_meta[0]['scale_factor'] flip = img_meta[0]['flip'] proposals = bbox_mapping(proposal_list[0][:, :4], img_shape, scale_factor, flip) # "ms" in variable names means multi-stage ms_scores = [] rois = bbox2roi([proposals]) for i in range(self.num_stages): bbox_head = self.bbox_head[i] cls_score, bbox_pred = self._bbox_forward_test(i, x, rois) ms_scores.append(cls_score) if i < self.num_stages - 1: bbox_label = cls_score.argmax(dim=1) rois = bbox_head.regress_by_class(rois, bbox_label, bbox_pred, img_meta[0]) cls_score = sum(ms_scores) / float(len(ms_scores)) bboxes, scores = self.bbox_head[-1].get_det_bboxes( rois, cls_score, bbox_pred, img_shape, scale_factor, rescale=False, cfg=None) aug_bboxes.append(bboxes) aug_scores.append(scores) # after merging, bboxes will be rescaled to the original image size merged_bboxes, merged_scores = merge_aug_bboxes( aug_bboxes, aug_scores, img_metas, rcnn_test_cfg) det_bboxes, det_labels = multiclass_nms( merged_bboxes, merged_scores, rcnn_test_cfg.score_thr, rcnn_test_cfg.nms, rcnn_test_cfg.max_per_img) bbox_result = bbox2result(det_bboxes, det_labels, self.bbox_head[-1].num_classes) if self.with_mask: if det_bboxes.shape[0] == 0: segm_result = [[] for _ in range(self.mask_head[-1].num_classes - 1)] else: aug_masks = [] aug_img_metas = [] for x, img_meta in zip(self.extract_feats(imgs), img_metas): img_shape = img_meta[0]['img_shape'] scale_factor = img_meta[0]['scale_factor'] flip = img_meta[0]['flip'] _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, scale_factor, flip) mask_rois = bbox2roi([_bboxes]) mask_roi_extractor = self.mask_roi_extractor[-1] mask_feats = mask_roi_extractor( x[:len(mask_roi_extractor.featmap_strides)], mask_rois) last_feat = None for i in range(self.num_stages): mask_head = self.mask_head[i] if self.mask_info_flow: mask_pred, last_feat = mask_head(mask_feats, last_feat) else: mask_pred = mask_head(mask_feats) aug_masks.append(mask_pred.sigmoid().cpu().numpy()) aug_img_metas.append(img_meta) merged_masks = merge_aug_masks(aug_masks, aug_img_metas, self.test_cfg.rcnn) ori_shape = img_metas[0][0]['ori_shape'] segm_result = self.mask_head[-1].get_seg_masks( merged_masks, det_bboxes, det_labels, rcnn_test_cfg, ori_shape, scale_factor=1.0, rescale=False) return bbox_result, segm_result else: return bbox_result ================================================ FILE: mmdetection/mmdet/models/detectors/mask_rcnn.py ================================================ from .two_stage import TwoStageDetector from ..registry import DETECTORS @DETECTORS.register_module class MaskRCNN(TwoStageDetector): def __init__(self, backbone, rpn_head, bbox_roi_extractor, bbox_head, mask_roi_extractor, mask_head, train_cfg, test_cfg, neck=None, shared_head=None, pretrained=None): super(MaskRCNN, self).__init__( backbone=backbone, neck=neck, shared_head=shared_head, rpn_head=rpn_head, bbox_roi_extractor=bbox_roi_extractor, bbox_head=bbox_head, mask_roi_extractor=mask_roi_extractor, mask_head=mask_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained) ================================================ FILE: mmdetection/mmdet/models/detectors/retinanet.py ================================================ from .single_stage import SingleStageDetector from ..registry import DETECTORS @DETECTORS.register_module class RetinaNet(SingleStageDetector): def __init__(self, backbone, neck, bbox_head, train_cfg=None, test_cfg=None, pretrained=None): super(RetinaNet, self).__init__(backbone, neck, bbox_head, train_cfg, test_cfg, pretrained) ================================================ FILE: mmdetection/mmdet/models/detectors/rpn.py ================================================ import mmcv from mmdet.core import tensor2imgs, bbox_mapping from .base import BaseDetector from .test_mixins import RPNTestMixin from .. import builder from ..registry import DETECTORS @DETECTORS.register_module class RPN(BaseDetector, RPNTestMixin): def __init__(self, backbone, neck, rpn_head, train_cfg, test_cfg, pretrained=None): super(RPN, self).__init__() self.backbone = builder.build_backbone(backbone) self.neck = builder.build_neck(neck) if neck is not None else None self.rpn_head = builder.build_head(rpn_head) self.train_cfg = train_cfg self.test_cfg = test_cfg self.init_weights(pretrained=pretrained) def init_weights(self, pretrained=None): super(RPN, self).init_weights(pretrained) self.backbone.init_weights(pretrained=pretrained) if self.with_neck: self.neck.init_weights() self.rpn_head.init_weights() def extract_feat(self, img): x = self.backbone(img) if self.with_neck: x = self.neck(x) return x def forward_train(self, img, img_meta, gt_bboxes=None, gt_bboxes_ignore=None): if self.train_cfg.rpn.get('debug', False): self.rpn_head.debug_imgs = tensor2imgs(img) x = self.extract_feat(img) rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) return losses def simple_test(self, img, img_meta, rescale=False): x = self.extract_feat(img) proposal_list = self.simple_test_rpn(x, img_meta, self.test_cfg.rpn) if rescale: for proposals, meta in zip(proposal_list, img_meta): proposals[:, :4] /= meta['scale_factor'] # TODO: remove this restriction return proposal_list[0].cpu().numpy() def aug_test(self, imgs, img_metas, rescale=False): proposal_list = self.aug_test_rpn( self.extract_feats(imgs), img_metas, self.test_cfg.rpn) if not rescale: for proposals, img_meta in zip(proposal_list, img_metas[0]): img_shape = img_meta['img_shape'] scale_factor = img_meta['scale_factor'] flip = img_meta['flip'] proposals[:, :4] = bbox_mapping(proposals[:, :4], img_shape, scale_factor, flip) # TODO: remove this restriction return proposal_list[0].cpu().numpy() def show_result(self, data, result, img_norm_cfg, dataset=None, top_k=20): """Show RPN proposals on the image. Although we assume batch size is 1, this method supports arbitrary batch size. """ img_tensor = data['img'][0] img_metas = data['img_meta'][0].data[0] imgs = tensor2imgs(img_tensor, **img_norm_cfg) assert len(imgs) == len(img_metas) for img, img_meta in zip(imgs, img_metas): h, w, _ = img_meta['img_shape'] img_show = img[:h, :w, :] mmcv.imshow_bboxes(img_show, result, top_k=top_k) ================================================ FILE: mmdetection/mmdet/models/detectors/single_stage.py ================================================ import torch.nn as nn from .base import BaseDetector from .. import builder from ..registry import DETECTORS from mmdet.core import bbox2result @DETECTORS.register_module class SingleStageDetector(BaseDetector): def __init__(self, backbone, neck=None, bbox_head=None, train_cfg=None, test_cfg=None, pretrained=None): super(SingleStageDetector, self).__init__() self.backbone = builder.build_backbone(backbone) if neck is not None: self.neck = builder.build_neck(neck) self.bbox_head = builder.build_head(bbox_head) self.train_cfg = train_cfg self.test_cfg = test_cfg self.init_weights(pretrained=pretrained) def init_weights(self, pretrained=None): super(SingleStageDetector, self).init_weights(pretrained) self.backbone.init_weights(pretrained=pretrained) if self.with_neck: if isinstance(self.neck, nn.Sequential): for m in self.neck: m.init_weights() else: self.neck.init_weights() self.bbox_head.init_weights() def extract_feat(self, img): x = self.backbone(img) if self.with_neck: x = self.neck(x) return x def forward_train(self, img, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore=None): x = self.extract_feat(img) outs = self.bbox_head(x) loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, self.train_cfg) losses = self.bbox_head.loss( *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) return losses def simple_test(self, img, img_meta, rescale=False): x = self.extract_feat(img) outs = self.bbox_head(x) bbox_inputs = outs + (img_meta, self.test_cfg, rescale) bbox_list = self.bbox_head.get_bboxes(*bbox_inputs) bbox_results = [ bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) for det_bboxes, det_labels in bbox_list ] return bbox_results[0] def aug_test(self, imgs, img_metas, rescale=False): raise NotImplementedError ================================================ FILE: mmdetection/mmdet/models/detectors/test_mixins.py ================================================ from mmdet.core import (bbox2roi, bbox_mapping, merge_aug_proposals, merge_aug_bboxes, merge_aug_masks, multiclass_nms) class RPNTestMixin(object): def simple_test_rpn(self, x, img_meta, rpn_test_cfg): rpn_outs = self.rpn_head(x) proposal_inputs = rpn_outs + (img_meta, rpn_test_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) return proposal_list def aug_test_rpn(self, feats, img_metas, rpn_test_cfg): imgs_per_gpu = len(img_metas[0]) aug_proposals = [[] for _ in range(imgs_per_gpu)] for x, img_meta in zip(feats, img_metas): proposal_list = self.simple_test_rpn(x, img_meta, rpn_test_cfg) for i, proposals in enumerate(proposal_list): aug_proposals[i].append(proposals) # after merging, proposals will be rescaled to the original image size merged_proposals = [ merge_aug_proposals(proposals, img_meta, rpn_test_cfg) for proposals, img_meta in zip(aug_proposals, img_metas) ] return merged_proposals class BBoxTestMixin(object): def simple_test_bboxes(self, x, img_meta, proposals, rcnn_test_cfg, rescale=False): """Test only det bboxes without augmentation.""" rois = bbox2roi(proposals) roi_feats = self.bbox_roi_extractor( x[:len(self.bbox_roi_extractor.featmap_strides)], rois) if self.with_shared_head: roi_feats = self.shared_head(roi_feats) cls_score, bbox_pred = self.bbox_head(roi_feats) img_shape = img_meta[0]['img_shape'] scale_factor = img_meta[0]['scale_factor'] det_bboxes, det_labels = self.bbox_head.get_det_bboxes( rois, cls_score, bbox_pred, img_shape, scale_factor, rescale=rescale, cfg=rcnn_test_cfg) return det_bboxes, det_labels def aug_test_bboxes(self, feats, img_metas, proposal_list, rcnn_test_cfg): aug_bboxes = [] aug_scores = [] for x, img_meta in zip(feats, img_metas): # only one image in the batch img_shape = img_meta[0]['img_shape'] scale_factor = img_meta[0]['scale_factor'] flip = img_meta[0]['flip'] # TODO more flexible proposals = bbox_mapping(proposal_list[0][:, :4], img_shape, scale_factor, flip) rois = bbox2roi([proposals]) # recompute feature maps to save GPU memory roi_feats = self.bbox_roi_extractor( x[:len(self.bbox_roi_extractor.featmap_strides)], rois) if self.with_shared_head: roi_feats = self.shared_head(roi_feats) cls_score, bbox_pred = self.bbox_head(roi_feats) bboxes, scores = self.bbox_head.get_det_bboxes( rois, cls_score, bbox_pred, img_shape, scale_factor, rescale=False, cfg=None) aug_bboxes.append(bboxes) aug_scores.append(scores) # after merging, bboxes will be rescaled to the original image size merged_bboxes, merged_scores = merge_aug_bboxes( aug_bboxes, aug_scores, img_metas, rcnn_test_cfg) det_bboxes, det_labels = multiclass_nms( merged_bboxes, merged_scores, rcnn_test_cfg.score_thr, rcnn_test_cfg.nms, rcnn_test_cfg.max_per_img) return det_bboxes, det_labels class MaskTestMixin(object): def simple_test_mask(self, x, img_meta, det_bboxes, det_labels, rescale=False): # image shape of the first image in the batch (only one) ori_shape = img_meta[0]['ori_shape'] scale_factor = img_meta[0]['scale_factor'] if det_bboxes.shape[0] == 0: segm_result = [[] for _ in range(self.mask_head.num_classes - 1)] else: # if det_bboxes is rescaled to the original image size, we need to # rescale it back to the testing scale to obtain RoIs. _bboxes = (det_bboxes[:, :4] * scale_factor if rescale else det_bboxes) mask_rois = bbox2roi([_bboxes]) mask_feats = self.mask_roi_extractor( x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) mask_pred = self.mask_head(mask_feats) segm_result = self.mask_head.get_seg_masks( mask_pred, _bboxes, det_labels, self.test_cfg.rcnn, ori_shape, scale_factor, rescale) return segm_result def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels): if det_bboxes.shape[0] == 0: segm_result = [[] for _ in range(self.mask_head.num_classes - 1)] else: aug_masks = [] for x, img_meta in zip(feats, img_metas): img_shape = img_meta[0]['img_shape'] scale_factor = img_meta[0]['scale_factor'] flip = img_meta[0]['flip'] _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, scale_factor, flip) mask_rois = bbox2roi([_bboxes]) mask_feats = self.mask_roi_extractor( x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) mask_pred = self.mask_head(mask_feats) # convert to numpy array to save memory aug_masks.append(mask_pred.sigmoid().cpu().numpy()) merged_masks = merge_aug_masks(aug_masks, img_metas, self.test_cfg.rcnn) ori_shape = img_metas[0][0]['ori_shape'] segm_result = self.mask_head.get_seg_masks( merged_masks, det_bboxes, det_labels, self.test_cfg.rcnn, ori_shape, scale_factor=1.0, rescale=False) return segm_result ================================================ FILE: mmdetection/mmdet/models/detectors/two_stage.py ================================================ import torch import torch.nn as nn from .base import BaseDetector from .test_mixins import RPNTestMixin, BBoxTestMixin, MaskTestMixin from .. import builder from ..registry import DETECTORS from mmdet.core import bbox2roi, bbox2result, build_assigner, build_sampler @DETECTORS.register_module class TwoStageDetector(BaseDetector, RPNTestMixin, BBoxTestMixin, MaskTestMixin): def __init__(self, backbone, neck=None, shared_head=None, rpn_head=None, bbox_roi_extractor=None, bbox_head=None, mask_roi_extractor=None, mask_head=None, train_cfg=None, test_cfg=None, pretrained=None): super(TwoStageDetector, self).__init__() self.backbone = builder.build_backbone(backbone) if neck is not None: self.neck = builder.build_neck(neck) if shared_head is not None: self.shared_head = builder.build_shared_head(shared_head) if rpn_head is not None: self.rpn_head = builder.build_head(rpn_head) if bbox_head is not None: self.bbox_roi_extractor = builder.build_roi_extractor( bbox_roi_extractor) self.bbox_head = builder.build_head(bbox_head) if mask_head is not None: if mask_roi_extractor is not None: self.mask_roi_extractor = builder.build_roi_extractor( mask_roi_extractor) self.share_roi_extractor = False else: self.share_roi_extractor = True self.mask_roi_extractor = self.bbox_roi_extractor self.mask_head = builder.build_head(mask_head) self.train_cfg = train_cfg self.test_cfg = test_cfg self.init_weights(pretrained=pretrained) @property def with_rpn(self): return hasattr(self, 'rpn_head') and self.rpn_head is not None def init_weights(self, pretrained=None): super(TwoStageDetector, self).init_weights(pretrained) self.backbone.init_weights(pretrained=pretrained) if self.with_neck: if isinstance(self.neck, nn.Sequential): for m in self.neck: m.init_weights() else: self.neck.init_weights() if self.with_shared_head: self.shared_head.init_weights(pretrained=pretrained) if self.with_rpn: self.rpn_head.init_weights() if self.with_bbox: self.bbox_roi_extractor.init_weights() self.bbox_head.init_weights() if self.with_mask: self.mask_head.init_weights() if not self.share_roi_extractor: self.mask_roi_extractor.init_weights() def extract_feat(self, img): x = self.backbone(img) if self.with_neck: x = self.neck(x) return x def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): x = self.extract_feat(img) losses = dict() # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # assign gts and sample proposals if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler( self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign( proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = self.bbox_head(bbox_feats) bbox_targets = self.bbox_head.get_target( sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) # mask head forward and loss if self.with_mask: if not self.share_roi_extractor: pos_rois = bbox2roi( [res.pos_bboxes for res in sampling_results]) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones( res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros( res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] mask_pred = self.mask_head(mask_feats) mask_targets = self.mask_head.get_target( sampling_results, gt_masks, self.train_cfg.rcnn) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels) losses.update(loss_mask) return losses def simple_test(self, img, img_meta, proposals=None, rescale=False): """Test without augmentation.""" assert self.with_bbox, "Bbox head must be implemented." x = self.extract_feat(img) proposal_list = self.simple_test_rpn( x, img_meta, self.test_cfg.rpn) if proposals is None else proposals det_bboxes, det_labels = self.simple_test_bboxes( x, img_meta, proposal_list, self.test_cfg.rcnn, rescale=rescale) bbox_results = bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) if not self.with_mask: return bbox_results else: segm_results = self.simple_test_mask( x, img_meta, det_bboxes, det_labels, rescale=rescale) return bbox_results, segm_results def aug_test(self, imgs, img_metas, rescale=False): """Test with augmentations. If rescale is False, then returned bboxes and masks will fit the scale of imgs[0]. """ # recompute feats to save memory proposal_list = self.aug_test_rpn( self.extract_feats(imgs), img_metas, self.test_cfg.rpn) det_bboxes, det_labels = self.aug_test_bboxes( self.extract_feats(imgs), img_metas, proposal_list, self.test_cfg.rcnn) if rescale: _det_bboxes = det_bboxes else: _det_bboxes = det_bboxes.clone() _det_bboxes[:, :4] *= img_metas[0][0]['scale_factor'] bbox_results = bbox2result(_det_bboxes, det_labels, self.bbox_head.num_classes) # det_bboxes always keep the original scale if self.with_mask: segm_results = self.aug_test_mask( self.extract_feats(imgs), img_metas, det_bboxes, det_labels) return bbox_results, segm_results else: return bbox_results ================================================ FILE: mmdetection/mmdet/models/mask_heads/__init__.py ================================================ from .fcn_mask_head import FCNMaskHead from .htc_mask_head import HTCMaskHead from .fused_semantic_head import FusedSemanticHead __all__ = ['FCNMaskHead', 'HTCMaskHead', 'FusedSemanticHead'] ================================================ FILE: mmdetection/mmdet/models/mask_heads/fcn_mask_head.py ================================================ import mmcv import numpy as np import pycocotools.mask as mask_util import torch import torch.nn as nn from ..registry import HEADS from ..utils import ConvModule from mmdet.core import mask_cross_entropy, mask_target @HEADS.register_module class FCNMaskHead(nn.Module): def __init__(self, num_convs=4, roi_feat_size=14, in_channels=256, conv_kernel_size=3, conv_out_channels=256, upsample_method='deconv', upsample_ratio=2, num_classes=81, class_agnostic=False, conv_cfg=None, norm_cfg=None): super(FCNMaskHead, self).__init__() if upsample_method not in [None, 'deconv', 'nearest', 'bilinear']: raise ValueError( 'Invalid upsample method {}, accepted methods ' 'are "deconv", "nearest", "bilinear"'.format(upsample_method)) self.num_convs = num_convs self.roi_feat_size = roi_feat_size # WARN: not used and reserved self.in_channels = in_channels self.conv_kernel_size = conv_kernel_size self.conv_out_channels = conv_out_channels self.upsample_method = upsample_method self.upsample_ratio = upsample_ratio self.num_classes = num_classes self.class_agnostic = class_agnostic self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.convs = nn.ModuleList() for i in range(self.num_convs): in_channels = (self.in_channels if i == 0 else self.conv_out_channels) padding = (self.conv_kernel_size - 1) // 2 self.convs.append( ConvModule( in_channels, self.conv_out_channels, self.conv_kernel_size, padding=padding, conv_cfg=conv_cfg, norm_cfg=norm_cfg)) upsample_in_channels = (self.conv_out_channels if self.num_convs > 0 else in_channels) if self.upsample_method is None: self.upsample = None elif self.upsample_method == 'deconv': self.upsample = nn.ConvTranspose2d( upsample_in_channels, self.conv_out_channels, self.upsample_ratio, stride=self.upsample_ratio) else: self.upsample = nn.Upsample( scale_factor=self.upsample_ratio, mode=self.upsample_method) out_channels = 1 if self.class_agnostic else self.num_classes logits_in_channel = (self.conv_out_channels if self.upsample_method == 'deconv' else upsample_in_channels) self.conv_logits = nn.Conv2d(logits_in_channel, out_channels, 1) self.relu = nn.ReLU(inplace=True) self.debug_imgs = None def init_weights(self): for m in [self.upsample, self.conv_logits]: if m is None: continue nn.init.kaiming_normal_( m.weight, mode='fan_out', nonlinearity='relu') nn.init.constant_(m.bias, 0) def forward(self, x): for conv in self.convs: x = conv(x) if self.upsample is not None: x = self.upsample(x) if self.upsample_method == 'deconv': x = self.relu(x) mask_pred = self.conv_logits(x) return mask_pred def get_target(self, sampling_results, gt_masks, rcnn_train_cfg): pos_proposals = [res.pos_bboxes for res in sampling_results] pos_assigned_gt_inds = [ res.pos_assigned_gt_inds for res in sampling_results ] mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds, gt_masks, rcnn_train_cfg) return mask_targets def loss(self, mask_pred, mask_targets, labels): loss = dict() if self.class_agnostic: loss_mask = mask_cross_entropy(mask_pred, mask_targets, torch.zeros_like(labels)) else: loss_mask = mask_cross_entropy(mask_pred, mask_targets, labels) loss['loss_mask'] = loss_mask return loss def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg, ori_shape, scale_factor, rescale): """Get segmentation masks from mask_pred and bboxes. Args: mask_pred (Tensor or ndarray): shape (n, #class+1, h, w). For single-scale testing, mask_pred is the direct output of model, whose type is Tensor, while for multi-scale testing, it will be converted to numpy array outside of this method. det_bboxes (Tensor): shape (n, 4/5) det_labels (Tensor): shape (n, ) img_shape (Tensor): shape (3, ) rcnn_test_cfg (dict): rcnn testing config ori_shape: original image size Returns: list[list]: encoded masks """ if isinstance(mask_pred, torch.Tensor): mask_pred = mask_pred.sigmoid().cpu().numpy() assert isinstance(mask_pred, np.ndarray) cls_segms = [[] for _ in range(self.num_classes - 1)] bboxes = det_bboxes.cpu().numpy()[:, :4] labels = det_labels.cpu().numpy() + 1 if rescale: img_h, img_w = ori_shape[:2] else: img_h = np.round(ori_shape[0] * scale_factor).astype(np.int32) img_w = np.round(ori_shape[1] * scale_factor).astype(np.int32) scale_factor = 1.0 for i in range(bboxes.shape[0]): bbox = (bboxes[i, :] / scale_factor).astype(np.int32) label = labels[i] w = max(bbox[2] - bbox[0] + 1, 1) h = max(bbox[3] - bbox[1] + 1, 1) if not self.class_agnostic: mask_pred_ = mask_pred[i, label, :, :] else: mask_pred_ = mask_pred[i, 0, :, :] im_mask = np.zeros((img_h, img_w), dtype=np.uint8) bbox_mask = mmcv.imresize(mask_pred_, (w, h)) bbox_mask = (bbox_mask > rcnn_test_cfg.mask_thr_binary).astype( np.uint8) im_mask[bbox[1]:bbox[1] + h, bbox[0]:bbox[0] + w] = bbox_mask rle = mask_util.encode( np.array(im_mask[:, :, np.newaxis], order='F'))[0] cls_segms[label - 1].append(rle) return cls_segms ================================================ FILE: mmdetection/mmdet/models/mask_heads/fused_semantic_head.py ================================================ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import kaiming_init from ..registry import HEADS from ..utils import ConvModule @HEADS.register_module class FusedSemanticHead(nn.Module): """Multi-level fused semantic segmentation head. in_1 -> 1x1 conv --- | in_2 -> 1x1 conv -- | || in_3 -> 1x1 conv - || ||| /-> 1x1 conv (mask prediction) in_4 -> 1x1 conv -----> 3x3 convs (*4) | \-> 1x1 conv (feature) in_5 -> 1x1 conv --- """ # noqa: W605 def __init__(self, num_ins, fusion_level, num_convs=4, in_channels=256, conv_out_channels=256, num_classes=183, ignore_label=255, loss_weight=0.2, conv_cfg=None, norm_cfg=None): super(FusedSemanticHead, self).__init__() self.num_ins = num_ins self.fusion_level = fusion_level self.num_convs = num_convs self.in_channels = in_channels self.conv_out_channels = conv_out_channels self.num_classes = num_classes self.ignore_label = ignore_label self.loss_weight = loss_weight self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.lateral_convs = nn.ModuleList() for i in range(self.num_ins): self.lateral_convs.append( ConvModule( self.in_channels, self.in_channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, inplace=False)) self.convs = nn.ModuleList() for i in range(self.num_convs): in_channels = self.in_channels if i == 0 else conv_out_channels self.convs.append( ConvModule( in_channels, conv_out_channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg)) self.conv_embedding = ConvModule( conv_out_channels, conv_out_channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg) self.conv_logits = nn.Conv2d(conv_out_channels, self.num_classes, 1) self.criterion = nn.CrossEntropyLoss(ignore_index=ignore_label) def init_weights(self): kaiming_init(self.conv_logits) def forward(self, feats): x = self.lateral_convs[self.fusion_level](feats[self.fusion_level]) fused_size = tuple(x.shape[-2:]) for i, feat in enumerate(feats): if i != self.fusion_level: feat = F.interpolate( feat, size=fused_size, mode='bilinear', align_corners=True) x += self.lateral_convs[i](feat) for i in range(self.num_convs): x = self.convs[i](x) mask_pred = self.conv_logits(x) x = self.conv_embedding(x) return mask_pred, x def loss(self, mask_pred, labels): labels = labels.squeeze(1).long() loss_semantic_seg = self.criterion(mask_pred, labels) loss_semantic_seg *= self.loss_weight return loss_semantic_seg ================================================ FILE: mmdetection/mmdet/models/mask_heads/htc_mask_head.py ================================================ from .fcn_mask_head import FCNMaskHead from ..registry import HEADS from ..utils import ConvModule @HEADS.register_module class HTCMaskHead(FCNMaskHead): def __init__(self, *args, **kwargs): super(HTCMaskHead, self).__init__(*args, **kwargs) self.conv_res = ConvModule( self.conv_out_channels, self.conv_out_channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg) def init_weights(self): super(HTCMaskHead, self).init_weights() self.conv_res.init_weights() def forward(self, x, res_feat=None, return_logits=True, return_feat=True): if res_feat is not None: res_feat = self.conv_res(res_feat) x = x + res_feat for conv in self.convs: x = conv(x) res_feat = x outs = [] if return_logits: x = self.upsample(x) if self.upsample_method == 'deconv': x = self.relu(x) mask_pred = self.conv_logits(x) outs.append(mask_pred) if return_feat: outs.append(res_feat) return outs if len(outs) > 1 else outs[0] ================================================ FILE: mmdetection/mmdet/models/necks/__init__.py ================================================ from .fpn import FPN __all__ = ['FPN'] ================================================ FILE: mmdetection/mmdet/models/necks/fpn.py ================================================ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import xavier_init from ..registry import NECKS from ..utils import ConvModule @NECKS.register_module class FPN(nn.Module): def __init__(self, in_channels, out_channels, num_outs, start_level=0, end_level=-1, add_extra_convs=False, extra_convs_on_inputs=True, relu_before_extra_convs=False, conv_cfg=None, norm_cfg=None, activation=None): super(FPN, self).__init__() assert isinstance(in_channels, list) self.in_channels = in_channels self.out_channels = out_channels self.num_ins = len(in_channels) self.num_outs = num_outs self.activation = activation self.relu_before_extra_convs = relu_before_extra_convs if end_level == -1: self.backbone_end_level = self.num_ins assert num_outs >= self.num_ins - start_level else: # if end_level < inputs, no extra level is allowed self.backbone_end_level = end_level assert end_level <= len(in_channels) assert num_outs == end_level - start_level self.start_level = start_level self.end_level = end_level self.add_extra_convs = add_extra_convs self.extra_convs_on_inputs = extra_convs_on_inputs self.lateral_convs = nn.ModuleList() self.fpn_convs = nn.ModuleList() for i in range(self.start_level, self.backbone_end_level): l_conv = ConvModule( in_channels[i], out_channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, activation=self.activation, inplace=False) fpn_conv = ConvModule( out_channels, out_channels, 3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, activation=self.activation, inplace=False) self.lateral_convs.append(l_conv) self.fpn_convs.append(fpn_conv) # add extra conv layers (e.g., RetinaNet) extra_levels = num_outs - self.backbone_end_level + self.start_level if add_extra_convs and extra_levels >= 1: for i in range(extra_levels): if i == 0 and self.extra_convs_on_inputs: in_channels = self.in_channels[self.backbone_end_level - 1] else: in_channels = out_channels extra_fpn_conv = ConvModule( in_channels, out_channels, 3, stride=2, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, activation=self.activation, inplace=False) self.fpn_convs.append(extra_fpn_conv) # default init_weights for conv(msra) and norm in ConvModule def init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): xavier_init(m, distribution='uniform') def forward(self, inputs): assert len(inputs) == len(self.in_channels) # build laterals laterals = [ lateral_conv(inputs[i + self.start_level]) for i, lateral_conv in enumerate(self.lateral_convs) ] # build top-down path used_backbone_levels = len(laterals) for i in range(used_backbone_levels - 1, 0, -1): laterals[i - 1] += F.interpolate( laterals[i], scale_factor=2, mode='nearest') # build outputs # part 1: from original levels outs = [ self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) ] # part 2: add extra levels if self.num_outs > len(outs): # use max pool to get more levels on top of outputs # (e.g., Faster R-CNN, Mask R-CNN) if not self.add_extra_convs: for i in range(self.num_outs - used_backbone_levels): outs.append(F.max_pool2d(outs[-1], 1, stride=2)) # add conv layers on top of original feature maps (RetinaNet) else: if self.extra_convs_on_inputs: orig = inputs[self.backbone_end_level - 1] outs.append(self.fpn_convs[used_backbone_levels](orig)) else: outs.append(self.fpn_convs[used_backbone_levels](outs[-1])) for i in range(used_backbone_levels + 1, self.num_outs): if self.relu_before_extra_convs: outs.append(self.fpn_convs[i](F.relu(outs[-1]))) else: outs.append(self.fpn_convs[i](outs[-1])) return tuple(outs) ================================================ FILE: mmdetection/mmdet/models/registry.py ================================================ import torch.nn as nn class Registry(object): def __init__(self, name): self._name = name self._module_dict = dict() @property def name(self): return self._name @property def module_dict(self): return self._module_dict def _register_module(self, module_class): """Register a module. Args: module (:obj:`nn.Module`): Module to be registered. """ if not issubclass(module_class, nn.Module): raise TypeError('module must be a child of nn.Module, but got {}'. format(module_class)) module_name = module_class.__name__ if module_name in self._module_dict: raise KeyError('{} is already registered in {}'.format( module_name, self.name)) self._module_dict[module_name] = module_class def register_module(self, cls): self._register_module(cls) return cls BACKBONES = Registry('backbone') NECKS = Registry('neck') ROI_EXTRACTORS = Registry('roi_extractor') SHARED_HEADS = Registry('shared_head') HEADS = Registry('head') DETECTORS = Registry('detector') ================================================ FILE: mmdetection/mmdet/models/roi_extractors/__init__.py ================================================ from .single_level import SingleRoIExtractor __all__ = ['SingleRoIExtractor'] ================================================ FILE: mmdetection/mmdet/models/roi_extractors/single_level.py ================================================ from __future__ import division import torch import torch.nn as nn from mmdet import ops from ..registry import ROI_EXTRACTORS @ROI_EXTRACTORS.register_module class SingleRoIExtractor(nn.Module): """Extract RoI features from a single level feature map. If there are mulitple input feature levels, each RoI is mapped to a level according to its scale. Args: roi_layer (dict): Specify RoI layer type and arguments. out_channels (int): Output channels of RoI layers. featmap_strides (int): Strides of input feature maps. finest_scale (int): Scale threshold of mapping to level 0. """ def __init__(self, roi_layer, out_channels, featmap_strides, finest_scale=56): super(SingleRoIExtractor, self).__init__() self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides) self.out_channels = out_channels self.featmap_strides = featmap_strides self.finest_scale = finest_scale @property def num_inputs(self): """int: Input feature map levels.""" return len(self.featmap_strides) def init_weights(self): pass def build_roi_layers(self, layer_cfg, featmap_strides): cfg = layer_cfg.copy() layer_type = cfg.pop('type') assert hasattr(ops, layer_type) layer_cls = getattr(ops, layer_type) roi_layers = nn.ModuleList( [layer_cls(spatial_scale=1 / s, **cfg) for s in featmap_strides]) return roi_layers def map_roi_levels(self, rois, num_levels): """Map rois to corresponding feature levels by scales. - scale < finest_scale: level 0 - finest_scale <= scale < finest_scale * 2: level 1 - finest_scale * 2 <= scale < finest_scale * 4: level 2 - scale >= finest_scale * 4: level 3 Args: rois (Tensor): Input RoIs, shape (k, 5). num_levels (int): Total level number. Returns: Tensor: Level index (0-based) of each RoI, shape (k, ) """ scale = torch.sqrt( (rois[:, 3] - rois[:, 1] + 1) * (rois[:, 4] - rois[:, 2] + 1)) target_lvls = torch.floor(torch.log2(scale / self.finest_scale + 1e-6)) target_lvls = target_lvls.clamp(min=0, max=num_levels - 1).long() return target_lvls def forward(self, feats, rois): if len(feats) == 1: return self.roi_layers[0](feats[0], rois) out_size = self.roi_layers[0].out_size num_levels = len(feats) target_lvls = self.map_roi_levels(rois, num_levels) roi_feats = torch.cuda.FloatTensor(rois.size()[0], self.out_channels, out_size, out_size).fill_(0) for i in range(num_levels): inds = target_lvls == i if inds.any(): rois_ = rois[inds, :] roi_feats_t = self.roi_layers[i](feats[i], rois_) roi_feats[inds] += roi_feats_t return roi_feats ================================================ FILE: mmdetection/mmdet/models/shared_heads/__init__.py ================================================ from .res_layer import ResLayer __all__ = ['ResLayer'] ================================================ FILE: mmdetection/mmdet/models/shared_heads/res_layer.py ================================================ import logging import torch.nn as nn from mmcv.cnn import constant_init, kaiming_init from mmcv.runner import load_checkpoint from ..backbones import ResNet, make_res_layer from ..registry import SHARED_HEADS @SHARED_HEADS.register_module class ResLayer(nn.Module): def __init__(self, depth, stage=3, stride=2, dilation=1, style='pytorch', norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, with_cp=False, dcn=None): super(ResLayer, self).__init__() self.norm_eval = norm_eval self.norm_cfg = norm_cfg self.stage = stage block, stage_blocks = ResNet.arch_settings[depth] stage_block = stage_blocks[stage] planes = 64 * 2**stage inplanes = 64 * 2**(stage - 1) * block.expansion res_layer = make_res_layer( block, inplanes, planes, stage_block, stride=stride, dilation=dilation, style=style, with_cp=with_cp, norm_cfg=self.norm_cfg, dcn=dcn) self.add_module('layer{}'.format(stage + 1), res_layer) def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = logging.getLogger() load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, nn.BatchNorm2d): constant_init(m, 1) else: raise TypeError('pretrained must be a str or None') def forward(self, x): res_layer = getattr(self, 'layer{}'.format(self.stage + 1)) out = res_layer(x) return out def train(self, mode=True): super(ResLayer, self).train(mode) if self.norm_eval: for m in self.modules(): if isinstance(m, nn.BatchNorm2d): m.eval() ================================================ FILE: mmdetection/mmdet/models/utils/__init__.py ================================================ from .conv_ws import conv_ws_2d, ConvWS2d from .conv_module import build_conv_layer, ConvModule from .norm import build_norm_layer from .scale import Scale from .weight_init import (xavier_init, normal_init, uniform_init, kaiming_init, bias_init_with_prob) __all__ = [ 'conv_ws_2d', 'ConvWS2d', 'build_conv_layer', 'ConvModule', 'build_norm_layer', 'xavier_init', 'normal_init', 'uniform_init', 'kaiming_init', 'bias_init_with_prob', 'Scale' ] ================================================ FILE: mmdetection/mmdet/models/utils/conv_module.py ================================================ import warnings import torch.nn as nn from mmcv.cnn import kaiming_init, constant_init from .conv_ws import ConvWS2d from .norm import build_norm_layer conv_cfg = { 'Conv': nn.Conv2d, 'ConvWS': ConvWS2d, # TODO: octave conv } def build_conv_layer(cfg, *args, **kwargs): """ Build convolution layer Args: cfg (None or dict): cfg should contain: type (str): identify conv layer type. layer args: args needed to instantiate a conv layer. Returns: layer (nn.Module): created conv layer """ if cfg is None: cfg_ = dict(type='Conv') else: assert isinstance(cfg, dict) and 'type' in cfg cfg_ = cfg.copy() layer_type = cfg_.pop('type') if layer_type not in conv_cfg: raise KeyError('Unrecognized norm type {}'.format(layer_type)) else: conv_layer = conv_cfg[layer_type] layer = conv_layer(*args, **kwargs, **cfg_) return layer class ConvModule(nn.Module): """Conv-Norm-Activation block. Args: in_channels (int): Same as nn.Conv2d. out_channels (int): Same as nn.Conv2d. kernel_size (int or tuple[int]): Same as nn.Conv2d. stride (int or tuple[int]): Same as nn.Conv2d. padding (int or tuple[int]): Same as nn.Conv2d. dilation (int or tuple[int]): Same as nn.Conv2d. groups (int): Same as nn.Conv2d. bias (bool or str): If specified as `auto`, it will be decided by the norm_cfg. Bias will be set as True if norm_cfg is None, otherwise False. conv_cfg (dict): Config dict for convolution layer. norm_cfg (dict): Config dict for normalization layer. activation (str or None): Activation type, "ReLU" by default. inplace (bool): Whether to use inplace mode for activation. activate_last (bool): Whether to apply the activation layer in the last. (Do not use this flag since the behavior and api may be changed in the future.) """ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias='auto', conv_cfg=None, norm_cfg=None, activation='relu', inplace=True, activate_last=True): super(ConvModule, self).__init__() assert conv_cfg is None or isinstance(conv_cfg, dict) assert norm_cfg is None or isinstance(norm_cfg, dict) self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.activation = activation self.inplace = inplace self.activate_last = activate_last self.with_norm = norm_cfg is not None self.with_activatation = activation is not None # if the conv layer is before a norm layer, bias is unnecessary. if bias == 'auto': bias = False if self.with_norm else True self.with_bias = bias if self.with_norm and self.with_bias: warnings.warn('ConvModule has norm and bias at the same time') # build convolution layer self.conv = build_conv_layer(conv_cfg, in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) # export the attributes of self.conv to a higher level for convenience self.in_channels = self.conv.in_channels self.out_channels = self.conv.out_channels self.kernel_size = self.conv.kernel_size self.stride = self.conv.stride self.padding = self.conv.padding self.dilation = self.conv.dilation self.transposed = self.conv.transposed self.output_padding = self.conv.output_padding self.groups = self.conv.groups # build normalization layers if self.with_norm: norm_channels = out_channels if self.activate_last else in_channels self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) self.add_module(self.norm_name, norm) # build activation layer if self.with_activatation: if self.activation not in ['relu']: raise ValueError('{} is currently not supported.'.format( self.activation)) if self.activation == 'relu': self.activate = nn.ReLU(inplace=inplace) # Use msra init by default self.init_weights() @property def norm(self): return getattr(self, self.norm_name) def init_weights(self): nonlinearity = 'relu' if self.activation is None else self.activation kaiming_init(self.conv, nonlinearity=nonlinearity) if self.with_norm: constant_init(self.norm, 1, bias=0) def forward(self, x, activate=True, norm=True): if self.activate_last: x = self.conv(x) if norm and self.with_norm: x = self.norm(x) if activate and self.with_activatation: x = self.activate(x) else: # WARN: this may be removed or modified if norm and self.with_norm: x = self.norm(x) if activate and self.with_activatation: x = self.activate(x) x = self.conv(x) return x ================================================ FILE: mmdetection/mmdet/models/utils/conv_ws.py ================================================ import torch.nn as nn import torch.nn.functional as F def conv_ws_2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1, eps=1e-5): c_in = weight.size(0) weight_flat = weight.view(c_in, -1) mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1) weight = (weight - mean) / (std + eps) return F.conv2d(input, weight, bias, stride, padding, dilation, groups) class ConvWS2d(nn.Conv2d): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, eps=1e-5): super(ConvWS2d, self).__init__( in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) self.eps = eps def forward(self, x): return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.eps) ================================================ FILE: mmdetection/mmdet/models/utils/norm.py ================================================ import torch.nn as nn norm_cfg = { # format: layer_type: (abbreviation, module) 'BN': ('bn', nn.BatchNorm2d), 'SyncBN': ('bn', nn.SyncBatchNorm), 'GN': ('gn', nn.GroupNorm), # and potentially 'SN' } def build_norm_layer(cfg, num_features, postfix=''): """ Build normalization layer Args: cfg (dict): cfg should contain: type (str): identify norm layer type. layer args: args needed to instantiate a norm layer. requires_grad (bool): [optional] whether stop gradient updates num_features (int): number of channels from input. postfix (int, str): appended into norm abbreviation to create named layer. Returns: name (str): abbreviation + postfix layer (nn.Module): created norm layer """ assert isinstance(cfg, dict) and 'type' in cfg cfg_ = cfg.copy() layer_type = cfg_.pop('type') if layer_type not in norm_cfg: raise KeyError('Unrecognized norm type {}'.format(layer_type)) else: abbr, norm_layer = norm_cfg[layer_type] if norm_layer is None: raise NotImplementedError assert isinstance(postfix, (int, str)) name = abbr + str(postfix) requires_grad = cfg_.pop('requires_grad', True) cfg_.setdefault('eps', 1e-5) if layer_type != 'GN': layer = norm_layer(num_features, **cfg_) if layer_type == 'SyncBN': layer._specify_ddp_gpu_num(1) else: assert 'num_groups' in cfg_ layer = norm_layer(num_channels=num_features, **cfg_) for param in layer.parameters(): param.requires_grad = requires_grad return name, layer ================================================ FILE: mmdetection/mmdet/models/utils/scale.py ================================================ import torch import torch.nn as nn class Scale(nn.Module): def __init__(self, scale=1.0): super(Scale, self).__init__() self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) def forward(self, x): return x * self.scale ================================================ FILE: mmdetection/mmdet/models/utils/weight_init.py ================================================ import numpy as np import torch.nn as nn def xavier_init(module, gain=1, bias=0, distribution='normal'): assert distribution in ['uniform', 'normal'] if distribution == 'uniform': nn.init.xavier_uniform_(module.weight, gain=gain) else: nn.init.xavier_normal_(module.weight, gain=gain) if hasattr(module, 'bias'): nn.init.constant_(module.bias, bias) def normal_init(module, mean=0, std=1, bias=0): nn.init.normal_(module.weight, mean, std) if hasattr(module, 'bias'): nn.init.constant_(module.bias, bias) def uniform_init(module, a=0, b=1, bias=0): nn.init.uniform_(module.weight, a, b) if hasattr(module, 'bias'): nn.init.constant_(module.bias, bias) def kaiming_init(module, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal'): assert distribution in ['uniform', 'normal'] if distribution == 'uniform': nn.init.kaiming_uniform_( module.weight, mode=mode, nonlinearity=nonlinearity) else: nn.init.kaiming_normal_( module.weight, mode=mode, nonlinearity=nonlinearity) if hasattr(module, 'bias'): nn.init.constant_(module.bias, bias) def bias_init_with_prob(prior_prob): """ initialize conv/fc bias value according to giving probablity""" bias_init = float(-np.log((1 - prior_prob) / prior_prob)) return bias_init ================================================ FILE: mmdetection/mmdet/ops/__init__.py ================================================ from .dcn import (DeformConv, DeformConvPack, ModulatedDeformConv, ModulatedDeformConvPack, DeformRoIPooling, DeformRoIPoolingPack, ModulatedDeformRoIPoolingPack, deform_conv, modulated_deform_conv, deform_roi_pooling) from .nms import nms, soft_nms from .roi_align import RoIAlign, roi_align from .roi_pool import RoIPool, roi_pool from .sigmoid_focal_loss import SigmoidFocalLoss, sigmoid_focal_loss __all__ = [ 'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'DeformConv', 'DeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack', 'ModulatedDeformRoIPoolingPack', 'ModulatedDeformConv', 'ModulatedDeformConvPack', 'deform_conv', 'modulated_deform_conv', 'deform_roi_pooling', 'SigmoidFocalLoss', 'sigmoid_focal_loss' ] ================================================ FILE: mmdetection/mmdet/ops/dcn/__init__.py ================================================ from .functions.deform_conv import deform_conv, modulated_deform_conv from .functions.deform_pool import deform_roi_pooling from .modules.deform_conv import (DeformConv, ModulatedDeformConv, DeformConvPack, ModulatedDeformConvPack) from .modules.deform_pool import (DeformRoIPooling, DeformRoIPoolingPack, ModulatedDeformRoIPoolingPack) __all__ = [ 'DeformConv', 'DeformConvPack', 'ModulatedDeformConv', 'ModulatedDeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack', 'ModulatedDeformRoIPoolingPack', 'deform_conv', 'modulated_deform_conv', 'deform_roi_pooling' ] ================================================ FILE: mmdetection/mmdet/ops/dcn/functions/__init__.py ================================================ ================================================ FILE: mmdetection/mmdet/ops/dcn/functions/deform_conv.py ================================================ import torch from torch.autograd import Function from torch.nn.modules.utils import _pair from .. import deform_conv_cuda class DeformConvFunction(Function): @staticmethod def forward(ctx, input, offset, weight, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, im2col_step=64): if input is not None and input.dim() != 4: raise ValueError( "Expected 4D tensor as input, got {}D tensor instead.".format( input.dim())) ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) ctx.groups = groups ctx.deformable_groups = deformable_groups ctx.im2col_step = im2col_step ctx.save_for_backward(input, offset, weight) output = input.new_empty( DeformConvFunction._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride)) ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones if not input.is_cuda: raise NotImplementedError else: cur_im2col_step = min(ctx.im2col_step, input.shape[0]) assert (input.shape[0] % cur_im2col_step) == 0, 'im2col step must divide batchsize' deform_conv_cuda.deform_conv_forward_cuda( input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], ctx.padding[1], ctx.padding[0], ctx.dilation[1], ctx.dilation[0], ctx.groups, ctx.deformable_groups, cur_im2col_step) return output @staticmethod def backward(ctx, grad_output): input, offset, weight = ctx.saved_tensors grad_input = grad_offset = grad_weight = None if not grad_output.is_cuda: raise NotImplementedError else: cur_im2col_step = min(ctx.im2col_step, input.shape[0]) assert (input.shape[0] % cur_im2col_step) == 0, 'im2col step must divide batchsize' if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: grad_input = torch.zeros_like(input) grad_offset = torch.zeros_like(offset) deform_conv_cuda.deform_conv_backward_input_cuda( input, offset, grad_output, grad_input, grad_offset, weight, ctx.bufs_[0], weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], ctx.padding[1], ctx.padding[0], ctx.dilation[1], ctx.dilation[0], ctx.groups, ctx.deformable_groups, cur_im2col_step) if ctx.needs_input_grad[2]: grad_weight = torch.zeros_like(weight) deform_conv_cuda.deform_conv_backward_parameters_cuda( input, offset, grad_output, grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], ctx.padding[1], ctx.padding[0], ctx.dilation[1], ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1, cur_im2col_step) return (grad_input, grad_offset, grad_weight, None, None, None, None, None) @staticmethod def _output_size(input, weight, padding, dilation, stride): channels = weight.size(0) output_size = (input.size(0), channels) for d in range(input.dim() - 2): in_size = input.size(d + 2) pad = padding[d] kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 stride_ = stride[d] output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) if not all(map(lambda s: s > 0, output_size)): raise ValueError( "convolution input is too small (output would be {})".format( 'x'.join(map(str, output_size)))) return output_size class ModulatedDeformConvFunction(Function): @staticmethod def forward(ctx, input, offset, mask, weight, bias=None, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1): ctx.stride = stride ctx.padding = padding ctx.dilation = dilation ctx.groups = groups ctx.deformable_groups = deformable_groups ctx.with_bias = bias is not None if not ctx.with_bias: bias = input.new_empty(1) # fake tensor if not input.is_cuda: raise NotImplementedError if weight.requires_grad or mask.requires_grad or offset.requires_grad \ or input.requires_grad: ctx.save_for_backward(input, offset, mask, weight, bias) output = input.new_empty( ModulatedDeformConvFunction._infer_shape(ctx, input, weight)) ctx._bufs = [input.new_empty(0), input.new_empty(0)] deform_conv_cuda.modulated_deform_conv_cuda_forward( input, weight, bias, ctx._bufs[0], offset, mask, output, ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride, ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, ctx.groups, ctx.deformable_groups, ctx.with_bias) return output @staticmethod def backward(ctx, grad_output): if not grad_output.is_cuda: raise NotImplementedError input, offset, mask, weight, bias = ctx.saved_tensors grad_input = torch.zeros_like(input) grad_offset = torch.zeros_like(offset) grad_mask = torch.zeros_like(mask) grad_weight = torch.zeros_like(weight) grad_bias = torch.zeros_like(bias) deform_conv_cuda.modulated_deform_conv_cuda_backward( input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], grad_input, grad_weight, grad_bias, grad_offset, grad_mask, grad_output, weight.shape[2], weight.shape[3], ctx.stride, ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, ctx.groups, ctx.deformable_groups, ctx.with_bias) if not ctx.with_bias: grad_bias = None return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, None, None, None, None, None) @staticmethod def _infer_shape(ctx, input, weight): n = input.size(0) channels_out = weight.size(0) height, width = input.shape[2:4] kernel_h, kernel_w = weight.shape[2:4] height_out = (height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1 width_out = (width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1 return n, channels_out, height_out, width_out deform_conv = DeformConvFunction.apply modulated_deform_conv = ModulatedDeformConvFunction.apply ================================================ FILE: mmdetection/mmdet/ops/dcn/functions/deform_pool.py ================================================ import torch from torch.autograd import Function from .. import deform_pool_cuda class DeformRoIPoolingFunction(Function): @staticmethod def forward(ctx, data, rois, offset, spatial_scale, out_size, out_channels, no_trans, group_size=1, part_size=None, sample_per_part=4, trans_std=.0): ctx.spatial_scale = spatial_scale ctx.out_size = out_size ctx.out_channels = out_channels ctx.no_trans = no_trans ctx.group_size = group_size ctx.part_size = out_size if part_size is None else part_size ctx.sample_per_part = sample_per_part ctx.trans_std = trans_std assert 0.0 <= ctx.trans_std <= 1.0 if not data.is_cuda: raise NotImplementedError n = rois.shape[0] output = data.new_empty(n, out_channels, out_size, out_size) output_count = data.new_empty(n, out_channels, out_size, out_size) deform_pool_cuda.deform_psroi_pooling_cuda_forward( data, rois, offset, output, output_count, ctx.no_trans, ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part, ctx.trans_std) if data.requires_grad or rois.requires_grad or offset.requires_grad: ctx.save_for_backward(data, rois, offset) ctx.output_count = output_count return output @staticmethod def backward(ctx, grad_output): if not grad_output.is_cuda: raise NotImplementedError data, rois, offset = ctx.saved_tensors output_count = ctx.output_count grad_input = torch.zeros_like(data) grad_rois = None grad_offset = torch.zeros_like(offset) deform_pool_cuda.deform_psroi_pooling_cuda_backward( grad_output, data, rois, offset, output_count, grad_input, grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part, ctx.trans_std) return (grad_input, grad_rois, grad_offset, None, None, None, None, None, None, None, None) deform_roi_pooling = DeformRoIPoolingFunction.apply ================================================ FILE: mmdetection/mmdet/ops/dcn/modules/__init__.py ================================================ ================================================ FILE: mmdetection/mmdet/ops/dcn/modules/deform_conv.py ================================================ import math import torch import torch.nn as nn from torch.nn.modules.utils import _pair from ..functions.deform_conv import deform_conv, modulated_deform_conv class DeformConv(nn.Module): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=False): super(DeformConv, self).__init__() assert not bias assert in_channels % groups == 0, \ 'in_channels {} cannot be divisible by groups {}'.format( in_channels, groups) assert out_channels % groups == 0, \ 'out_channels {} cannot be divisible by groups {}'.format( out_channels, groups) self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = _pair(kernel_size) self.stride = _pair(stride) self.padding = _pair(padding) self.dilation = _pair(dilation) self.groups = groups self.deformable_groups = deformable_groups self.weight = nn.Parameter( torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size)) self.reset_parameters() def reset_parameters(self): n = self.in_channels for k in self.kernel_size: n *= k stdv = 1. / math.sqrt(n) self.weight.data.uniform_(-stdv, stdv) def forward(self, x, offset): return deform_conv(x, offset, self.weight, self.stride, self.padding, self.dilation, self.groups, self.deformable_groups) class DeformConvPack(DeformConv): def __init__(self, *args, **kwargs): super(DeformConvPack, self).__init__(*args, **kwargs) self.conv_offset = nn.Conv2d( self.in_channels, self.deformable_groups * 2 * self.kernel_size[0] * self.kernel_size[1], kernel_size=self.kernel_size, stride=_pair(self.stride), padding=_pair(self.padding), bias=True) self.init_offset() def init_offset(self): self.conv_offset.weight.data.zero_() self.conv_offset.bias.data.zero_() def forward(self, x): offset = self.conv_offset(x) return deform_conv(x, offset, self.weight, self.stride, self.padding, self.dilation, self.groups, self.deformable_groups) class ModulatedDeformConv(nn.Module): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True): super(ModulatedDeformConv, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = _pair(kernel_size) self.stride = stride self.padding = padding self.dilation = dilation self.groups = groups self.deformable_groups = deformable_groups self.with_bias = bias self.weight = nn.Parameter( torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)) if bias: self.bias = nn.Parameter(torch.Tensor(out_channels)) else: self.register_parameter('bias', None) self.reset_parameters() def reset_parameters(self): n = self.in_channels for k in self.kernel_size: n *= k stdv = 1. / math.sqrt(n) self.weight.data.uniform_(-stdv, stdv) if self.bias is not None: self.bias.data.zero_() def forward(self, x, offset, mask): return modulated_deform_conv(x, offset, mask, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.deformable_groups) class ModulatedDeformConvPack(ModulatedDeformConv): def __init__(self, *args, **kwargs): super(ModulatedDeformConvPack, self).__init__(*args, **kwargs) self.conv_offset_mask = nn.Conv2d( self.in_channels, self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1], kernel_size=self.kernel_size, stride=_pair(self.stride), padding=_pair(self.padding), bias=True) self.init_offset() def init_offset(self): self.conv_offset_mask.weight.data.zero_() self.conv_offset_mask.bias.data.zero_() def forward(self, x): out = self.conv_offset_mask(x) o1, o2, mask = torch.chunk(out, 3, dim=1) offset = torch.cat((o1, o2), dim=1) mask = torch.sigmoid(mask) return modulated_deform_conv(x, offset, mask, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.deformable_groups) ================================================ FILE: mmdetection/mmdet/ops/dcn/modules/deform_pool.py ================================================ from torch import nn from ..functions.deform_pool import deform_roi_pooling class DeformRoIPooling(nn.Module): def __init__(self, spatial_scale, out_size, out_channels, no_trans, group_size=1, part_size=None, sample_per_part=4, trans_std=.0): super(DeformRoIPooling, self).__init__() self.spatial_scale = spatial_scale self.out_size = out_size self.out_channels = out_channels self.no_trans = no_trans self.group_size = group_size self.part_size = out_size if part_size is None else part_size self.sample_per_part = sample_per_part self.trans_std = trans_std def forward(self, data, rois, offset): if self.no_trans: offset = data.new_empty(0) return deform_roi_pooling( data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) class DeformRoIPoolingPack(DeformRoIPooling): def __init__(self, spatial_scale, out_size, out_channels, no_trans, group_size=1, part_size=None, sample_per_part=4, trans_std=.0, num_offset_fcs=3, deform_fc_channels=1024): super(DeformRoIPoolingPack, self).__init__(spatial_scale, out_size, out_channels, no_trans, group_size, part_size, sample_per_part, trans_std) self.num_offset_fcs = num_offset_fcs self.deform_fc_channels = deform_fc_channels if not no_trans: seq = [] ic = self.out_size * self.out_size * self.out_channels for i in range(self.num_offset_fcs): if i < self.num_offset_fcs - 1: oc = self.deform_fc_channels else: oc = self.out_size * self.out_size * 2 seq.append(nn.Linear(ic, oc)) ic = oc if i < self.num_offset_fcs - 1: seq.append(nn.ReLU(inplace=True)) self.offset_fc = nn.Sequential(*seq) self.offset_fc[-1].weight.data.zero_() self.offset_fc[-1].bias.data.zero_() def forward(self, data, rois): assert data.size(1) == self.out_channels if self.no_trans: offset = data.new_empty(0) return deform_roi_pooling( data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) else: n = rois.shape[0] offset = data.new_empty(0) x = deform_roi_pooling(data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, True, self.group_size, self.part_size, self.sample_per_part, self.trans_std) offset = self.offset_fc(x.view(n, -1)) offset = offset.view(n, 2, self.out_size, self.out_size) return deform_roi_pooling( data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) class ModulatedDeformRoIPoolingPack(DeformRoIPooling): def __init__(self, spatial_scale, out_size, out_channels, no_trans, group_size=1, part_size=None, sample_per_part=4, trans_std=.0, num_offset_fcs=3, num_mask_fcs=2, deform_fc_channels=1024): super(ModulatedDeformRoIPoolingPack, self).__init__( spatial_scale, out_size, out_channels, no_trans, group_size, part_size, sample_per_part, trans_std) self.num_offset_fcs = num_offset_fcs self.num_mask_fcs = num_mask_fcs self.deform_fc_channels = deform_fc_channels if not no_trans: offset_fc_seq = [] ic = self.out_size * self.out_size * self.out_channels for i in range(self.num_offset_fcs): if i < self.num_offset_fcs - 1: oc = self.deform_fc_channels else: oc = self.out_size * self.out_size * 2 offset_fc_seq.append(nn.Linear(ic, oc)) ic = oc if i < self.num_offset_fcs - 1: offset_fc_seq.append(nn.ReLU(inplace=True)) self.offset_fc = nn.Sequential(*offset_fc_seq) self.offset_fc[-1].weight.data.zero_() self.offset_fc[-1].bias.data.zero_() mask_fc_seq = [] ic = self.out_size * self.out_size * self.out_channels for i in range(self.num_mask_fcs): if i < self.num_mask_fcs - 1: oc = self.deform_fc_channels else: oc = self.out_size * self.out_size mask_fc_seq.append(nn.Linear(ic, oc)) ic = oc if i < self.num_mask_fcs - 1: mask_fc_seq.append(nn.ReLU(inplace=True)) else: mask_fc_seq.append(nn.Sigmoid()) self.mask_fc = nn.Sequential(*mask_fc_seq) self.mask_fc[-2].weight.data.zero_() self.mask_fc[-2].bias.data.zero_() def forward(self, data, rois): assert data.size(1) == self.out_channels if self.no_trans: offset = data.new_empty(0) return deform_roi_pooling( data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) else: n = rois.shape[0] offset = data.new_empty(0) x = deform_roi_pooling(data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, True, self.group_size, self.part_size, self.sample_per_part, self.trans_std) offset = self.offset_fc(x.view(n, -1)) offset = offset.view(n, 2, self.out_size, self.out_size) mask = self.mask_fc(x.view(n, -1)) mask = mask.view(n, 1, self.out_size, self.out_size) return deform_roi_pooling( data, rois, offset, self.spatial_scale, self.out_size, self.out_channels, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) * mask ================================================ FILE: mmdetection/mmdet/ops/dcn/setup.py ================================================ from setuptools import setup from torch.utils.cpp_extension import BuildExtension, CUDAExtension setup( name='deform_conv', ext_modules=[ CUDAExtension('deform_conv_cuda', [ 'src/deform_conv_cuda.cpp', 'src/deform_conv_cuda_kernel.cu', ]), CUDAExtension('deform_pool_cuda', [ 'src/deform_pool_cuda.cpp', 'src/deform_pool_cuda_kernel.cu' ]), ], cmdclass={'build_ext': BuildExtension}) ================================================ FILE: mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.cpp ================================================ // modify from // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c #include #include #include void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor data_col); void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_im); void deformable_col2im_coord( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_offset); void modulated_deformable_im2col_cuda( const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor data_col); void modulated_deformable_col2im_cuda( const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_im); void modulated_deformable_col2im_coord_cuda( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_offset, at::Tensor grad_mask); void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, at::Tensor weight, int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, int group, int deformable_group) { AT_CHECK(weight.ndimension() == 4, "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " "but got: %s", weight.ndimension()); AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); AT_CHECK(kW > 0 && kH > 0, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); AT_CHECK((weight.size(2) == kH && weight.size(3) == kW), "kernel size should be consistent with weight, ", "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, kW, weight.size(2), weight.size(3)); AT_CHECK(dW > 0 && dH > 0, "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); AT_CHECK( dilationW > 0 && dilationH > 0, "dilation should be greater than 0, but got dilationH: %d dilationW: %d", dilationH, dilationW); int ndim = input.ndimension(); int dimf = 0; int dimh = 1; int dimw = 2; if (ndim == 4) { dimf++; dimh++; dimw++; } AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", ndim); long nInputPlane = weight.size(1) * group; long inputHeight = input.size(dimh); long inputWidth = input.size(dimw); long nOutputPlane = weight.size(0); long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; AT_CHECK(nInputPlane % deformable_group == 0, "input channels must divide deformable group size"); if (outputWidth < 1 || outputHeight < 1) AT_ERROR( "Given input size: (%ld x %ld x %ld). " "Calculated output size: (%ld x %ld x %ld). Output size is too small", nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, outputWidth); AT_CHECK(input.size(1) == nInputPlane, "invalid number of input planes, expected: %d, but got: %d", nInputPlane, input.size(1)); AT_CHECK((inputHeight >= kH && inputWidth >= kW), "input image is smaller than kernel"); AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), "invalid spatial size of offset, expected height: %d width: %d, but " "got height: %d width: %d", outputHeight, outputWidth, offset.size(2), offset.size(3)); AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), "invalid number of channels of offset"); if (gradOutput != NULL) { AT_CHECK(gradOutput->size(dimf) == nOutputPlane, "invalid number of gradOutput planes, expected: %d, but got: %d", nOutputPlane, gradOutput->size(dimf)); AT_CHECK((gradOutput->size(dimh) == outputHeight && gradOutput->size(dimw) == outputWidth), "invalid size of gradOutput, expected height: %d width: %d , but " "got height: %d width: %d", outputHeight, outputWidth, gradOutput->size(dimh), gradOutput->size(dimw)); } } int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, at::Tensor offset, at::Tensor output, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { // todo: resize columns to include im2col: done // todo: add im2col_step as input // todo: add new output buffer and transpose it to output (or directly // transpose output) todo: possibly change data indexing because of // parallel_imgs shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW, dilationH, dilationW, group, deformable_group); input = input.contiguous(); offset = offset.contiguous(); weight = weight.contiguous(); int batch = 1; if (input.ndimension() == 3) { // Force batch batch = 0; input.unsqueeze_(0); offset.unsqueeze_(0); } // todo: assert batchsize dividable by im2col_step long batchSize = input.size(0); long nInputPlane = input.size(1); long inputHeight = input.size(2); long inputWidth = input.size(3); long nOutputPlane = weight.size(0); long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth}); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.options()); if (ones.ndimension() != 2 || ones.size(0) * ones.size(1) < outputHeight * outputWidth) { ones = at::ones({outputHeight, outputWidth}, input.options()); } input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); offset = offset.view({batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); at::Tensor output_buffer = at::zeros({batchSize / im2col_step, nOutputPlane, im2col_step * outputHeight, outputWidth}, output.options()); output_buffer = output_buffer.view( {output_buffer.size(0), group, output_buffer.size(1) / group, output_buffer.size(2), output_buffer.size(3)}); for (int elt = 0; elt < batchSize / im2col_step; elt++) { deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, columns); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); for (int g = 0; g < group; g++) { output_buffer[elt][g] = output_buffer[elt][g] .flatten(1) .addmm_(weight[g].flatten(1), columns[g]) .view_as(output_buffer[elt][g]); } } output_buffer = output_buffer.view( {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), output_buffer.size(3), output_buffer.size(4)}); output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, outputHeight, outputWidth}); output_buffer.transpose_(1, 2); output.copy_(output_buffer); output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); offset = offset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); if (batch == 0) { output = output.view({nOutputPlane, outputHeight, outputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth}); offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); } return 1; } int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradInput, at::Tensor gradOffset, at::Tensor weight, at::Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW, dilationH, dilationW, group, deformable_group); input = input.contiguous(); offset = offset.contiguous(); gradOutput = gradOutput.contiguous(); weight = weight.contiguous(); int batch = 1; if (input.ndimension() == 3) { // Force batch batch = 0; input = input.view({1, input.size(0), input.size(1), input.size(2)}); offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); gradOutput = gradOutput.view( {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); } long batchSize = input.size(0); long nInputPlane = input.size(1); long inputHeight = input.size(2); long inputWidth = input.size(3); long nOutputPlane = weight.size(0); long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.options()); // change order of grad output gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth}); gradOutput.transpose_(1, 2); gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); offset = offset.view({batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); for (int elt = 0; elt < batchSize / im2col_step; elt++) { // divide into groups columns = columns.view({group, columns.size(0) / group, columns.size(1)}); weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); gradOutput = gradOutput.view( {gradOutput.size(0), group, gradOutput.size(1) / group, gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)}); for (int g = 0; g < group; g++) { columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), gradOutput[elt][g].flatten(1), 0.0f, 1.0f); } columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); gradOutput = gradOutput.view( {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, gradOffset[elt]); deformable_col2im(columns, offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, gradInput[elt]); } gradOutput.transpose_(1, 2); gradOutput = gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); gradOffset = gradOffset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); offset = offset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); if (batch == 0) { gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth}); gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); gradOffset = gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); } return 1; } int deform_conv_backward_parameters_cuda( at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradWeight, // at::Tensor gradBias, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, float scale, int im2col_step) { // todo: transpose and reshape outGrad // todo: reshape columns // todo: add im2col_step as input shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH, padW, dilationH, dilationW, group, deformable_group); input = input.contiguous(); offset = offset.contiguous(); gradOutput = gradOutput.contiguous(); int batch = 1; if (input.ndimension() == 3) { // Force batch batch = 0; input = input.view( at::IntList({1, input.size(0), input.size(1), input.size(2)})); gradOutput = gradOutput.view( {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); } long batchSize = input.size(0); long nInputPlane = input.size(1); long inputHeight = input.size(2); long inputWidth = input.size(3); long nOutputPlane = gradWeight.size(0); long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.options()); gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth}); gradOutput.transpose_(1, 2); at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); gradOutputBuffer = gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, outputHeight, outputWidth}); gradOutputBuffer.copy_(gradOutput); gradOutputBuffer = gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step * outputHeight, outputWidth}); gradOutput.transpose_(1, 2); gradOutput = gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); offset = offset.view({batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); for (int elt = 0; elt < batchSize / im2col_step; elt++) { deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, columns); // divide into group gradOutputBuffer = gradOutputBuffer.view( {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, gradOutputBuffer.size(2), gradOutputBuffer.size(3)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); gradWeight = gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1), gradWeight.size(2), gradWeight.size(3)}); for (int g = 0; g < group; g++) { gradWeight[g] = gradWeight[g] .flatten(1) .addmm_(gradOutputBuffer[elt][g].flatten(1), columns[g].transpose(1, 0), 1.0, scale) .view_as(gradWeight[g]); } gradOutputBuffer = gradOutputBuffer.view( {gradOutputBuffer.size(0), gradOutputBuffer.size(1) * gradOutputBuffer.size(2), gradOutputBuffer.size(3), gradOutputBuffer.size(4)}); columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), gradWeight.size(2), gradWeight.size(3), gradWeight.size(4)}); } input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); offset = offset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); if (batch == 0) { gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth}); } return 1; } void modulated_deform_conv_cuda_forward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, int kernel_h, int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int deformable_group, const bool with_bias) { AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_out = weight.size(0); const int channels_kernel = weight.size(1); const int kernel_h_ = weight.size(2); const int kernel_w_ = weight.size(3); if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); if (channels != channels_kernel * group) AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel * group); const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; if (ones.ndimension() != 2 || ones.size(0) * ones.size(1) < height_out * width_out) { // Resize plane and fill with ones... ones = at::ones({height_out, width_out}, input.options()); } // resize output output = output.view({batch, channels_out, height_out, width_out}).zero_(); // resize temporary columns columns = at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); output = output.view({output.size(0), group, output.size(1) / group, output.size(2), output.size(3)}); for (int b = 0; b < batch; b++) { modulated_deformable_im2col_cuda( input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns); // divide into group weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); for (int g = 0; g < group; g++) { output[b][g] = output[b][g] .flatten(1) .addmm_(weight[g].flatten(1), columns[g]) .view_as(output[b][g]); } weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), weight.size(3), weight.size(4)}); columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); } output = output.view({output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)}); if (with_bias) { output += bias.view({1, bias.size(0), 1, 1}); } } void modulated_deform_conv_cuda_backward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor columns, at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, const bool with_bias) { AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_kernel = weight.size(1); const int kernel_h_ = weight.size(2); const int kernel_w_ = weight.size(3); if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); if (channels != channels_kernel * group) AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel * group); const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; if (ones.ndimension() != 2 || ones.size(0) * ones.size(1) < height_out * width_out) { // Resize plane and fill with ones... ones = at::ones({height_out, width_out}, input.options()); } grad_input = grad_input.view({batch, channels, height, width}); columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out}, input.options()); grad_output = grad_output.view({grad_output.size(0), group, grad_output.size(1) / group, grad_output.size(2), grad_output.size(3)}); for (int b = 0; b < batch; b++) { // divide int group columns = columns.view({group, columns.size(0) / group, columns.size(1)}); weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); for (int g = 0; g < group; g++) { columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), grad_output[b][g].flatten(1), 0.0f, 1.0f); } columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), weight.size(3), weight.size(4)}); // gradient w.r.t. input coordinate data modulated_deformable_col2im_coord_cuda( columns, input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], grad_mask[b]); // gradient w.r.t. input data modulated_deformable_col2im_cuda( columns, offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, grad_input[b]); // gradient w.r.t. weight, dWeight should accumulate across the batch and // group modulated_deformable_im2col_cuda( input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); grad_weight = grad_weight.view({group, grad_weight.size(0) / group, grad_weight.size(1), grad_weight.size(2), grad_weight.size(3)}); if (with_bias) grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); for (int g = 0; g < group; g++) { grad_weight[g] = grad_weight[g] .flatten(1) .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) .view_as(grad_weight[g]); if (with_bias) { grad_bias[g] = grad_bias[g] .view({-1, 1}) .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) .view(-1); } } columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), grad_weight.size(2), grad_weight.size(3), grad_weight.size(4)}); if (with_bias) grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); } grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), grad_output.size(2), grad_output.size(3), grad_output.size(4)}); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("deform_conv_forward_cuda", &deform_conv_forward_cuda, "deform forward (CUDA)"); m.def("deform_conv_backward_input_cuda", &deform_conv_backward_input_cuda, "deform_conv_backward_input (CUDA)"); m.def("deform_conv_backward_parameters_cuda", &deform_conv_backward_parameters_cuda, "deform_conv_backward_parameters (CUDA)"); m.def("modulated_deform_conv_cuda_forward", &modulated_deform_conv_cuda_forward, "modulated deform conv forward (CUDA)"); m.def("modulated_deform_conv_cuda_backward", &modulated_deform_conv_cuda_backward, "modulated deform conv backward (CUDA)"); } ================================================ FILE: mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu ================================================ /*! ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** * * COPYRIGHT * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. * * LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * CONTRIBUTION AGREEMENT * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. * ***************** END Caffe Copyright Notice and Disclaimer ******************** * * Copyright (c) 2018 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file modulated_deformable_im2col.cuh * \brief Function definitions of converting an image to * column matrix based on kernel, padding, dilation, and offset. * These functions are mainly used in deformable convolution operators. * \ref: https://arxiv.org/abs/1703.06211 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng */ // modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu #include #include #include #include #include using namespace at; #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; const int kMaxGridNum = 65535; inline int GET_BLOCKS(const int N) { return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); } template __device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width, const int height, const int width, scalar_t h, scalar_t w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; scalar_t lh = h - h_low; scalar_t lw = w - w_low; scalar_t hh = 1 - lh, hw = 1 - lw; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = bottom_data[h_low * data_width + w_high]; scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = bottom_data[h_high * data_width + w_low]; scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) v4 = bottom_data[h_high * data_width + w_high]; scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, const int h, const int w, const int height, const int width) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (h == argmax_h_low && w == argmax_w_low) weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); if (h == argmax_h_low && w == argmax_w_high) weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); if (h == argmax_h_high && w == argmax_w_low) weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); if (h == argmax_h_high && w == argmax_w_high) weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); return weight; } template __device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, const int height, const int width, const scalar_t *im_data, const int data_width, const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (bp_dir == 0) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } else if (bp_dir == 1) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } return weight; } template __global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int num_channels, const int deformable_group, const int height_col, const int width_col, scalar_t *data_col) { CUDA_KERNEL_LOOP(index, n) { // index index of output matrix const int w_col = index % width_col; const int h_col = (index / width_col) % height_col; const int b_col = (index / width_col / height_col) % batch_size; const int c_im = (index / width_col / height_col) / batch_size; const int c_col = c_im * kernel_h * kernel_w; // compute deformable group index const int deformable_group_index = c_im / channel_per_deformable_group; const int h_in = h_col * stride_h - pad_h; const int w_in = w_col * stride_w - pad_w; scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; scalar_t val = static_cast(0); const scalar_t h_im = h_in + i * dilation_h + offset_h; const scalar_t w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { //const scalar_t map_h = i * dilation_h + offset_h; //const scalar_t map_w = j * dilation_w + offset_w; //const int cur_height = height - h_in; //const int cur_width = width - w_in; //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val; data_col_ptr += batch_size * height_col * width_col; } } } } void deformable_im2col( const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor data_col) { // num_axes should be smaller than block size // todo: check parallel_imgs is correctly passed in int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * height_col * width_col * parallel_imgs; int channel_per_deformable_group = channels / deformable_group; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_im.type(), "deformable_im2col_gpu", ([&] { const scalar_t *data_im_ = data_im.data(); const scalar_t *data_offset_ = data_offset.data(); scalar_t *data_col_ = data_col.data(); deformable_im2col_gpu_kernel<<>>( num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, channels, deformable_group, height_col, width_col, data_col_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); } } template __global__ void deformable_col2im_gpu_kernel( const int n, const scalar_t *data_col, const scalar_t *data_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int deformable_group, const int height_col, const int width_col, scalar_t *grad_im) { CUDA_KERNEL_LOOP(index, n) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; const scalar_t cur_top_grad = data_col[index]; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; scalar_t weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } } } } } void deformable_col2im( const at::Tensor data_col, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_im) { // todo: make sure parallel_imgs is passed in correctly int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; int channel_per_deformable_group = channels / deformable_group; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.type(), "deformable_col2im_gpu", ([&] { const scalar_t *data_col_ = data_col.data(); const scalar_t *data_offset_ = data_offset.data(); scalar_t *grad_im_ = grad_im.data(); deformable_col2im_gpu_kernel<<>>( num_kernels, data_col_, data_offset_, channels, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, deformable_group, height_col, width_col, grad_im_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); } } template __global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col, const scalar_t *data_im, const scalar_t *data_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int offset_channels, const int deformable_group, const int height_col, const int width_col, scalar_t *grad_offset) { CUDA_KERNEL_LOOP(index, n) { scalar_t val = 0; int w = index % width_col; int h = (index / width_col) % height_col; int c = (index / width_col / height_col) % offset_channels; int b = (index / width_col / height_col) / offset_channels; // compute the start and end of the output const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; const int bp_dir = offset_c % 2; int j = (col_pos / width_col / height_col / batch_size) % kernel_w; int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; int w_out = col_pos % width_col; int h_out = (col_pos / width_col) % height_col; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; scalar_t inv_h = h_in + i * dilation_h + offset_h; scalar_t inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } const scalar_t weight = get_coordinate_weight( inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, width, bp_dir); val += weight * data_col_ptr[col_pos]; cnt += 1; } grad_offset[index] = val; } } void deformable_col2im_coord( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_offset) { int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs; int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.type(), "deformable_col2im_coord_gpu", ([&] { const scalar_t *data_col_ = data_col.data(); const scalar_t *data_im_ = data_im.data(); const scalar_t *data_offset_ = data_offset.data(); scalar_t *grad_offset_ = grad_offset.data(); deformable_col2im_coord_gpu_kernel<<>>( num_kernels, data_col_, data_im_, data_offset_, channels, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group, height_col, width_col, grad_offset_); })); } template __device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width, const int height, const int width, scalar_t h, scalar_t w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; scalar_t lh = h - h_low; scalar_t lw = w - w_low; scalar_t hh = 1 - lh, hw = 1 - lw; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = bottom_data[h_low * data_width + w_high]; scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = bottom_data[h_high * data_width + w_low]; scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) v4 = bottom_data[h_high * data_width + w_high]; scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, const int h, const int w, const int height, const int width) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (h == argmax_h_low && w == argmax_w_low) weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); if (h == argmax_h_low && w == argmax_w_high) weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); if (h == argmax_h_high && w == argmax_w_low) weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); if (h == argmax_h_high && w == argmax_w_high) weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); return weight; } template __device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, const int height, const int width, const scalar_t *im_data, const int data_width, const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (bp_dir == 0) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } else if (bp_dir == 1) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } return weight; } template __global__ void modulated_deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int num_channels, const int deformable_group, const int height_col, const int width_col, scalar_t *data_col) { CUDA_KERNEL_LOOP(index, n) { // index index of output matrix const int w_col = index % width_col; const int h_col = (index / width_col) % height_col; const int b_col = (index / width_col / height_col) % batch_size; const int c_im = (index / width_col / height_col) / batch_size; const int c_col = c_im * kernel_h * kernel_w; // compute deformable group index const int deformable_group_index = c_im / channel_per_deformable_group; const int h_in = h_col * stride_h - pad_h; const int w_in = w_col * stride_w - pad_w; scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; scalar_t val = static_cast(0); const scalar_t h_im = h_in + i * dilation_h + offset_h; const scalar_t w_im = w_in + j * dilation_w + offset_w; //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { //const float map_h = i * dilation_h + offset_h; //const float map_w = j * dilation_w + offset_w; //const int cur_height = height - h_in; //const int cur_width = width - w_in; //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val * mask; data_col_ptr += batch_size * height_col * width_col; //data_col_ptr += height_col * width_col; } } } } template __global__ void modulated_deformable_col2im_gpu_kernel(const int n, const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int deformable_group, const int height_col, const int width_col, scalar_t *grad_im) { CUDA_KERNEL_LOOP(index, n) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; const scalar_t cur_top_grad = data_col[index] * mask; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } } } } } template __global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int offset_channels, const int deformable_group, const int height_col, const int width_col, scalar_t *grad_offset, scalar_t *grad_mask) { CUDA_KERNEL_LOOP(index, n) { scalar_t val = 0, mval = 0; int w = index % width_col; int h = (index / width_col) % height_col; int c = (index / width_col / height_col) % offset_channels; int b = (index / width_col / height_col) / offset_channels; // compute the start and end of the output const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; const int bp_dir = offset_c % 2; int j = (col_pos / width_col / height_col / batch_size) % kernel_w; int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; int w_out = col_pos % width_col; int h_out = (col_pos / width_col) % height_col; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; scalar_t inv_h = h_in + i * dilation_h + offset_h; scalar_t inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); } const scalar_t weight = dmcn_get_coordinate_weight( inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, width, bp_dir); val += weight * data_col_ptr[col_pos] * mask; cnt += 1; } // KERNEL_ASSIGN(grad_offset[index], offset_req, val); grad_offset[index] = val; if (offset_c % 2 == 0) // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; } } void modulated_deformable_im2col_cuda( const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor data_col) { // num_axes should be smaller than block size const int channel_per_deformable_group = channels / deformable_group; const int num_kernels = channels * batch_size * height_col * width_col; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_im.type(), "modulated_deformable_im2col_gpu", ([&] { const scalar_t *data_im_ = data_im.data(); const scalar_t *data_offset_ = data_offset.data(); const scalar_t *data_mask_ = data_mask.data(); scalar_t *data_col_ = data_col.data(); modulated_deformable_im2col_gpu_kernel<<>>( num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, channels, deformable_group, height_col, width_col, data_col_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); } } void modulated_deformable_col2im_cuda( const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_im) { const int channel_per_deformable_group = channels / deformable_group; const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.type(), "modulated_deformable_col2im_gpu", ([&] { const scalar_t *data_col_ = data_col.data(); const scalar_t *data_offset_ = data_offset.data(); const scalar_t *data_mask_ = data_mask.data(); scalar_t *grad_im_ = grad_im.data(); modulated_deformable_col2im_gpu_kernel<<>>( num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, deformable_group, height_col, width_col, grad_im_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); } } void modulated_deformable_col2im_coord_cuda( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_offset, at::Tensor grad_mask) { const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] { const scalar_t *data_col_ = data_col.data(); const scalar_t *data_im_ = data_im.data(); const scalar_t *data_offset_ = data_offset.data(); const scalar_t *data_mask_ = data_mask.data(); scalar_t *grad_offset_ = grad_offset.data(); scalar_t *grad_mask_ = grad_mask.data(); modulated_deformable_col2im_coord_gpu_kernel<<>>( num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, grad_offset_, grad_mask_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); } } ================================================ FILE: mmdetection/mmdet/ops/dcn/src/deform_pool_cuda.cpp ================================================ // modify from // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c // based on // author: Charles Shang // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu #include #include #include void DeformablePSROIPoolForward( const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, at::Tensor out, at::Tensor top_count, const int batch, const int channels, const int height, const int width, const int num_bbox, const int channels_trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std); void DeformablePSROIPoolBackwardAcc( const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, at::Tensor trans_grad, const int batch, const int channels, const int height, const int width, const int num_bbox, const int channels_trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std); void deform_psroi_pooling_cuda_forward( at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, at::Tensor top_count, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_trans = no_trans ? 2 : trans.size(1); const int num_bbox = bbox.size(0); if (num_bbox != out.size(0)) AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", out.size(0), num_bbox); DeformablePSROIPoolForward( input, bbox, trans, out, top_count, batch, channels, height, width, num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std); } void deform_psroi_pooling_cuda_backward( at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_trans = no_trans ? 2 : trans.size(1); const int num_bbox = bbox.size(0); if (num_bbox != out_grad.size(0)) AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", out_grad.size(0), num_bbox); DeformablePSROIPoolBackwardAcc( out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, channels, height, width, num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward, "deform psroi pooling forward(CUDA)"); m.def("deform_psroi_pooling_cuda_backward", &deform_psroi_pooling_cuda_backward, "deform psroi pooling backward(CUDA)"); } ================================================ FILE: mmdetection/mmdet/ops/dcn/src/deform_pool_cuda_kernel.cu ================================================ /*! * Copyright (c) 2017 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file deformable_psroi_pooling.cu * \brief * \author Yi Li, Guodong Zhang, Jifeng Dai */ /***************** Adapted by Charles Shang *********************/ // modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu #include #include #include #include #include using namespace at; #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } template __device__ scalar_t bilinear_interp( const scalar_t *data, const scalar_t x, const scalar_t y, const int width, const int height) { int x1 = floor(x); int x2 = ceil(x); int y1 = floor(y); int y2 = ceil(y); scalar_t dist_x = (scalar_t)(x - x1); scalar_t dist_y = (scalar_t)(y - y1); scalar_t value11 = data[y1 * width + x1]; scalar_t value12 = data[y2 * width + x1]; scalar_t value21 = data[y1 * width + x2]; scalar_t value22 = data[y2 * width + x2]; scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22; return value; } template __global__ void DeformablePSROIPoolForwardKernel( const int count, const scalar_t *bottom_data, const scalar_t spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const scalar_t *bottom_rois, const scalar_t *bottom_trans, const int no_trans, const scalar_t trans_std, const int sample_per_part, const int output_dim, const int group_size, const int part_size, const int num_classes, const int channels_each_class, scalar_t *top_data, scalar_t *top_count) { CUDA_KERNEL_LOOP(index, count) { // The output is in order (n, ctop, ph, pw) int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int ctop = (index / pooled_width / pooled_height) % output_dim; int n = index / pooled_width / pooled_height / output_dim; // [start, end) interval for spatial sampling const scalar_t *offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; // Force too small ROIs to be 1x1 scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); int part_h = floor((scalar_t)(ph) / pooled_height * part_size); int part_w = floor((scalar_t)(pw) / pooled_width * part_size); int class_id = ctop / channels_each_class; scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; wstart += trans_x * roi_width; scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; hstart += trans_y * roi_height; scalar_t sum = 0; int count = 0; int gw = floor((scalar_t)(pw)*group_size / pooled_width); int gh = floor((scalar_t)(ph)*group_size / pooled_height); gw = min(max(gw, 0), group_size - 1); gh = min(max(gh, 0), group_size - 1); const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; for (int ih = 0; ih < sample_per_part; ih++) { for (int iw = 0; iw < sample_per_part; iw++) { scalar_t w = wstart + iw * sub_bin_size_w; scalar_t h = hstart + ih * sub_bin_size_h; // bilinear interpolation if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { continue; } w = min(max(w, 0.), width - 1.); h = min(max(h, 0.), height - 1.); int c = (ctop * group_size + gh) * group_size + gw; scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); sum += val; count++; } } top_data[index] = count == 0 ? (scalar_t)(0) : sum / count; top_count[index] = count; } } template __global__ void DeformablePSROIPoolBackwardAccKernel( const int count, const scalar_t *top_diff, const scalar_t *top_count, const int num_rois, const scalar_t spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int output_dim, scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff, const scalar_t *bottom_data, const scalar_t *bottom_rois, const scalar_t *bottom_trans, const int no_trans, const scalar_t trans_std, const int sample_per_part, const int group_size, const int part_size, const int num_classes, const int channels_each_class) { CUDA_KERNEL_LOOP(index, count) { // The output is in order (n, ctop, ph, pw) int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int ctop = (index / pooled_width / pooled_height) % output_dim; int n = index / pooled_width / pooled_height / output_dim; // [start, end) interval for spatial sampling const scalar_t *offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; // Force too small ROIs to be 1x1 scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); int part_h = floor((scalar_t)(ph) / pooled_height * part_size); int part_w = floor((scalar_t)(pw) / pooled_width * part_size); int class_id = ctop / channels_each_class; scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; wstart += trans_x * roi_width; scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; hstart += trans_y * roi_height; if (top_count[index] <= 0) { continue; } scalar_t diff_val = top_diff[index] / top_count[index]; const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; int gw = floor((scalar_t)(pw)*group_size / pooled_width); int gh = floor((scalar_t)(ph)*group_size / pooled_height); gw = min(max(gw, 0), group_size - 1); gh = min(max(gh, 0), group_size - 1); for (int ih = 0; ih < sample_per_part; ih++) { for (int iw = 0; iw < sample_per_part; iw++) { scalar_t w = wstart + iw * sub_bin_size_w; scalar_t h = hstart + ih * sub_bin_size_h; // bilinear interpolation if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { continue; } w = min(max(w, 0.), width - 1.); h = min(max(h, 0.), height - 1.); int c = (ctop * group_size + gh) * group_size + gw; // backward on feature int x0 = floor(w); int x1 = ceil(w); int y0 = floor(h); int y1 = ceil(h); scalar_t dist_x = w - x0, dist_y = h - y0; scalar_t q00 = (1 - dist_x) * (1 - dist_y); scalar_t q01 = (1 - dist_x) * dist_y; scalar_t q10 = dist_x * (1 - dist_y); scalar_t q11 = dist_x * dist_y; int bottom_index_base = c * height * width; atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); if (no_trans) { continue; } scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; diff_x *= roi_width; scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; diff_y *= roi_height; atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); } } } } void DeformablePSROIPoolForward(const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, at::Tensor out, at::Tensor top_count, const int batch, const int channels, const int height, const int width, const int num_bbox, const int channels_trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { const int pooled_height = pooled_size; const int pooled_width = pooled_size; const int count = num_bbox * output_dim * pooled_height * pooled_width; const int num_classes = no_trans ? 1 : channels_trans / 2; const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; AT_DISPATCH_FLOATING_TYPES_AND_HALF( data.type(), "deformable_psroi_pool_forward", ([&] { const scalar_t *bottom_data = data.data(); const scalar_t *bottom_rois = bbox.data(); const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); scalar_t *top_data = out.data(); scalar_t *top_count_data = top_count.data(); DeformablePSROIPoolForwardKernel<<>>( count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim, group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); } } void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, at::Tensor trans_grad, const int batch, const int channels, const int height, const int width, const int num_bbox, const int channels_trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { // LOG(INFO) << "DeformablePSROIPoolBackward"; const int num_rois = num_bbox; const int pooled_height = pooled_size; const int pooled_width = pooled_size; const int count = num_bbox * output_dim * pooled_height * pooled_width; const int num_classes = no_trans ? 1 : channels_trans / 2; const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; AT_DISPATCH_FLOATING_TYPES_AND_HALF( out_grad.type(), "deformable_psroi_pool_backward_acc", ([&] { const scalar_t *top_diff = out_grad.data(); const scalar_t *bottom_data = data.data(); const scalar_t *bottom_rois = bbox.data(); const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); scalar_t *bottom_data_diff = in_grad.data(); scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data(); const scalar_t *top_count_data = top_count.data(); DeformablePSROIPoolBackwardAccKernel<<>>( count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, group_size, part_size, num_classes, channels_each_class); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); } } ================================================ FILE: mmdetection/mmdet/ops/nms/__init__.py ================================================ from .nms_wrapper import nms, soft_nms __all__ = ['nms', 'soft_nms'] ================================================ FILE: mmdetection/mmdet/ops/nms/nms_wrapper.py ================================================ import numpy as np import torch from . import nms_cuda, nms_cpu from .soft_nms_cpu import soft_nms_cpu def nms(dets, iou_thr, device_id=None): """Dispatch to either CPU or GPU NMS implementations. The input can be either a torch tensor or numpy array. GPU NMS will be used if the input is a gpu tensor or device_id is specified, otherwise CPU NMS will be used. The returned type will always be the same as inputs. Arguments: dets (torch.Tensor or np.ndarray): bboxes with scores. iou_thr (float): IoU threshold for NMS. device_id (int, optional): when `dets` is a numpy array, if `device_id` is None, then cpu nms is used, otherwise gpu_nms will be used. Returns: tuple: kept bboxes and indice, which is always the same data type as the input. """ # convert dets (tensor or numpy array) to tensor if isinstance(dets, torch.Tensor): is_numpy = False dets_th = dets elif isinstance(dets, np.ndarray): is_numpy = True device = 'cpu' if device_id is None else 'cuda:{}'.format(device_id) dets_th = torch.from_numpy(dets).to(device) else: raise TypeError( 'dets must be either a Tensor or numpy array, but got {}'.format( type(dets))) # execute cpu or cuda nms if dets_th.shape[0] == 0: inds = dets_th.new_zeros(0, dtype=torch.long) else: if dets_th.is_cuda: inds = nms_cuda.nms(dets_th, iou_thr) else: inds = nms_cpu.nms(dets_th, iou_thr) if is_numpy: inds = inds.cpu().numpy() return dets[inds, :], inds def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3): if isinstance(dets, torch.Tensor): is_tensor = True dets_np = dets.detach().cpu().numpy() elif isinstance(dets, np.ndarray): is_tensor = False dets_np = dets else: raise TypeError( 'dets must be either a Tensor or numpy array, but got {}'.format( type(dets))) method_codes = {'linear': 1, 'gaussian': 2} if method not in method_codes: raise ValueError('Invalid method for SoftNMS: {}'.format(method)) new_dets, inds = soft_nms_cpu( dets_np, iou_thr, method=method_codes[method], sigma=sigma, min_score=min_score) if is_tensor: return dets.new_tensor(new_dets), dets.new_tensor( inds, dtype=torch.long) else: return new_dets.astype(np.float32), inds.astype(np.int64) ================================================ FILE: mmdetection/mmdet/ops/nms/setup.py ================================================ import os.path as osp from setuptools import setup, Extension import numpy as np from Cython.Build import cythonize from Cython.Distutils import build_ext from torch.utils.cpp_extension import BuildExtension, CUDAExtension ext_args = dict( include_dirs=[np.get_include()], language='c++', extra_compile_args={ 'cc': ['-Wno-unused-function', '-Wno-write-strings'], 'nvcc': ['-c', '--compiler-options', '-fPIC'], }, ) extensions = [ Extension('soft_nms_cpu', ['src/soft_nms_cpu.pyx'], **ext_args), ] def customize_compiler_for_nvcc(self): """inject deep into distutils to customize how the dispatch to cc/nvcc works. If you subclass UnixCCompiler, it's not trivial to get your subclass injected in, and still have the right customizations (i.e. distutils.sysconfig.customize_compiler) run on it. So instead of going the OO route, I have this. Note, it's kindof like a wierd functional subclassing going on.""" # tell the compiler it can processes .cu self.src_extensions.append('.cu') # save references to the default compiler_so and _comple methods default_compiler_so = self.compiler_so super = self._compile # now redefine the _compile method. This gets executed for each # object but distutils doesn't have the ability to change compilers # based on source extension: we add it. def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): if osp.splitext(src)[1] == '.cu': # use the cuda for .cu files self.set_executable('compiler_so', 'nvcc') # use only a subset of the extra_postargs, which are 1-1 translated # from the extra_compile_args in the Extension class postargs = extra_postargs['nvcc'] else: postargs = extra_postargs['cc'] super(obj, src, ext, cc_args, postargs, pp_opts) # reset the default compiler_so, which we might have changed for cuda self.compiler_so = default_compiler_so # inject our redefined _compile method into the class self._compile = _compile class custom_build_ext(build_ext): def build_extensions(self): customize_compiler_for_nvcc(self.compiler) build_ext.build_extensions(self) setup( name='soft_nms', cmdclass={'build_ext': custom_build_ext}, ext_modules=cythonize(extensions), ) setup( name='nms_cuda', ext_modules=[ CUDAExtension('nms_cuda', [ 'src/nms_cuda.cpp', 'src/nms_kernel.cu', ]), CUDAExtension('nms_cpu', [ 'src/nms_cpu.cpp', ]), ], cmdclass={'build_ext': BuildExtension}) ================================================ FILE: mmdetection/mmdet/ops/nms/src/nms_cpu.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include template at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) { AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); if (dets.numel() == 0) { return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); } auto x1_t = dets.select(1, 0).contiguous(); auto y1_t = dets.select(1, 1).contiguous(); auto x2_t = dets.select(1, 2).contiguous(); auto y2_t = dets.select(1, 3).contiguous(); auto scores = dets.select(1, 4).contiguous(); at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto ndets = dets.size(0); at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); auto suppressed = suppressed_t.data(); auto order = order_t.data(); auto x1 = x1_t.data(); auto y1 = y1_t.data(); auto x2 = x2_t.data(); auto y2 = y2_t.data(); auto areas = areas_t.data(); for (int64_t _i = 0; _i < ndets; _i++) { auto i = order[_i]; if (suppressed[i] == 1) continue; auto ix1 = x1[i]; auto iy1 = y1[i]; auto ix2 = x2[i]; auto iy2 = y2[i]; auto iarea = areas[i]; for (int64_t _j = _i + 1; _j < ndets; _j++) { auto j = order[_j]; if (suppressed[j] == 1) continue; auto xx1 = std::max(ix1, x1[j]); auto yy1 = std::max(iy1, y1[j]); auto xx2 = std::min(ix2, x2[j]); auto yy2 = std::min(iy2, y2[j]); auto w = std::max(static_cast(0), xx2 - xx1 + 1); auto h = std::max(static_cast(0), yy2 - yy1 + 1); auto inter = w * h; auto ovr = inter / (iarea + areas[j] - inter); if (ovr >= threshold) suppressed[j] = 1; } } return at::nonzero(suppressed_t == 0).squeeze(1); } at::Tensor nms(const at::Tensor& dets, const float threshold) { at::Tensor result; AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { result = nms_cpu_kernel(dets, threshold); }); return result; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("nms", &nms, "non-maximum suppression"); } ================================================ FILE: mmdetection/mmdet/ops/nms/src/nms_cuda.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); at::Tensor nms(const at::Tensor& dets, const float threshold) { CHECK_CUDA(dets); if (dets.numel() == 0) return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); return nms_cuda(dets, threshold); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("nms", &nms, "non-maximum suppression"); } ================================================ FILE: mmdetection/mmdet/ops/nms/src/nms_kernel.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include #include #include #include #include #include int const threadsPerBlock = sizeof(unsigned long long) * 8; __device__ inline float devIoU(float const * const a, float const * const b) { float left = max(a[0], b[0]), right = min(a[2], b[2]); float top = max(a[1], b[1]), bottom = min(a[3], b[3]); float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); float interS = width * height; float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); return interS / (Sa + Sb - interS); } __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, const float *dev_boxes, unsigned long long *dev_mask) { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); __shared__ float block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const float *cur_box = dev_boxes + cur_box_idx * 5; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { t |= 1ULL << i; } } const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } // boxes is a N x 5 tensor at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { using scalar_t = float; AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); auto scores = boxes.select(1, 4); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto boxes_sorted = boxes.index_select(0, order_t); int boxes_num = boxes.size(0); const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); scalar_t* boxes_dev = boxes_sorted.data(); THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState unsigned long long* mask_dev = NULL; //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, // boxes_num * col_blocks * sizeof(unsigned long long))); mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), THCCeilDiv(boxes_num, threadsPerBlock)); dim3 threads(threadsPerBlock); nms_kernel<<>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev); std::vector mask_host(boxes_num * col_blocks); THCudaCheck(cudaMemcpy(&mask_host[0], mask_dev, sizeof(unsigned long long) * boxes_num * col_blocks, cudaMemcpyDeviceToHost)); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); int64_t* keep_out = keep.data(); int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { int nblock = i / threadsPerBlock; int inblock = i % threadsPerBlock; if (!(remv[nblock] & (1ULL << inblock))) { keep_out[num_to_keep++] = i; unsigned long long *p = &mask_host[0] + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv[j] |= p[j]; } } } THCudaFree(state, mask_dev); // TODO improve this part return std::get<0>(order_t.index({ keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( order_t.device(), keep.scalar_type()) }).sort(0, false)); } ================================================ FILE: mmdetection/mmdet/ops/nms/src/soft_nms_cpu.pyx ================================================ # ---------------------------------------------------------- # Soft-NMS: Improving Object Detection With One Line of Code # Copyright (c) University of Maryland, College Park # Licensed under The MIT License [see LICENSE for details] # Written by Navaneeth Bodla and Bharat Singh # Modified by Kai Chen # ---------------------------------------------------------- # cython: language_level=3, boundscheck=False import numpy as np cimport numpy as np cdef inline np.float32_t max(np.float32_t a, np.float32_t b): return a if a >= b else b cdef inline np.float32_t min(np.float32_t a, np.float32_t b): return a if a <= b else b def soft_nms_cpu( np.ndarray[float, ndim=2] boxes_in, float iou_thr, unsigned int method=1, float sigma=0.5, float min_score=0.001, ): boxes = boxes_in.copy() cdef unsigned int N = boxes.shape[0] cdef float iw, ih, box_area cdef float ua cdef int pos = 0 cdef float maxscore = 0 cdef int maxpos = 0 cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov inds = np.arange(N) for i in range(N): maxscore = boxes[i, 4] maxpos = i tx1 = boxes[i, 0] ty1 = boxes[i, 1] tx2 = boxes[i, 2] ty2 = boxes[i, 3] ts = boxes[i, 4] ti = inds[i] pos = i + 1 # get max box while pos < N: if maxscore < boxes[pos, 4]: maxscore = boxes[pos, 4] maxpos = pos pos = pos + 1 # add max box as a detection boxes[i, 0] = boxes[maxpos, 0] boxes[i, 1] = boxes[maxpos, 1] boxes[i, 2] = boxes[maxpos, 2] boxes[i, 3] = boxes[maxpos, 3] boxes[i, 4] = boxes[maxpos, 4] inds[i] = inds[maxpos] # swap ith box with position of max box boxes[maxpos, 0] = tx1 boxes[maxpos, 1] = ty1 boxes[maxpos, 2] = tx2 boxes[maxpos, 3] = ty2 boxes[maxpos, 4] = ts inds[maxpos] = ti tx1 = boxes[i, 0] ty1 = boxes[i, 1] tx2 = boxes[i, 2] ty2 = boxes[i, 3] ts = boxes[i, 4] pos = i + 1 # NMS iterations, note that N changes if detection boxes fall below # threshold while pos < N: x1 = boxes[pos, 0] y1 = boxes[pos, 1] x2 = boxes[pos, 2] y2 = boxes[pos, 3] s = boxes[pos, 4] area = (x2 - x1 + 1) * (y2 - y1 + 1) iw = (min(tx2, x2) - max(tx1, x1) + 1) if iw > 0: ih = (min(ty2, y2) - max(ty1, y1) + 1) if ih > 0: ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) ov = iw * ih / ua # iou between max box and detection box if method == 1: # linear if ov > iou_thr: weight = 1 - ov else: weight = 1 elif method == 2: # gaussian weight = np.exp(-(ov * ov) / sigma) else: # original NMS if ov > iou_thr: weight = 0 else: weight = 1 boxes[pos, 4] = weight * boxes[pos, 4] # if box score falls below threshold, discard the box by # swapping with last box update N if boxes[pos, 4] < min_score: boxes[pos, 0] = boxes[N-1, 0] boxes[pos, 1] = boxes[N-1, 1] boxes[pos, 2] = boxes[N-1, 2] boxes[pos, 3] = boxes[N-1, 3] boxes[pos, 4] = boxes[N-1, 4] inds[pos] = inds[N - 1] N = N - 1 pos = pos - 1 pos = pos + 1 return boxes[:N], inds[:N] ================================================ FILE: mmdetection/mmdet/ops/roi_align/__init__.py ================================================ from .functions.roi_align import roi_align from .modules.roi_align import RoIAlign __all__ = ['roi_align', 'RoIAlign'] ================================================ FILE: mmdetection/mmdet/ops/roi_align/functions/__init__.py ================================================ ================================================ FILE: mmdetection/mmdet/ops/roi_align/functions/roi_align.py ================================================ from torch.autograd import Function from .. import roi_align_cuda class RoIAlignFunction(Function): @staticmethod def forward(ctx, features, rois, out_size, spatial_scale, sample_num=0): if isinstance(out_size, int): out_h = out_size out_w = out_size elif isinstance(out_size, tuple): assert len(out_size) == 2 assert isinstance(out_size[0], int) assert isinstance(out_size[1], int) out_h, out_w = out_size else: raise TypeError( '"out_size" must be an integer or tuple of integers') ctx.spatial_scale = spatial_scale ctx.sample_num = sample_num ctx.save_for_backward(rois) ctx.feature_size = features.size() batch_size, num_channels, data_height, data_width = features.size() num_rois = rois.size(0) output = features.new_zeros(num_rois, num_channels, out_h, out_w) if features.is_cuda: roi_align_cuda.forward(features, rois, out_h, out_w, spatial_scale, sample_num, output) else: raise NotImplementedError return output @staticmethod def backward(ctx, grad_output): feature_size = ctx.feature_size spatial_scale = ctx.spatial_scale sample_num = ctx.sample_num rois = ctx.saved_tensors[0] assert (feature_size is not None and grad_output.is_cuda) batch_size, num_channels, data_height, data_width = feature_size out_w = grad_output.size(3) out_h = grad_output.size(2) grad_input = grad_rois = None if ctx.needs_input_grad[0]: grad_input = rois.new_zeros(batch_size, num_channels, data_height, data_width) roi_align_cuda.backward(grad_output.contiguous(), rois, out_h, out_w, spatial_scale, sample_num, grad_input) return grad_input, grad_rois, None, None, None roi_align = RoIAlignFunction.apply ================================================ FILE: mmdetection/mmdet/ops/roi_align/gradcheck.py ================================================ import numpy as np import torch from torch.autograd import gradcheck import os.path as osp import sys sys.path.append(osp.abspath(osp.join(__file__, '../../'))) from roi_align import RoIAlign # noqa: E402 feat_size = 15 spatial_scale = 1.0 / 8 img_size = feat_size / spatial_scale num_imgs = 2 num_rois = 20 batch_ind = np.random.randint(num_imgs, size=(num_rois, 1)) rois = np.random.rand(num_rois, 4) * img_size * 0.5 rois[:, 2:] += img_size * 0.5 rois = np.hstack((batch_ind, rois)) feat = torch.randn( num_imgs, 16, feat_size, feat_size, requires_grad=True, device='cuda:0') rois = torch.from_numpy(rois).float().cuda() inputs = (feat, rois) print('Gradcheck for roi align...') test = gradcheck(RoIAlign(3, spatial_scale), inputs, atol=1e-3, eps=1e-3) print(test) test = gradcheck(RoIAlign(3, spatial_scale, 2), inputs, atol=1e-3, eps=1e-3) print(test) ================================================ FILE: mmdetection/mmdet/ops/roi_align/modules/__init__.py ================================================ ================================================ FILE: mmdetection/mmdet/ops/roi_align/modules/roi_align.py ================================================ from torch.nn.modules.module import Module from ..functions.roi_align import RoIAlignFunction class RoIAlign(Module): def __init__(self, out_size, spatial_scale, sample_num=0): super(RoIAlign, self).__init__() self.out_size = out_size self.spatial_scale = float(spatial_scale) self.sample_num = int(sample_num) def forward(self, features, rois): return RoIAlignFunction.apply(features, rois, self.out_size, self.spatial_scale, self.sample_num) ================================================ FILE: mmdetection/mmdet/ops/roi_align/setup.py ================================================ from setuptools import setup from torch.utils.cpp_extension import BuildExtension, CUDAExtension setup( name='roi_align_cuda', ext_modules=[ CUDAExtension('roi_align_cuda', [ 'src/roi_align_cuda.cpp', 'src/roi_align_kernel.cu', ]), ], cmdclass={'build_ext': BuildExtension}) ================================================ FILE: mmdetection/mmdet/ops/roi_align/src/roi_align_cuda.cpp ================================================ #include #include #include int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois, const float spatial_scale, const int sample_num, const int channels, const int height, const int width, const int num_rois, const int pooled_height, const int pooled_width, at::Tensor output); int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale, const int sample_num, const int channels, const int height, const int width, const int num_rois, const int pooled_height, const int pooled_width, at::Tensor bottom_grad); #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) \ AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) int roi_align_forward_cuda(at::Tensor features, at::Tensor rois, int pooled_height, int pooled_width, float spatial_scale, int sample_num, at::Tensor output) { CHECK_INPUT(features); CHECK_INPUT(rois); CHECK_INPUT(output); // Number of ROIs int num_rois = rois.size(0); int size_rois = rois.size(1); if (size_rois != 5) { printf("wrong roi size\n"); return 0; } int num_channels = features.size(1); int data_height = features.size(2); int data_width = features.size(3); ROIAlignForwardLaucher(features, rois, spatial_scale, sample_num, num_channels, data_height, data_width, num_rois, pooled_height, pooled_width, output); return 1; } int roi_align_backward_cuda(at::Tensor top_grad, at::Tensor rois, int pooled_height, int pooled_width, float spatial_scale, int sample_num, at::Tensor bottom_grad) { CHECK_INPUT(top_grad); CHECK_INPUT(rois); CHECK_INPUT(bottom_grad); // Number of ROIs int num_rois = rois.size(0); int size_rois = rois.size(1); if (size_rois != 5) { printf("wrong roi size\n"); return 0; } int num_channels = bottom_grad.size(1); int data_height = bottom_grad.size(2); int data_width = bottom_grad.size(3); ROIAlignBackwardLaucher(top_grad, rois, spatial_scale, sample_num, num_channels, data_height, data_width, num_rois, pooled_height, pooled_width, bottom_grad); return 1; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &roi_align_forward_cuda, "Roi_Align forward (CUDA)"); m.def("backward", &roi_align_backward_cuda, "Roi_Align backward (CUDA)"); } ================================================ FILE: mmdetection/mmdet/ops/roi_align/src/roi_align_kernel.cu ================================================ #include #include #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) #define THREADS_PER_BLOCK 1024 inline int GET_BLOCKS(const int N) { int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; int max_block_num = 65000; return min(optimal_block_num, max_block_num); } template __device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data, const int height, const int width, scalar_t y, scalar_t x) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { return 0; } if (y <= 0) y = 0; if (x <= 0) x = 0; int y_low = (int)y; int x_low = (int)x; int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (scalar_t)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (scalar_t)x_low; } else { x_high = x_low + 1; } scalar_t ly = y - y_low; scalar_t lx = x - x_low; scalar_t hy = 1. - ly; scalar_t hx = 1. - lx; // do bilinear interpolation scalar_t lt = bottom_data[y_low * width + x_low]; scalar_t rt = bottom_data[y_low * width + x_high]; scalar_t lb = bottom_data[y_high * width + x_low]; scalar_t rb = bottom_data[y_high * width + x_high]; scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb); return val; } template __global__ void ROIAlignForward(const int nthreads, const scalar_t *bottom_data, const scalar_t *bottom_rois, const scalar_t spatial_scale, const int sample_num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the aligned output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const scalar_t *offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale; scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale; scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale; scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale; // Force malformed ROIs to be 1x1 scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.); scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.); scalar_t bin_size_h = roi_height / pooled_height; scalar_t bin_size_w = roi_width / pooled_width; const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; int sample_num_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height); // e.g., = 2 int sample_num_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); scalar_t h = (scalar_t)(ph + 0.5) * bin_size_h + roi_start_h; scalar_t w = (scalar_t)(pw + 0.5) * bin_size_w + roi_start_w; int hstart = fminf(floor(h), height - 2); int wstart = fminf(floor(w), width - 2); scalar_t output_val = 0; for (int iy = 0; iy < sample_num_h; iy++) { const scalar_t y = roi_start_h + ph * bin_size_h + (scalar_t)(iy + scalar_t(.5f)) * bin_size_h / (scalar_t)(sample_num_h); for (int ix = 0; ix < sample_num_w; ix++) { const scalar_t x = roi_start_w + pw * bin_size_w + (scalar_t)(ix + scalar_t(.5f)) * bin_size_w / (scalar_t)(sample_num_w); scalar_t val = bilinear_interpolate(offset_bottom_data, height, width, y, x); output_val += val; } } output_val /= (sample_num_h * sample_num_w); top_data[index] = output_val; } } int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois, const float spatial_scale, const int sample_num, const int channels, const int height, const int width, const int num_rois, const int pooled_height, const int pooled_width, at::Tensor output) { const int output_size = num_rois * pooled_height * pooled_width * channels; AT_DISPATCH_FLOATING_TYPES_AND_HALF( features.type(), "ROIAlignLaucherForward", ([&] { const scalar_t *bottom_data = features.data(); const scalar_t *rois_data = rois.data(); scalar_t *top_data = output.data(); ROIAlignForward <<>>( output_size, bottom_data, rois_data, scalar_t(spatial_scale), sample_num, channels, height, width, pooled_height, pooled_width, top_data); })); THCudaCheck(cudaGetLastError()); return 1; } template __device__ void bilinear_interpolate_gradient(const int height, const int width, scalar_t y, scalar_t x, scalar_t &w1, scalar_t &w2, scalar_t &w3, scalar_t &w4, int &x_low, int &x_high, int &y_low, int &y_high) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { w1 = w2 = w3 = w4 = 0.; x_low = x_high = y_low = y_high = -1; return; } if (y <= 0) y = 0; if (x <= 0) x = 0; y_low = (int)y; x_low = (int)x; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (scalar_t)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (scalar_t)x_low; } else { x_high = x_low + 1; } scalar_t ly = y - y_low; scalar_t lx = x - x_low; scalar_t hy = 1. - ly; scalar_t hx = 1. - lx; w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; return; } template __global__ void ROIAlignBackward( const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, const scalar_t spatial_scale, const int sample_num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *bottom_diff) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the aligned output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const scalar_t *offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale; scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale; scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale; scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale; // Force malformed ROIs to be 1x1 scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.); scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.); scalar_t bin_size_h = roi_height / pooled_height; scalar_t bin_size_w = roi_width / pooled_width; scalar_t *offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; int offset_top = (n * channels + c) * pooled_height * pooled_width + ph * pooled_width + pw; scalar_t offset_top_diff = top_diff[offset_top]; int sample_num_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height); // e.g., = 2 int sample_num_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); const scalar_t count = (scalar_t)(sample_num_h * sample_num_w); scalar_t h = (scalar_t)(ph + 0.5) * bin_size_h + roi_start_h; scalar_t w = (scalar_t)(pw + 0.5) * bin_size_w + roi_start_w; int hstart = fminf(floor(h), height - 2); int wstart = fminf(floor(w), width - 2); for (int iy = 0; iy < sample_num_h; iy++) { const scalar_t y = roi_start_h + ph * bin_size_h + (scalar_t)(iy + .5f) * bin_size_h / (scalar_t)(sample_num_h); for (int ix = 0; ix < sample_num_w; ix++) { const scalar_t x = roi_start_w + pw * bin_size_w + (scalar_t)(ix + .5f) * bin_size_w / (scalar_t)(sample_num_w); scalar_t w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient( height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high); scalar_t g1 = offset_top_diff * w1 / count; scalar_t g2 = offset_top_diff * w2 / count; scalar_t g3 = offset_top_diff * w3 / count; scalar_t g4 = offset_top_diff * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); } } } } } int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale, const int sample_num, const int channels, const int height, const int width, const int num_rois, const int pooled_height, const int pooled_width, at::Tensor bottom_grad) { const int output_size = num_rois * pooled_height * pooled_width * channels; AT_DISPATCH_FLOATING_TYPES_AND_HALF( top_grad.type(), "ROIAlignLaucherBackward", ([&] { const scalar_t *top_diff = top_grad.data(); const scalar_t *rois_data = rois.data(); scalar_t *bottom_diff = bottom_grad.data(); if (sizeof(scalar_t) == sizeof(double)) { fprintf(stderr, "double is not supported\n"); exit(-1); } ROIAlignBackward <<>>( output_size, top_diff, rois_data, spatial_scale, sample_num, channels, height, width, pooled_height, pooled_width, bottom_diff); })); THCudaCheck(cudaGetLastError()); return 1; } ================================================ FILE: mmdetection/mmdet/ops/roi_pool/__init__.py ================================================ from .functions.roi_pool import roi_pool from .modules.roi_pool import RoIPool __all__ = ['roi_pool', 'RoIPool'] ================================================ FILE: mmdetection/mmdet/ops/roi_pool/functions/__init__.py ================================================ ================================================ FILE: mmdetection/mmdet/ops/roi_pool/functions/roi_pool.py ================================================ import torch from torch.autograd import Function from .. import roi_pool_cuda class RoIPoolFunction(Function): @staticmethod def forward(ctx, features, rois, out_size, spatial_scale): if isinstance(out_size, int): out_h = out_size out_w = out_size elif isinstance(out_size, tuple): assert len(out_size) == 2 assert isinstance(out_size[0], int) assert isinstance(out_size[1], int) out_h, out_w = out_size else: raise TypeError( '"out_size" must be an integer or tuple of integers') assert features.is_cuda ctx.save_for_backward(rois) num_channels = features.size(1) num_rois = rois.size(0) out_size = (num_rois, num_channels, out_h, out_w) output = features.new_zeros(out_size) argmax = features.new_zeros(out_size, dtype=torch.int) roi_pool_cuda.forward(features, rois, out_h, out_w, spatial_scale, output, argmax) ctx.spatial_scale = spatial_scale ctx.feature_size = features.size() ctx.argmax = argmax return output @staticmethod def backward(ctx, grad_output): assert grad_output.is_cuda spatial_scale = ctx.spatial_scale feature_size = ctx.feature_size argmax = ctx.argmax rois = ctx.saved_tensors[0] assert feature_size is not None grad_input = grad_rois = None if ctx.needs_input_grad[0]: grad_input = grad_output.new_zeros(feature_size) roi_pool_cuda.backward(grad_output.contiguous(), rois, argmax, spatial_scale, grad_input) return grad_input, grad_rois, None, None roi_pool = RoIPoolFunction.apply ================================================ FILE: mmdetection/mmdet/ops/roi_pool/gradcheck.py ================================================ import torch from torch.autograd import gradcheck import os.path as osp import sys sys.path.append(osp.abspath(osp.join(__file__, '../../'))) from roi_pool import RoIPool # noqa: E402 feat = torch.randn(4, 16, 15, 15, requires_grad=True).cuda() rois = torch.Tensor([[0, 0, 0, 50, 50], [0, 10, 30, 43, 55], [1, 67, 40, 110, 120]]).cuda() inputs = (feat, rois) print('Gradcheck for roi pooling...') test = gradcheck(RoIPool(4, 1.0 / 8), inputs, eps=1e-5, atol=1e-3) print(test) ================================================ FILE: mmdetection/mmdet/ops/roi_pool/modules/__init__.py ================================================ ================================================ FILE: mmdetection/mmdet/ops/roi_pool/modules/roi_pool.py ================================================ from torch.nn.modules.module import Module from ..functions.roi_pool import roi_pool class RoIPool(Module): def __init__(self, out_size, spatial_scale): super(RoIPool, self).__init__() self.out_size = out_size self.spatial_scale = float(spatial_scale) def forward(self, features, rois): return roi_pool(features, rois, self.out_size, self.spatial_scale) ================================================ FILE: mmdetection/mmdet/ops/roi_pool/setup.py ================================================ from setuptools import setup from torch.utils.cpp_extension import BuildExtension, CUDAExtension setup( name='roi_pool', ext_modules=[ CUDAExtension('roi_pool_cuda', [ 'src/roi_pool_cuda.cpp', 'src/roi_pool_kernel.cu', ]) ], cmdclass={'build_ext': BuildExtension}) ================================================ FILE: mmdetection/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp ================================================ #include #include #include int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois, const float spatial_scale, const int channels, const int height, const int width, const int num_rois, const int pooled_h, const int pooled_w, at::Tensor output, at::Tensor argmax); int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, const at::Tensor argmax, const float spatial_scale, const int batch_size, const int channels, const int height, const int width, const int num_rois, const int pooled_h, const int pooled_w, at::Tensor bottom_grad); #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) \ AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) int roi_pooling_forward_cuda(at::Tensor features, at::Tensor rois, int pooled_height, int pooled_width, float spatial_scale, at::Tensor output, at::Tensor argmax) { CHECK_INPUT(features); CHECK_INPUT(rois); CHECK_INPUT(output); CHECK_INPUT(argmax); // Number of ROIs int num_rois = rois.size(0); int size_rois = rois.size(1); if (size_rois != 5) { printf("wrong roi size\n"); return 0; } int channels = features.size(1); int height = features.size(2); int width = features.size(3); ROIPoolForwardLaucher(features, rois, spatial_scale, channels, height, width, num_rois, pooled_height, pooled_width, output, argmax); return 1; } int roi_pooling_backward_cuda(at::Tensor top_grad, at::Tensor rois, at::Tensor argmax, float spatial_scale, at::Tensor bottom_grad) { CHECK_INPUT(top_grad); CHECK_INPUT(rois); CHECK_INPUT(argmax); CHECK_INPUT(bottom_grad); int pooled_height = top_grad.size(2); int pooled_width = top_grad.size(3); int num_rois = rois.size(0); int size_rois = rois.size(1); if (size_rois != 5) { printf("wrong roi size\n"); return 0; } int batch_size = bottom_grad.size(0); int channels = bottom_grad.size(1); int height = bottom_grad.size(2); int width = bottom_grad.size(3); ROIPoolBackwardLaucher(top_grad, rois, argmax, spatial_scale, batch_size, channels, height, width, num_rois, pooled_height, pooled_width, bottom_grad); return 1; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &roi_pooling_forward_cuda, "Roi_Pooling forward (CUDA)"); m.def("backward", &roi_pooling_backward_cuda, "Roi_Pooling backward (CUDA)"); } ================================================ FILE: mmdetection/mmdet/ops/roi_pool/src/roi_pool_kernel.cu ================================================ #include #include #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) #define THREADS_PER_BLOCK 1024 inline int GET_BLOCKS(const int N) { int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; int max_block_num = 65000; return min(optimal_block_num, max_block_num); } template __global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data, const scalar_t *rois, const scalar_t spatial_scale, const int channels, const int height, const int width, const int pooled_h, const int pooled_w, scalar_t *top_data, int *argmax_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_w; int ph = (index / pooled_w) % pooled_h; int c = (index / pooled_w / pooled_h) % channels; int n = index / pooled_w / pooled_h / channels; const scalar_t *offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; // calculate the roi region on feature maps scalar_t roi_x1 = offset_rois[1] * spatial_scale; scalar_t roi_y1 = offset_rois[2] * spatial_scale; scalar_t roi_x2 = (offset_rois[3] + 1) * spatial_scale; scalar_t roi_y2 = (offset_rois[4] + 1) * spatial_scale; // force malformed rois to be 1x1 scalar_t roi_w = roi_x2 - roi_x1; scalar_t roi_h = roi_y2 - roi_y1; if (roi_w <= 0 || roi_h <= 0) continue; scalar_t bin_size_w = roi_w / static_cast(pooled_w); scalar_t bin_size_h = roi_h / static_cast(pooled_h); // the corresponding bin region int bin_x1 = floor(static_cast(pw) * bin_size_w + roi_x1); int bin_y1 = floor(static_cast(ph) * bin_size_h + roi_y1); int bin_x2 = ceil(static_cast(pw + 1) * bin_size_w + roi_x1); int bin_y2 = ceil(static_cast(ph + 1) * bin_size_h + roi_y1); // add roi offsets and clip to input boundaries bin_x1 = min(max(bin_x1, 0), width); bin_y1 = min(max(bin_y1, 0), height); bin_x2 = min(max(bin_x2, 0), width); bin_y2 = min(max(bin_y2, 0), height); bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1); // If nothing is pooled, argmax = -1 causes nothing to be backprop'd int max_idx = -1; bottom_data += (roi_batch_ind * channels + c) * height * width; // Define an empty pooling region to be zero scalar_t max_val = is_empty ? static_cast(0) : bottom_data[bin_y1 * width + bin_x1] - 1; for (int h = bin_y1; h < bin_y2; ++h) { for (int w = bin_x1; w < bin_x2; ++w) { int offset = h * width + w; if (bottom_data[offset] > max_val) { max_val = bottom_data[offset]; max_idx = offset; } } } top_data[index] = max_val; if (argmax_data != NULL) argmax_data[index] = max_idx; } } int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois, const float spatial_scale, const int channels, const int height, const int width, const int num_rois, const int pooled_h, const int pooled_w, at::Tensor output, at::Tensor argmax) { const int output_size = num_rois * channels * pooled_h * pooled_w; AT_DISPATCH_FLOATING_TYPES_AND_HALF( features.type(), "ROIPoolLaucherForward", ([&] { const scalar_t *bottom_data = features.data(); const scalar_t *rois_data = rois.data(); scalar_t *top_data = output.data(); int *argmax_data = argmax.data(); ROIPoolForward <<>>( output_size, bottom_data, rois_data, scalar_t(spatial_scale), channels, height, width, pooled_h, pooled_w, top_data, argmax_data); })); THCudaCheck(cudaGetLastError()); return 1; } template __global__ void ROIPoolBackward(const int nthreads, const scalar_t *top_diff, const scalar_t *rois, const int *argmax_data, const scalar_t spatial_scale, const int channels, const int height, const int width, const int pooled_h, const int pooled_w, scalar_t *bottom_diff) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int pw = index % pooled_w; int ph = (index / pooled_w) % pooled_h; int c = (index / pooled_w / pooled_h) % channels; int n = index / pooled_w / pooled_h / channels; int roi_batch_ind = rois[n * 5]; int bottom_index = argmax_data[(n * channels + c) * pooled_h * pooled_w + ph * pooled_w + pw]; atomicAdd(bottom_diff + (roi_batch_ind * channels + c) * height * width + bottom_index, top_diff[index]); } } int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, const at::Tensor argmax, const float spatial_scale, const int batch_size, const int channels, const int height, const int width, const int num_rois, const int pooled_h, const int pooled_w, at::Tensor bottom_grad) { const int output_size = num_rois * pooled_h * pooled_w * channels; AT_DISPATCH_FLOATING_TYPES_AND_HALF( top_grad.type(), "ROIPoolLaucherBackward", ([&] { const scalar_t *top_diff = top_grad.data(); const scalar_t *rois_data = rois.data(); const int *argmax_data = argmax.data(); scalar_t *bottom_diff = bottom_grad.data(); if (sizeof(scalar_t) == sizeof(double)) { fprintf(stderr, "double is not supported\n"); exit(-1); } ROIPoolBackward <<>>( output_size, top_diff, rois_data, argmax_data, scalar_t(spatial_scale), channels, height, width, pooled_h, pooled_w, bottom_diff); })); THCudaCheck(cudaGetLastError()); return 1; } ================================================ FILE: mmdetection/mmdet/ops/sigmoid_focal_loss/__init__.py ================================================ from .modules.sigmoid_focal_loss import SigmoidFocalLoss, sigmoid_focal_loss __all__ = ['SigmoidFocalLoss', 'sigmoid_focal_loss'] ================================================ FILE: mmdetection/mmdet/ops/sigmoid_focal_loss/functions/__init__.py ================================================ ================================================ FILE: mmdetection/mmdet/ops/sigmoid_focal_loss/functions/sigmoid_focal_loss.py ================================================ import torch.nn.functional as F from torch.autograd import Function from torch.autograd.function import once_differentiable from .. import sigmoid_focal_loss_cuda class SigmoidFocalLossFunction(Function): @staticmethod def forward(ctx, input, target, gamma=2.0, alpha=0.25, reduction='mean'): ctx.save_for_backward(input, target) num_classes = input.shape[1] ctx.num_classes = num_classes ctx.gamma = gamma ctx.alpha = alpha loss = sigmoid_focal_loss_cuda.forward(input, target, num_classes, gamma, alpha) reduction_enum = F._Reduction.get_enum(reduction) # none: 0, mean:1, sum: 2 if reduction_enum == 0: return loss elif reduction_enum == 1: return loss.mean() elif reduction_enum == 2: return loss.sum() @staticmethod @once_differentiable def backward(ctx, d_loss): input, target = ctx.saved_tensors num_classes = ctx.num_classes gamma = ctx.gamma alpha = ctx.alpha d_loss = d_loss.contiguous() d_input = sigmoid_focal_loss_cuda.backward(input, target, d_loss, num_classes, gamma, alpha) return d_input, None, None, None, None sigmoid_focal_loss = SigmoidFocalLossFunction.apply ================================================ FILE: mmdetection/mmdet/ops/sigmoid_focal_loss/modules/__init__.py ================================================ ================================================ FILE: mmdetection/mmdet/ops/sigmoid_focal_loss/modules/sigmoid_focal_loss.py ================================================ from torch import nn from ..functions.sigmoid_focal_loss import sigmoid_focal_loss class SigmoidFocalLoss(nn.Module): def __init__(self, gamma, alpha): super(SigmoidFocalLoss, self).__init__() self.gamma = gamma self.alpha = alpha def forward(self, logits, targets): assert logits.is_cuda loss = sigmoid_focal_loss(logits, targets, self.gamma, self.alpha) return loss.sum() def __repr__(self): tmpstr = self.__class__.__name__ + "(" tmpstr += "gamma=" + str(self.gamma) tmpstr += ", alpha=" + str(self.alpha) tmpstr += ")" return tmpstr ================================================ FILE: mmdetection/mmdet/ops/sigmoid_focal_loss/setup.py ================================================ from setuptools import setup from torch.utils.cpp_extension import BuildExtension, CUDAExtension setup( name='SigmoidFocalLoss', ext_modules=[ CUDAExtension('sigmoid_focal_loss_cuda', [ 'src/sigmoid_focal_loss.cpp', 'src/sigmoid_focal_loss_cuda.cu', ]), ], cmdclass={'build_ext': BuildExtension}) ================================================ FILE: mmdetection/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss.cpp ================================================ // modify from // https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h #include at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits, const at::Tensor &targets, const int num_classes, const float gamma, const float alpha); at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits, const at::Tensor &targets, const at::Tensor &d_losses, const int num_classes, const float gamma, const float alpha); // Interface for Python at::Tensor SigmoidFocalLoss_forward(const at::Tensor &logits, const at::Tensor &targets, const int num_classes, const float gamma, const float alpha) { if (logits.type().is_cuda()) { return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); } } at::Tensor SigmoidFocalLoss_backward(const at::Tensor &logits, const at::Tensor &targets, const at::Tensor &d_losses, const int num_classes, const float gamma, const float alpha) { if (logits.type().is_cuda()) { return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); } } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss forward (CUDA)"); m.def("backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss backward (CUDA)"); } ================================================ FILE: mmdetection/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_cuda.cu ================================================ // modify from // https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // This file is modified from // https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu // Cheng-Yang Fu // cyfu@cs.unc.edu #include #include #include #include #include #include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) template __global__ void SigmoidFocalLossForward(const int nthreads, const scalar_t *logits, const long *targets, const int num_classes, const float gamma, const float alpha, const int num, scalar_t *losses) { CUDA_1D_KERNEL_LOOP(i, nthreads) { int n = i / num_classes; int d = i % num_classes; // current class[0~79]; int t = targets[n]; // target class [1~80]; // Decide it is positive or negative case. scalar_t c1 = (t == (d + 1)); scalar_t c2 = (t >= 0 & t != (d + 1)); scalar_t zn = (1.0 - alpha); scalar_t zp = (alpha); // p = 1. / 1. + expf(-x); p = sigmoid(x) scalar_t p = 1. / (1. + expf(-logits[i])); // (1-p)**gamma * log(p) where scalar_t term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN)); // p**gamma * log(1-p) scalar_t term2 = powf(p, gamma) * (-1. * logits[i] * (logits[i] >= 0) - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))); losses[i] = 0.0; losses[i] += -c1 * term1 * zp; losses[i] += -c2 * term2 * zn; } // CUDA_1D_KERNEL_LOOP } // SigmoidFocalLossForward template __global__ void SigmoidFocalLossBackward( const int nthreads, const scalar_t *logits, const long *targets, const scalar_t *d_losses, const int num_classes, const float gamma, const float alpha, const int num, scalar_t *d_logits) { CUDA_1D_KERNEL_LOOP(i, nthreads) { int n = i / num_classes; int d = i % num_classes; // current class[0~79]; int t = targets[n]; // target class [1~80], 0 is background; // Decide it is positive or negative case. scalar_t c1 = (t == (d + 1)); scalar_t c2 = (t >= 0 & t != (d + 1)); scalar_t zn = (1.0 - alpha); scalar_t zp = (alpha); // p = 1. / 1. + expf(-x); p = sigmoid(x) scalar_t p = 1. / (1. + expf(-logits[i])); // (1-p)**g * (1 - p - g*p*log(p) scalar_t term1 = powf((1. - p), gamma) * (1. - p - (p * gamma * logf(max(p, FLT_MIN)))); // (p**g) * (g*(1-p)*log(1-p) - p) scalar_t term2 = powf(p, gamma) * ((-1. * logits[i] * (logits[i] >= 0) - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * (1. - p) * gamma - p); d_logits[i] = 0.0; d_logits[i] += -c1 * term1 * zp; d_logits[i] += -c2 * term2 * zn; d_logits[i] = d_logits[i] * d_losses[i]; } // CUDA_1D_KERNEL_LOOP } // SigmoidFocalLossBackward at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits, const at::Tensor &targets, const int num_classes, const float gamma, const float alpha) { AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); const int num_samples = logits.size(0); auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); auto losses_size = num_samples * logits.size(1); dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L)); dim3 block(512); if (losses.numel() == 0) { THCudaCheck(cudaGetLastError()); return losses; } AT_DISPATCH_FLOATING_TYPES_AND_HALF( logits.type(), "SigmoidFocalLoss_forward", [&] { SigmoidFocalLossForward<<>>( losses_size, logits.contiguous().data(), targets.contiguous().data(), num_classes, gamma, alpha, num_samples, losses.data()); }); THCudaCheck(cudaGetLastError()); return losses; } at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits, const at::Tensor &targets, const at::Tensor &d_losses, const int num_classes, const float gamma, const float alpha) { AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor"); AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); const int num_samples = logits.size(0); AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes"); auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); auto d_logits_size = num_samples * logits.size(1); dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L)); dim3 block(512); if (d_logits.numel() == 0) { THCudaCheck(cudaGetLastError()); return d_logits; } AT_DISPATCH_FLOATING_TYPES_AND_HALF( logits.type(), "SigmoidFocalLoss_backward", [&] { SigmoidFocalLossBackward<<>>( d_logits_size, logits.contiguous().data(), targets.contiguous().data(), d_losses.contiguous().data(), num_classes, gamma, alpha, num_samples, d_logits.data()); }); THCudaCheck(cudaGetLastError()); return d_logits; } ================================================ FILE: mmdetection/setup.py ================================================ import os import subprocess import time from setuptools import find_packages, setup def readme(): with open('README.md', encoding='utf-8') as f: content = f.read() return content MAJOR = 0 MINOR = 6 PATCH = 0 SUFFIX = '' SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX) version_file = 'mmdet/version.py' def get_git_hash(): def _minimal_ext_cmd(cmd): # construct minimal environment env = {} for k in ['SYSTEMROOT', 'PATH', 'HOME']: v = os.environ.get(k) if v is not None: env[k] = v # LANGUAGE is used on win32 env['LANGUAGE'] = 'C' env['LANG'] = 'C' env['LC_ALL'] = 'C' out = subprocess.Popen( cmd, stdout=subprocess.PIPE, env=env).communicate()[0] return out try: out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) sha = out.strip().decode('ascii') except OSError: sha = 'unknown' return sha def get_hash(): if os.path.exists('.git'): sha = get_git_hash()[:7] elif os.path.exists(version_file): try: from mmdet.version import __version__ sha = __version__.split('+')[-1] except ImportError: raise ImportError('Unable to get git version') else: sha = 'unknown' return sha def write_version_py(): content = """# GENERATED VERSION FILE # TIME: {} __version__ = '{}' short_version = '{}' """ sha = get_hash() VERSION = SHORT_VERSION + '+' + sha with open(version_file, 'w') as f: f.write(content.format(time.asctime(), VERSION, SHORT_VERSION)) def get_version(): with open(version_file, 'r') as f: exec(compile(f.read(), version_file, 'exec')) return locals()['__version__'] if __name__ == '__main__': write_version_py() setup( name='mmdet', version=get_version(), description='Open MMLab Detection Toolbox', long_description=readme(), keywords='computer vision, object detection', url='https://github.com/open-mmlab/mmdetection', packages=find_packages(exclude=('configs', 'tools', 'demo')), package_data={'mmdet.ops': ['*/*.so']}, classifiers=[ 'Development Status :: 4 - Beta', 'License :: OSI Approved :: Apache Software License', 'Operating System :: OS Independent', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', ], license='GPLv3', setup_requires=['pytest-runner'], tests_require=['pytest'], install_requires=[ 'mmcv>=0.2.6', 'numpy', 'matplotlib', 'six', 'terminaltables', 'pycocotools' ], zip_safe=False) ================================================ FILE: mmdetection/tools/analyze_logs.py ================================================ import argparse import json from collections import defaultdict import matplotlib.pyplot as plt import numpy as np import seaborn as sns def cal_train_time(log_dicts, args): for i, log_dict in enumerate(log_dicts): print('{}Analyze train time of {}{}'.format('-' * 5, args.json_logs[i], '-' * 5)) all_times = [] for epoch in log_dict.keys(): if args.include_outliers: all_times.append(log_dict[epoch]['time']) else: all_times.append(log_dict[epoch]['time'][1:]) all_times = np.array(all_times) epoch_ave_time = all_times.mean(-1) slowest_epoch = epoch_ave_time.argmax() fastest_epoch = epoch_ave_time.argmin() std_over_epoch = epoch_ave_time.std() print('slowest epoch {}, average time is {:.4f}'.format( slowest_epoch + 1, epoch_ave_time[slowest_epoch])) print('fastest epoch {}, average time is {:.4f}'.format( fastest_epoch + 1, epoch_ave_time[fastest_epoch])) print('time std over epochs is {:.4f}'.format(std_over_epoch)) print('average iter time: {:.4f} s/iter'.format(np.mean(all_times))) print() def plot_curve(log_dicts, args): if args.backend is not None: plt.switch_backend(args.backend) sns.set_style(args.style) # if legend is None, use {filename}_{key} as legend legend = args.legend if legend is None: legend = [] for json_log in args.json_logs: for metric in args.keys: legend.append('{}_{}'.format(json_log, metric)) assert len(legend) == (len(args.json_logs) * len(args.keys)) metrics = args.keys num_metrics = len(metrics) for i, log_dict in enumerate(log_dicts): epochs = list(log_dict.keys()) for j, metric in enumerate(metrics): print('plot curve of {}, metric is {}'.format( args.json_logs[i], metric)) assert metric in log_dict[ epochs[0]], '{} does not contain metric {}'.format( args.json_logs[i], metric) if 'mAP' in metric: xs = np.arange(1, max(epochs) + 1) ys = [] for epoch in epochs: ys += log_dict[epoch][metric] ax = plt.gca() ax.set_xticks(xs) plt.xlabel('epoch') plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o') else: xs = [] ys = [] num_iters_per_epoch = log_dict[epochs[0]]['iter'][-1] for epoch in epochs: iters = log_dict[epoch]['iter'] if log_dict[epoch]['mode'][-1] == 'val': iters = iters[:-1] xs.append( np.array(iters) + (epoch - 1) * num_iters_per_epoch) ys.append(np.array(log_dict[epoch][metric][:len(iters)])) xs = np.concatenate(xs) ys = np.concatenate(ys) plt.xlabel('iter') plt.plot( xs, ys, label=legend[i * num_metrics + j], linewidth=0.5) plt.legend() if args.title is not None: plt.title(args.title) if args.out is None: plt.show() else: print('save curve to: {}'.format(args.out)) plt.savefig(args.out) plt.cla() def add_plot_parser(subparsers): parser_plt = subparsers.add_parser( 'plot_curve', help='parser for plotting curves') parser_plt.add_argument( 'json_logs', type=str, nargs='+', help='path of train log in json format') parser_plt.add_argument( '--keys', type=str, nargs='+', default=['bbox_mAP'], help='the metric that you want to plot') parser_plt.add_argument('--title', type=str, help='title of figure') parser_plt.add_argument( '--legend', type=str, nargs='+', default=None, help='legend of each plot') parser_plt.add_argument( '--backend', type=str, default=None, help='backend of plt') parser_plt.add_argument( '--style', type=str, default='dark', help='style of plt') parser_plt.add_argument('--out', type=str, default=None) def add_time_parser(subparsers): parser_time = subparsers.add_parser( 'cal_train_time', help='parser for computing the average time per training iteration') parser_time.add_argument( 'json_logs', type=str, nargs='+', help='path of train log in json format') parser_time.add_argument( '--include-outliers', action='store_true', help='include the first value of every epoch when computing ' 'the average time') def parse_args(): parser = argparse.ArgumentParser(description='Analyze Json Log') # currently only support plot curve and calculate average train time subparsers = parser.add_subparsers(dest='task', help='task parser') add_plot_parser(subparsers) add_time_parser(subparsers) args = parser.parse_args() return args def load_json_logs(json_logs): # load and convert json_logs to log_dict, key is epoch, value is a sub dict # keys of sub dict is different metrics, e.g. memory, bbox_mAP # value of sub dict is a list of corresponding values of all iterations log_dicts = [dict() for _ in json_logs] for json_log, log_dict in zip(json_logs, log_dicts): with open(json_log, 'r') as log_file: for l in log_file: log = json.loads(l.strip()) epoch = log.pop('epoch') if epoch not in log_dict: log_dict[epoch] = defaultdict(list) for k, v in log.items(): log_dict[epoch][k].append(v) return log_dicts def main(): args = parse_args() json_logs = args.json_logs for json_log in json_logs: assert json_log.endswith('.json') log_dicts = load_json_logs(json_logs) eval(args.task)(log_dicts, args) if __name__ == '__main__': main() ================================================ FILE: mmdetection/tools/coco_eval.py ================================================ from argparse import ArgumentParser from mmdet.core import coco_eval def main(): parser = ArgumentParser(description='COCO Evaluation') parser.add_argument('result', help='result file path') parser.add_argument('--ann', help='annotation file path') parser.add_argument( '--types', type=str, nargs='+', choices=['proposal_fast', 'proposal', 'bbox', 'segm', 'keypoint'], default=['bbox'], help='result types') parser.add_argument( '--max-dets', type=int, nargs='+', default=[100, 300, 1000], help='proposal numbers, only used for recall evaluation') args = parser.parse_args() coco_eval(args.result, args.types, args.ann, args.max_dets) if __name__ == '__main__': main() ================================================ FILE: mmdetection/tools/convert_datasets/pascal_voc.py ================================================ import argparse import os.path as osp import xml.etree.ElementTree as ET import mmcv import numpy as np from mmdet.core import voc_classes label_ids = {name: i + 1 for i, name in enumerate(voc_classes())} def parse_xml(args): xml_path, img_path = args tree = ET.parse(xml_path) root = tree.getroot() size = root.find('size') w = int(size.find('width').text) h = int(size.find('height').text) bboxes = [] labels = [] bboxes_ignore = [] labels_ignore = [] for obj in root.findall('object'): name = obj.find('name').text label = label_ids[name] difficult = int(obj.find('difficult').text) bnd_box = obj.find('bndbox') bbox = [ int(bnd_box.find('xmin').text), int(bnd_box.find('ymin').text), int(bnd_box.find('xmax').text), int(bnd_box.find('ymax').text) ] if difficult: bboxes_ignore.append(bbox) labels_ignore.append(label) else: bboxes.append(bbox) labels.append(label) if not bboxes: bboxes = np.zeros((0, 4)) labels = np.zeros((0, )) else: bboxes = np.array(bboxes, ndmin=2) - 1 labels = np.array(labels) if not bboxes_ignore: bboxes_ignore = np.zeros((0, 4)) labels_ignore = np.zeros((0, )) else: bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1 labels_ignore = np.array(labels_ignore) annotation = { 'filename': img_path, 'width': w, 'height': h, 'ann': { 'bboxes': bboxes.astype(np.float32), 'labels': labels.astype(np.int64), 'bboxes_ignore': bboxes_ignore.astype(np.float32), 'labels_ignore': labels_ignore.astype(np.int64) } } return annotation def cvt_annotations(devkit_path, years, split, out_file): if not isinstance(years, list): years = [years] annotations = [] for year in years: filelist = osp.join(devkit_path, 'VOC{}/ImageSets/Main/{}.txt'.format( year, split)) if not osp.isfile(filelist): print('filelist does not exist: {}, skip voc{} {}'.format( filelist, year, split)) return img_names = mmcv.list_from_file(filelist) xml_paths = [ osp.join(devkit_path, 'VOC{}/Annotations/{}.xml'.format( year, img_name)) for img_name in img_names ] img_paths = [ 'VOC{}/JPEGImages/{}.jpg'.format(year, img_name) for img_name in img_names ] part_annotations = mmcv.track_progress(parse_xml, list(zip(xml_paths, img_paths))) annotations.extend(part_annotations) mmcv.dump(annotations, out_file) return annotations def parse_args(): parser = argparse.ArgumentParser( description='Convert PASCAL VOC annotations to mmdetection format') parser.add_argument('devkit_path', help='pascal voc devkit path') parser.add_argument('-o', '--out-dir', help='output path') args = parser.parse_args() return args def main(): args = parse_args() devkit_path = args.devkit_path out_dir = args.out_dir if args.out_dir else devkit_path mmcv.mkdir_or_exist(out_dir) years = [] if osp.isdir(osp.join(devkit_path, 'VOC2007')): years.append('2007') if osp.isdir(osp.join(devkit_path, 'VOC2012')): years.append('2012') if '2007' in years and '2012' in years: years.append(['2007', '2012']) if not years: raise IOError('The devkit path {} contains neither "VOC2007" nor ' '"VOC2012" subfolder'.format(devkit_path)) for year in years: if year == '2007': prefix = 'voc07' elif year == '2012': prefix = 'voc12' elif year == ['2007', '2012']: prefix = 'voc0712' for split in ['train', 'val', 'trainval']: dataset_name = prefix + '_' + split print('processing {} ...'.format(dataset_name)) cvt_annotations(devkit_path, year, split, osp.join(out_dir, dataset_name + '.pkl')) if not isinstance(year, list): dataset_name = prefix + '_test' print('processing {} ...'.format(dataset_name)) cvt_annotations(devkit_path, year, 'test', osp.join(out_dir, dataset_name + '.pkl')) print('Done!') if __name__ == '__main__': main() ================================================ FILE: mmdetection/tools/dist_test.sh ================================================ #!/usr/bin/env bash PYTHON=${PYTHON:-"python"} CONFIG=$1 CHECKPOINT=$2 GPUS=$3 $PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS \ $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} ================================================ FILE: mmdetection/tools/dist_train.sh ================================================ #!/usr/bin/env bash PYTHON=${PYTHON:-"python"} CONFIG=$1 GPUS=$2 $PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS \ $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} ================================================ FILE: mmdetection/tools/publish_model.py ================================================ import argparse import subprocess import torch def parse_args(): parser = argparse.ArgumentParser( description='Process a checkpoint to be published') parser.add_argument('in_file', help='input checkpoint filename') parser.add_argument('out_file', help='output checkpoint filename') args = parser.parse_args() return args def process_checkpoint(in_file, out_file): checkpoint = torch.load(in_file, map_location='cpu') # remove optimizer for smaller file size if 'optimizer' in checkpoint: del checkpoint['optimizer'] # if it is necessary to remove some sensitive data in checkpoint['meta'], # add the code here. torch.save(checkpoint, out_file) sha = subprocess.check_output(['sha256sum', out_file]).decode() final_file = out_file.rstrip('.pth') + '-{}.pth'.format(sha[:8]) subprocess.Popen(['mv', out_file, final_file]) def main(): args = parse_args() process_checkpoint(args.in_file, args.out_file) if __name__ == '__main__': main() ================================================ FILE: mmdetection/tools/slurm_test.sh ================================================ #!/usr/bin/env bash set -x PARTITION=$1 JOB_NAME=$2 CONFIG=$3 CHECKPOINT=$4 GPUS=${GPUS:-8} GPUS_PER_NODE=${GPUS_PER_NODE:-8} CPUS_PER_TASK=${CPUS_PER_TASK:-5} PY_ARGS=${@:5} SRUN_ARGS=${SRUN_ARGS:-""} srun -p ${PARTITION} \ --job-name=${JOB_NAME} \ --gres=gpu:${GPUS_PER_NODE} \ --ntasks=${GPUS} \ --ntasks-per-node=${GPUS_PER_NODE} \ --cpus-per-task=${CPUS_PER_TASK} \ --kill-on-bad-exit=1 \ ${SRUN_ARGS} \ python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} ================================================ FILE: mmdetection/tools/slurm_train.sh ================================================ #!/usr/bin/env bash set -x PARTITION=$1 JOB_NAME=$2 CONFIG=$3 WORK_DIR=$4 GPUS=${5:-8} GPUS_PER_NODE=${GPUS_PER_NODE:-8} CPUS_PER_TASK=${CPUS_PER_TASK:-5} SRUN_ARGS=${SRUN_ARGS:-""} PY_ARGS=${PY_ARGS:-"--validate"} srun -p ${PARTITION} \ --job-name=${JOB_NAME} \ --gres=gpu:${GPUS_PER_NODE} \ --ntasks=${GPUS} \ --ntasks-per-node=${GPUS_PER_NODE} \ --cpus-per-task=${CPUS_PER_TASK} \ --kill-on-bad-exit=1 \ ${SRUN_ARGS} \ python -u tools/train.py ${CONFIG} --work_dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} ================================================ FILE: mmdetection/tools/test.py ================================================ import argparse import os.path as osp import shutil import tempfile import mmcv import torch import torch.distributed as dist from mmcv.runner import load_checkpoint, get_dist_info from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmdet.apis import init_dist from mmdet.core import results2json, coco_eval from mmdet.datasets import build_dataloader, get_dataset from mmdet.models import build_detector def single_gpu_test(model, data_loader, show=False): model.eval() results = [] dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=not show, **data) results.append(result) if show: model.module.show_result(data, result, dataset.img_norm_cfg) batch_size = data['img'][0].size(0) for _ in range(batch_size): prog_bar.update() return results def multi_gpu_test(model, data_loader, tmpdir=None): model.eval() results = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) results.append(result) if rank == 0: batch_size = data['img'][0].size(0) for _ in range(batch_size * world_size): prog_bar.update() # collect results from all ranks results = collect_results(results, len(dataset), tmpdir) return results def collect_results(result_part, size, tmpdir=None): rank, world_size = get_dist_info() # create a tmp dir if it is not specified if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8, device='cuda') if rank == 0: tmpdir = tempfile.mkdtemp() tmpdir = torch.tensor(bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank))) dist.barrier() # collect all parts if rank != 0: return None else: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i)) part_list.append(mmcv.load(part_file)) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] # remove tmp dir shutil.rmtree(tmpdir) return ordered_results def parse_args(): parser = argparse.ArgumentParser(description='MMDet test detector') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument('--out', help='output result file') parser.add_argument('--ann_file', default=None, type=str) parser.add_argument('--img_prefix', default=None, type=str) parser.add_argument( '--eval', type=str, nargs='+', choices=['proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'], help='eval types') parser.add_argument('--show', action='store_true', help='show results') parser.add_argument('--tmpdir', help='tmp dir for writing some results') parser.add_argument('--flip', action='store_true') parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() return args def main(): args = parse_args() if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = mmcv.Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True if args.ann_file is not None: cfg.data.test.ann_file = args.ann_file if args.img_prefix is not None: cfg.data.test.img_prefix = args.img_prefix if args.flip: cfg.data.test.flip_ratio = 1 # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = get_dataset(cfg.data.test) data_loader = build_dataloader(dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) load_checkpoint(model, args.checkpoint, map_location='cpu') if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show) else: model = MMDistributedDataParallel(model.cuda()) outputs = multi_gpu_test(model, data_loader, args.tmpdir) rank, _ = get_dist_info() if args.out and rank == 0: print('\nwriting results to {}'.format(args.out)) mmcv.dump(outputs, args.out) eval_types = args.eval if eval_types: print('Starting evaluate {}'.format(' and '.join(eval_types))) if eval_types == ['proposal_fast']: result_file = args.out coco_eval(result_file, eval_types, dataset.coco) else: if not isinstance(outputs[0], dict): result_file = args.out + '.json' results2json(dataset, outputs, result_file) coco_eval(result_file, eval_types, dataset.coco) else: for name in outputs[0]: print('\nEvaluating {}'.format(name)) outputs_ = [out[name] for out in outputs] result_file = args.out + '.{}.json'.format(name) results2json(dataset, outputs_, result_file) coco_eval(result_file, eval_types, dataset.coco) if __name__ == '__main__': main() ================================================ FILE: mmdetection/tools/test_ensemble.py ================================================ import argparse import os.path as osp import shutil import tempfile import mmcv import torch import torch.distributed as dist from mmcv.runner import load_checkpoint, get_dist_info from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmdet.apis import init_dist from mmdet.core import results2json, coco_eval from mmdet.datasets import build_dataloader, get_dataset from mmdet.models import build_detector from mmdet.models.detectors.ensemble_htc import EnsembleHTC def single_gpu_test(model, data_loader, show=False): model.eval() results = [] dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=not show, **data) results.append(result) if show: model.module.show_result(data, result, dataset.img_norm_cfg) batch_size = data['img'][0].size(0) for _ in range(batch_size): prog_bar.update() return results def multi_gpu_test(model, data_loader, tmpdir=None): model.eval() results = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) results.append(result) if rank == 0: batch_size = data['img'][0].size(0) for _ in range(batch_size * world_size): prog_bar.update() # collect results from all ranks results = collect_results(results, len(dataset), tmpdir) return results def collect_results(result_part, size, tmpdir=None): rank, world_size = get_dist_info() # create a tmp dir if it is not specified if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8, device='cuda') if rank == 0: tmpdir = tempfile.mkdtemp() tmpdir = torch.tensor(bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank))) dist.barrier() # collect all parts if rank != 0: return None else: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i)) part_list.append(mmcv.load(part_file)) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] # remove tmp dir shutil.rmtree(tmpdir) return ordered_results def parse_args(): parser = argparse.ArgumentParser(description='MMDet test detector') parser.add_argument('config', type=str) parser.add_argument('--checkpoint', type=str, nargs='+') parser.add_argument('--out', help='output result file') parser.add_argument('--ann_file', default=None, type=str) parser.add_argument('--img_prefix', default=None, type=str) parser.add_argument( '--eval', type=str, nargs='+', choices=['proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'], help='eval types') parser.add_argument('--show', action='store_true', help='show results') parser.add_argument('--tmpdir', help='tmp dir for writing some results') parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() return args def main(): args = parse_args() if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = mmcv.Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True if args.ann_file is not None: cfg.data.test.ann_file = args.ann_file if args.img_prefix is not None: cfg.data.test.img_prefix = args.img_prefix # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = get_dataset(cfg.data.test) data_loader = build_dataloader(dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) # build the model and load checkpoint models = [] print(f"checkpoints: {args.checkpoint}") for checkpoint in args.checkpoint: model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) load_checkpoint(model, checkpoint, map_location='cpu') models.append(model) model = EnsembleHTC(models) if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show) else: model = MMDistributedDataParallel(model.cuda()) outputs = multi_gpu_test(model, data_loader, args.tmpdir) rank, _ = get_dist_info() if args.out and rank == 0: print('\nwriting results to {}'.format(args.out)) mmcv.dump(outputs, args.out) eval_types = args.eval if eval_types: print('Starting evaluate {}'.format(' and '.join(eval_types))) if eval_types == ['proposal_fast']: result_file = args.out coco_eval(result_file, eval_types, dataset.coco) else: if not isinstance(outputs[0], dict): result_file = args.out + '.json' results2json(dataset, outputs, result_file) coco_eval(result_file, eval_types, dataset.coco) else: for name in outputs[0]: print('\nEvaluating {}'.format(name)) outputs_ = [out[name] for out in outputs] result_file = args.out + '.{}.json'.format(name) results2json(dataset, outputs_, result_file) coco_eval(result_file, eval_types, dataset.coco) if __name__ == '__main__': main() ================================================ FILE: mmdetection/tools/train.py ================================================ from __future__ import division import argparse from mmcv import Config from mmdet import __version__ from mmdet.datasets import get_dataset from mmdet.apis import (train_detector, init_dist, get_root_logger, set_random_seed) from mmdet.models import build_detector import torch def parse_args(): parser = argparse.ArgumentParser(description='Train a detector') parser.add_argument('config', help='train config file path') parser.add_argument('--work_dir', help='the dir to save logs and models') parser.add_argument( '--resume_from', help='the checkpoint file to resume from') parser.add_argument( '--validate', action='store_true', help='whether to evaluate the checkpoint during training') parser.add_argument( '--gpus', type=int, default=4, help='number of gpus to use ' '(only applicable to non-distributed training)') parser.add_argument('--seed', type=int, default=None, help='random seed') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() return args def main(): args = parse_args() cfg = Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # update configs according to CLI args if args.work_dir is not None: cfg.work_dir = args.work_dir if args.resume_from is not None: cfg.resume_from = args.resume_from cfg.gpus = args.gpus # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # init logger before other steps logger = get_root_logger(cfg.log_level) logger.info('Distributed training: {}'.format(distributed)) # set random seeds if args.seed is not None: logger.info('Set random seed to {}'.format(args.seed)) set_random_seed(args.seed) model = build_detector( cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) train_dataset = get_dataset(cfg.data.train) if cfg.checkpoint_config is not None: # save mmdet version, config file content and class names in # checkpoints as meta data cfg.checkpoint_config.meta = dict( mmdet_version=__version__, config=cfg.text, classes=train_dataset.CLASSES) # add an attribute for visualization convenience model.CLASSES = train_dataset.CLASSES train_detector( model, train_dataset, cfg, distributed=distributed, validate=args.validate, logger=logger) if __name__ == '__main__': main() ================================================ FILE: mmdetection/tools/upgrade_model_version.py ================================================ import argparse import re from collections import OrderedDict import torch def convert(in_file, out_file): """Convert keys in checkpoints. There can be some breaking changes during the development of mmdetection, and this tool is used for upgrading checkpoints trained with old versions to the latest one. """ checkpoint = torch.load(in_file) in_state_dict = checkpoint.pop('state_dict') out_state_dict = OrderedDict() for key, val in in_state_dict.items(): # Use ConvModule instead of nn.Conv2d in RetinaNet # cls_convs.0.weight -> cls_convs.0.conv.weight m = re.search(r'(cls_convs|reg_convs).\d.(weight|bias)', key) if m is not None: param = m.groups()[1] new_key = key.replace(param, 'conv.{}'.format(param)) out_state_dict[new_key] = val continue out_state_dict[key] = val checkpoint['state_dict'] = out_state_dict torch.save(checkpoint, out_file) def main(): parser = argparse.ArgumentParser(description='Upgrade model version') parser.add_argument('in_file', help='input checkpoint file') parser.add_argument('out_file', help='output checkpoint file') args = parser.parse_args() convert(args.in_file, args.out_file) if __name__ == '__main__': main() ================================================ FILE: mmdetection/tools/voc_eval.py ================================================ from argparse import ArgumentParser import mmcv import numpy as np from mmdet import datasets from mmdet.core import eval_map def voc_eval(result_file, dataset, iou_thr=0.5): det_results = mmcv.load(result_file) gt_bboxes = [] gt_labels = [] gt_ignore = [] for i in range(len(dataset)): ann = dataset.get_ann_info(i) bboxes = ann['bboxes'] labels = ann['labels'] if 'bboxes_ignore' in ann: ignore = np.concatenate([ np.zeros(bboxes.shape[0], dtype=np.bool), np.ones(ann['bboxes_ignore'].shape[0], dtype=np.bool) ]) gt_ignore.append(ignore) bboxes = np.vstack([bboxes, ann['bboxes_ignore']]) labels = np.concatenate([labels, ann['labels_ignore']]) gt_bboxes.append(bboxes) gt_labels.append(labels) if not gt_ignore: gt_ignore = gt_ignore if hasattr(dataset, 'year') and dataset.year == 2007: dataset_name = 'voc07' else: dataset_name = dataset.CLASSES eval_map( det_results, gt_bboxes, gt_labels, gt_ignore=gt_ignore, scale_ranges=None, iou_thr=iou_thr, dataset=dataset_name, print_summary=True) def main(): parser = ArgumentParser(description='VOC Evaluation') parser.add_argument('result', help='result file path') parser.add_argument('config', help='config file path') parser.add_argument( '--iou-thr', type=float, default=0.5, help='IoU threshold for evaluation') args = parser.parse_args() cfg = mmcv.Config.fromfile(args.config) test_dataset = mmcv.runner.obj_from_dict(cfg.data.test, datasets) voc_eval(args.result, test_dataset, args.iou_thr) if __name__ == '__main__': main() ================================================ FILE: scrips/create_mmdetection_test.sh ================================================ #!/usr/bin/env bash PYTHONPATH=/kaggle-imaterialist python /kaggle-imaterialist/src/create_mmdetection_test.py \ --annotation=/data/sample_submission.csv \ --root=/data/test \ --output=/data/test_mmdetection.pkl ================================================ FILE: scrips/create_mmdetection_train.sh ================================================ #!/usr/bin/env bash PYTHONPATH=/kaggle-imaterialist python /kaggle-imaterialist/src/create_mmdetection_train.py \ --annotation=/data/train.csv.zip \ --root=/data/train \ --output=/data/train_mmdetection.pkl ================================================ FILE: scrips/dist_test.sh ================================================ #!/usr/bin/env bash PYTHON=${PYTHON:-"python"} CONFIG=$1 CHECKPOINT=$2 GPUS=$3 $PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS \ /kaggle-imaterialist/mmdetection/tools/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} ================================================ FILE: scrips/dist_test_ensemble.sh ================================================ #!/usr/bin/env bash CONFIG=$1 GPUS=$2 MMDETECTION_PREDICTIONS=/data/test_ensemble_predictions.pkl SUBMISSION=/data/test_ensemble_submission.csv SUBMISSION_WA=/data/test_ensemble_submission_wa.csv python -m torch.distributed.launch --nproc_per_node=$GPUS /kaggle-imaterialist/mmdetection/tools/test_ensemble.py $CONFIG \ --checkpoint /dumps/htc_dconv_c3-c5_mstrain_x101_64x4d_fpn_20e/epoch_14.pth \ /dumps/htc_dconv_c3-c5_mstrain_x101_64x4d_fpn_20e/epoch_15.pth \ /dumps/htc_dconv_c3-c5_mstrain_x101_64x4d_fpn_20e/epoch_17.pth \ --launcher pytorch ${@:4} \ --out=${MMDETECTION_PREDICTIONS} PYTHONPATH=/kaggle-imaterialist python /kaggle-imaterialist/src/submit.py \ --annotation=/data/test_mmdetection.pkl \ --predictions=${MMDETECTION_PREDICTIONS} \ --output=${SUBMISSION} PYTHONPATH=/kaggle-imaterialist python /kaggle-imaterialist/src/rm_attribute_classes.py \ --submission=${SUBMISSION} \ --output=${SUBMISSION_WA} ================================================ FILE: scrips/dist_train.sh ================================================ #!/usr/bin/env bash PYTHON=${PYTHON:-"python"} CONFIG=$1 GPUS=$2 $PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS \ /kaggle-imaterialist/mmdetection/tools/train.py $CONFIG --launcher pytorch ${@:3} ================================================ FILE: scrips/prepare_weights.sh ================================================ #!/usr/bin/env bash mkdir /dumps wget https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e_20190408-0e50669c.pth -O /dumps/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e_20190408-0e50669c.pth python /kaggle-imaterialist/src/prune.py \ --weights=/dumps/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e_20190408-0e50669c.pth \ --output=/dumps/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e_20190408-0e50669c_prune.pth ================================================ FILE: scrips/split.sh ================================================ #!/usr/bin/env bash PYTHONPATH=/kaggle-imaterialist python /kaggle-imaterialist/src/split.py \ --annotation=/data/train_mmdetection.pkl \ --train_output=/data/train_99_mmdetection.pkl \ --val_output=/data/val_01_mmdetection.pkl ================================================ FILE: src/__init__.py ================================================ ================================================ FILE: src/create_mmdetection_test.py ================================================ import argparse import os import pickle from functools import partial from multiprocessing import Pool import jpeg4py as jpeg import pandas as pd from tqdm import tqdm def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--annotation', type=str) parser.add_argument('--root', type=str) parser.add_argument('--output', type=str) parser.add_argument('--n_jobs', type=int, default=40) return parser.parse_args() def convert(group: dict, root) -> dict: image_id, group = group image = jpeg.JPEG(os.path.join(root, image_id)).decode() height, width = image.shape[:2] return { 'filename': image_id, 'width': width, 'height': height, 'ann': { 'bboxes': None, 'labels': None, 'masks': None } } def main(): args = parse_args() annotation = pd.read_csv(args.annotation) print(len(annotation), len(set(annotation['ImageId']))) groups = list(annotation.groupby('ImageId')) with Pool(args.n_jobs) as p: samples = list(tqdm(iterable=p.imap_unordered(partial(convert, root=args.root), groups), total=len(groups))) with open(args.output, 'wb') as f: pickle.dump(samples, f) if __name__ == '__main__': main() ================================================ FILE: src/create_mmdetection_train.py ================================================ import argparse import os import pickle from multiprocessing import Pool import pandas as pd from tqdm import tqdm from src.utils import group2mmdetection def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--annotation', type=str) parser.add_argument('--root', type=str) parser.add_argument('--output', type=str) parser.add_argument('--n_jobs', type=int, default=80) parser.add_argument('--n_samples', type=int, default=-1) return parser.parse_args() def main(): args = parse_args() annotation = pd.read_csv(args.annotation) files = sorted(os.listdir(args.root)) if args.n_samples != -1: files = files[:args.n_samples] annotation = annotation.loc[annotation['ImageId'].isin(set(files))] print(len(annotation), len(set(annotation['ImageId']))) groups = list(annotation.groupby('ImageId')) with Pool(args.n_jobs) as p: samples = list(tqdm(iterable=p.imap_unordered(group2mmdetection, groups), total=len(groups))) with open(args.output, 'wb') as f: pickle.dump(samples, f) if __name__ == '__main__': main() ================================================ FILE: src/draw.py ================================================ import argparse import numpy as np from multiprocessing import Pool import cv2 import os import json from tqdm import tqdm import mmcv import os.path as osp from functools import partial import pycocotools.mask as mutils import pandas as pd N_CLASSES = 46 def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--annotation', type=str) parser.add_argument('--predictions', type=str) parser.add_argument('--submission', type=str, default=None) parser.add_argument('--classes', type=str) parser.add_argument('--root', type=str) parser.add_argument('--output', type=str) parser.add_argument('--n_jobs', type=int, default=80) parser.add_argument('--metric_threshold', type=float, default=0.1) return parser.parse_args() def get_spaced_colors(n, start_color=(75, 0, 130)): r, g, b = start_color step = 256 / n colors = [] for i in range(n): r += step g += step b += step colors.append((int(r) % 256, int(g) % 256, int(b) % 256)) return np.random.permutation(colors).reshape(-1, 1, 3).astype(np.uint8) def put_text(img, color, text, i, x_shift=10, y_shift=10): font = cv2.FONT_HERSHEY_SIMPLEX text_size = cv2.getTextSize(text, font, 1, 2)[0] text_x, text_y = x_shift, y_shift + (i + 1) * text_size[1] * 2 img = cv2.putText(img, text, (text_x, text_y), font, 1.5, tuple(map(int, color)), 3) return img def draw_masks(img, masks, colors, classes): if masks is not None: assert len(colors) == len(masks) mask_colors = [colors[i] for i, mask in enumerate(masks) if mask] mask_classes = [classes[i] for i, mask in enumerate(masks) if mask] masks = mmcv.concat_list(masks) for i, (mask, color, cls) in enumerate(zip(masks, mask_colors, mask_classes)): mask = mutils.decode(mask).astype(np.bool) img[mask] = img[mask] * 0.5 + color * 0.5 img = put_text(img, color[0], cls, i) return img def get_gt_masks(annotation): gt_masks = [[] for _ in range(N_CLASSES)] for mask, label in zip(annotation['ann']['masks'], annotation['ann']['labels']): gt_masks[label - 1].append(mask) return gt_masks def draw(args, root, output, metric_threshold, colors, classes): prediction, annotation, metric = args if metric is None: output_filename = annotation['filename'] elif metric < metric_threshold: output_filename = f"{100 * metric:0.3f}_{annotation['filename']}" else: return None img = cv2.imread(osp.join(root, annotation['filename']))[..., ::-1] _, prediction_masks = prediction prediction_img = draw_masks(img.copy(), prediction_masks, colors, classes) gt_img = draw_masks(img.copy(), get_gt_masks(annotation), colors, classes) output_image = np.hstack([img, gt_img, prediction_img]) return cv2.imwrite(osp.join(output, output_filename), output_image[..., ::-1]) def main(): args = parse_args() predictions = mmcv.load(args.predictions) annotation = mmcv.load(args.annotation) classes = json.load(open(args.classes)) classes = [x['name'] for x in classes['categories']] if args.submission is not None: submission = pd.read_csv(args.submission) submission = submission.drop_duplicates('ImageId') id2metric = dict(zip(submission['ImageId'], submission['mAP'])) metrics = [id2metric[x['filename']] for x in annotation] else: metrics = [None for _ in annotation] os.makedirs(args.output, exist_ok=True) colors = get_spaced_colors(N_CLASSES) partial_draw = partial( draw, root=args.root, output=args.output, metric_threshold=args.metric_threshold, colors=colors, classes=classes ) with Pool(args.n_jobs) as p: list( tqdm( iterable=p.imap_unordered(partial_draw, zip(predictions, annotation, metrics)), total=len(predictions) ) ) if __name__ == '__main__': main() ================================================ FILE: src/eda.py ================================================ import argparse from mmdet.datasets import get_dataset import numpy as np import mmcv from mmcv import Config import os import cv2 from tqdm import tqdm import os.path as osp from src.visualization import draw_bounding_boxes_on_image_array def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('config', type=str) parser.add_argument('--output', type=str) return parser.parse_args() def draw_masks(img, masks): if masks is not None: for i in range(masks.shape[-1]): mask = masks[..., i] color_mask = np.random.randint(0, 256, (1, 3), dtype=np.uint8) img[mask] = img[mask] * 0.5 + color_mask * 0.5 return img def main(): args = parse_args() os.makedirs(args.output, exist_ok=True) cfg = Config.fromfile(args.config) dataset = get_dataset(cfg.data.train) for i in tqdm(np.random.randint(0, len(dataset), 500)): data = dataset[i] img = data['img'].data.numpy().transpose(1, 2, 0) masks = data['gt_masks'].data.transpose(1, 2, 0).astype(bool) bboxes = data['gt_bboxes'].data.numpy() img = mmcv.imdenormalize(img, mean=cfg.img_norm_cfg.mean, std=cfg.img_norm_cfg.std, to_bgr=False) img = draw_masks(img, masks).astype(np.uint8) draw_bounding_boxes_on_image_array(img, bboxes, use_normalized_coordinates=False, thickness=5) cv2.imwrite(osp.join(args.output, f'{i}_{np.random.randint(0, 10000)}.jpg'), img[..., ::-1]) if __name__ == '__main__': main() ================================================ FILE: src/metric.py ================================================ import numpy as np from functools import partial def precision_at(threshold, iou): matches = iou > threshold true_positives = np.sum(matches, axis=1) == 1 # Correct objects false_positives = np.sum(matches, axis=0) == 0 # Missed objects false_negatives = np.sum(matches, axis=1) == 0 # Extra objects tp, fp, fn = np.sum(true_positives), np.sum(false_positives), np.sum(false_negatives) return tp, fp, fn def calc_iou(y_true, y_prediction): y_true, y_prediction = map(partial(np.expand_dims, axis=0), (y_true, y_prediction)) true_objects = len(np.unique(y_true)) pred_objects = len(np.unique(y_prediction)) # Compute intersection between all objects intersection = np.histogram2d(y_true.flatten(), y_prediction.flatten(), bins=(true_objects, pred_objects))[0] # Compute areas (needed for finding the union between all objects) area_true = np.histogram(y_true, bins=true_objects)[0] area_pred = np.histogram(y_prediction, bins=pred_objects)[0] area_true = np.expand_dims(area_true, -1) area_pred = np.expand_dims(area_pred, 0) # Compute union union = area_true + area_pred - intersection # Exclude background from the analysis intersection = intersection[1:, 1:] union = union[1:, 1:] union[union == 0] = 1e-9 # Compute the intersection over union iou = intersection / union return iou def calc_score_per_class(y_true, y_prediction): iou = calc_iou(y_true, y_prediction) # Loop over IoU thresholds precisions = [] for t in np.arange(0.5, 1.0, 0.05): tp, fp, fn = precision_at(t, iou) p = tp / (tp + fp + fn) precisions.append(p) return np.mean(precisions) ================================================ FILE: src/prune.py ================================================ import argparse import torch def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--weights', type=str) parser.add_argument('--output', type=str) return parser.parse_args() def main(): args = parse_args() weights = torch.load(args.weights) weights['state_dict'] = { k: v for k, v in weights['state_dict'].items() if not k.startswith('bbox_head') and not k.startswith('mask_head') } torch.save(weights, args.output) if __name__ == '__main__': main() ================================================ FILE: src/rle.py ================================================ from itertools import groupby from pycocotools import mask as mutils import numpy as np from tqdm import tqdm def kaggle_rle_encode(mask): pixels = mask.T.flatten() pixels = np.concatenate([[0], pixels, [0]]) rle = np.where(pixels[1:] != pixels[:-1])[0] + 1 rle[1::2] -= rle[::2] return rle.tolist() def kaggle_rle_decode(rle, h, w): starts, lengths = map(np.asarray, (rle[::2], rle[1::2])) starts -= 1 ends = starts + lengths img = np.zeros(h * w, dtype=np.uint8) for lo, hi in zip(starts, ends): img[lo:hi] = 1 return img.reshape((w, h)).T def coco_rle_encode(mask): rle = {'counts': [], 'size': list(mask.shape)} counts = rle.get('counts') for i, (value, elements) in enumerate(groupby(mask.ravel(order='F'))): if i == 0 and value == 1: counts.append(0) counts.append(len(list(elements))) return rle def coco_rle_decode(rle, h, w): return mutils.decode(mutils.frPyObjects(rle, h, w)) def kaggle2coco(kaggle_rle, h, w): if not len(kaggle_rle): return {'counts': [h * w], 'size': [h, w]} roll2 = np.roll(kaggle_rle, 2) roll2[:2] = 1 roll1 = np.roll(kaggle_rle, 1) roll1[:1] = 0 if h * w != kaggle_rle[-1] + kaggle_rle[-2] - 1: shift = 1 end_value = h * w - kaggle_rle[-1] - kaggle_rle[-2] + 1 else: shift = 0 end_value = 0 coco_rle = np.full(len(kaggle_rle) + shift, end_value) coco_rle[:len(coco_rle) - shift] = kaggle_rle.copy() coco_rle[:len(coco_rle) - shift:2] = (kaggle_rle - roll1 - roll2)[::2].copy() return {'counts': coco_rle.tolist(), 'size': [h, w]} def main(): for _ in tqdm(range(100)): h = np.random.randint(1, 1000) w = np.random.randint(1, 1000) mask = np.random.randint(0, 2, h * w).reshape(h, w) kaggle_rle = kaggle_rle_encode(mask) coco_rle = coco_rle_encode(mask) assert coco_rle == kaggle2coco(kaggle_rle, h, w) assert np.all(mask == kaggle_rle_decode(kaggle_rle, h, w)) assert np.all(mask == coco_rle_decode(coco_rle, h, w)) if __name__ == '__main__': main() ================================================ FILE: src/rm_attribute_classes.py ================================================ import argparse import pandas as pd ATTRIBUTE_CLASSES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--submission', type=str) parser.add_argument('--output', type=str) return parser.parse_args() def main(): args = parse_args() submission = pd.read_csv(args.submission) submission_without_attributes = submission[~submission['ClassId'].isin(ATTRIBUTE_CLASSES)].copy() empty_ids = list(set(submission['ImageId']) - set(submission_without_attributes['ImageId'])) submission_empty = pd.DataFrame([empty_ids, [''] * len(empty_ids), ['23'] * len(empty_ids)] ).T.rename(columns={ 0: 'ImageId', 1: 'EncodedPixels', 2: 'ClassId' }) pd.concat([submission_without_attributes, submission_empty], sort=True).to_csv(args.output, index=False) if __name__ == '__main__': main() ================================================ FILE: src/split.py ================================================ import argparse import mmcv import numpy as np import pickle from iterstrat.ml_stratifiers import MultilabelStratifiedKFold def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--annotation', type=str) parser.add_argument('--train_output', type=str) parser.add_argument('--val_output', type=str) parser.add_argument('--n_splits', type=int, default=100) parser.add_argument('--n_jobs', type=int, default=40) parser.add_argument('--n_samples', type=int, default=-1) return parser.parse_args() def main(): args = parse_args() annotation = mmcv.load(args.annotation) all_labels = [x['ann']['labels'] for x in annotation] y = np.zeros((len(all_labels), max([max(x) for x in all_labels]))) for labels in all_labels: y[:, labels - 1] = 1 mskf = MultilabelStratifiedKFold(n_splits=args.n_splits, random_state=777) for train_index, val_index in mskf.split(y, y): train_annotation = [x for i, x in enumerate(annotation) if i in train_index] val_annotation = [x for i, x in enumerate(annotation) if i in val_index] with open(args.train_output, 'wb') as f: pickle.dump(train_annotation, f) with open(args.val_output, 'wb') as f: pickle.dump(val_annotation, f) print(f'train size: {len(train_annotation)}, val size: {len(val_annotation)}') break if __name__ == '__main__': main() ================================================ FILE: src/submit.py ================================================ import argparse import numpy as np import pandas as pd from multiprocessing import Pool import cv2 from tqdm import tqdm import mmcv import pycocotools.mask as mutils from src.rle import kaggle_rle_encode from src.metric import calc_score_per_class from src.utils import create_labeled_mask, check_overlaps, hard_overlaps_suppression def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--annotation', type=str) parser.add_argument('--predictions', type=str) parser.add_argument('--output', type=str) parser.add_argument('--n_jobs', type=int, default=80) parser.add_argument('--add_metric', action='store_true') return parser.parse_args() def decode_and_resize( masks, x_min=None, x_max=None, y_min=None, y_max=None, original_height=None, original_width=None ): binary_mask = mutils.decode(masks) if x_min is not None: crop_height, crop_width, channels = binary_mask.shape assert crop_height == y_max - y_min assert crop_width == x_max - x_min original_mask = np.zeros((original_height, original_width, channels)) original_mask[y_min:y_max, x_min:x_max] = binary_mask binary_mask = original_mask binary_mask = cv2.resize(binary_mask, (512, 512), cv2.INTER_NEAREST) if len(binary_mask.shape) == 2: binary_mask = binary_mask[..., np.newaxis] return binary_mask def create_mask(args): prediction, annotation = args bbox_prediction, mask_prediction = prediction samples = [] metrics = [] for cls, (masks, bboxes) in enumerate(zip(mask_prediction, bbox_prediction)): if masks: prediction_mask = decode_and_resize( masks=masks, x_min=annotation.get('x_min', None), x_max=annotation.get('x_max', None), y_min=annotation.get('y_min', None), y_max=annotation.get('y_max', None), original_height=annotation.get('original_height', None), original_width=annotation.get('original_width', None) ) if not check_overlaps(prediction_mask): prediction_mask = hard_overlaps_suppression(prediction_mask.astype(bool), bboxes[..., -1]) if annotation['ann']['masks'] is not None: indices = np.where(annotation['ann']['labels'] - 1 == cls)[0] if len(indices): true_mask = decode_and_resize([annotation['ann']['masks'][i] for i in indices]) true_labeled_mask = create_labeled_mask(true_mask) prediction_labeled_mask = create_labeled_mask(prediction_mask) metrics.append(calc_score_per_class(true_labeled_mask, prediction_labeled_mask)) else: metrics.append(0) for mask_id in range(prediction_mask.shape[-1]): rle = kaggle_rle_encode(prediction_mask[..., mask_id]) samples.append( { 'ImageId': annotation['filename'], 'EncodedPixels': ' '.join(map(str, rle)), 'ClassId': str(cls) } ) elif annotation['ann']['masks'] is not None and np.any(annotation['ann']['labels'] - 1 == cls): metrics.append(0) if not len(samples): samples.append({'ImageId': annotation['filename'], 'EncodedPixels': '', 'ClassId': '23'}) if not len(metrics): metrics.append(1) return {'samples': samples, 'metric': np.mean(metrics)} def main(): args = parse_args() predictions = mmcv.load(args.predictions) annotation = mmcv.load(args.annotation) print(f'predictions: {args.predictions}') print(f'output: {args.output}') with Pool(args.n_jobs) as p: results = list( tqdm(iterable=p.imap_unordered(create_mask, zip(predictions, annotation)), total=len(predictions)) ) samples = sum([x['samples'] for x in results], []) metrics = [x['metric'] for x in results] submission = pd.DataFrame(samples) if args.add_metric: submission['mAP'] = sum([[x['metric']] * len(x['samples']) for x in results], []) submission.to_csv(args.output, index=False) print(f'Mask mAP {np.mean(metrics)}') if __name__ == '__main__': main() ================================================ FILE: src/utils.py ================================================ import numpy as np from pycocotools import mask as mutils from src.rle import kaggle2coco def group2mmdetection(group: dict) -> dict: image_id, group = group assert group['Width'].max() == group['Width'].min() assert group['Height'].max() == group['Height'].min() height, width = group['Height'].max(), group['Width'].max() rles = group['EncodedPixels'].apply(lambda x: kaggle2coco(list(map(int, x.split())), height, width)).tolist() rles = mutils.frPyObjects(rles, height, width) masks = mutils.decode(rles) bboxes = mutils.toBbox(mutils.encode(np.asfortranarray(masks.astype(np.uint8)))) bboxes[:, 2] += bboxes[:, 0] bboxes[:, 3] += bboxes[:, 1] return { 'filename': image_id, 'width': width, 'height': height, 'ann': { 'bboxes': np.array(bboxes, dtype=np.float32), 'original_labels': group['ClassId'].values, 'labels': group['ClassId'].apply(lambda x: x.split('_')[0]).values.astype(np.int) + 1, 'masks': rles } } def create_labeled_mask(mask): return (np.arange(1, mask.shape[-1] + 1)[None, None, :] * mask).sum(-1) def check_overlaps(mask): overlap_mask = mask.sum(axis=-1) return np.array_equal(overlap_mask, overlap_mask.astype(bool)) def hard_overlaps_suppression(binary_mask, scores): not_overlap_mask = [] for i in np.argsort(scores)[::-1]: current_mask = binary_mask[..., i].copy() for mask in not_overlap_mask: current_mask = np.bitwise_and(current_mask, np.invert(mask)) not_overlap_mask.append(current_mask) return np.stack(not_overlap_mask, -1) ================================================ FILE: src/visualization.py ================================================ import numpy as np import PIL.Image as Image import PIL.ImageDraw as ImageDraw import PIL.ImageFont as ImageFont INDIGO = (75, 0, 130) def draw_bounding_box_on_image( image, x_min, y_min, x_max, y_max, color, thickness=4, display_str_list=(), use_normalized_coordinates=True, fontsize=20 ): draw = ImageDraw.Draw(image) im_width, im_height = image.size if use_normalized_coordinates: (left, right, top, bottom) = (x_min * im_width, x_max * im_width, y_min * im_height, y_max * im_height) else: (left, right, top, bottom) = (x_min, x_max, y_min, y_max) draw.line([(left, top), (left, bottom), (right, bottom), (right, top), (left, top)], width=thickness, fill=color) try: font = ImageFont.truetype('DejaVuSansMono.ttf', fontsize) except IOError: font = ImageFont.load_default() # If the total height of the display strings added to the top of the bounding # box exceeds the top of the image, stack the strings below the bounding box # instead of above. display_str_heights = [font.getsize(ds)[1] for ds in display_str_list] # Each display_str has a top and bottom margin of 0.05x. total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights) if top > total_display_str_height: text_bottom = top else: text_bottom = bottom + total_display_str_height # Reverse list and print from bottom to top. for display_str in display_str_list[::-1]: text_width, text_height = font.getsize(display_str) margin = np.ceil(0.05 * text_height) draw.rectangle([(left, text_bottom - text_height - 2 * margin), (left + text_width, text_bottom)], fill=color) draw.text((left + margin, text_bottom - text_height - margin), display_str, fill=color, font=font) text_bottom -= text_height - 2 * margin def draw_bounding_boxes_on_image_array( image, bboxes, color=INDIGO, thickness=4, use_normalized_coordinates=True, fontsize=20 ): image_pil = Image.fromarray(image) draw_bounding_boxes_on_image(image_pil, bboxes, color, thickness, use_normalized_coordinates, fontsize) np.copyto(image, np.array(image_pil)) def draw_bounding_boxes_on_image( image, bboxes, color=INDIGO, thickness=4, use_normalized_coordinates=True, fontsize=20 ): for bbox in bboxes: draw_bounding_box_on_image( image, bbox[0], bbox[1], bbox[2], bbox[3], color, thickness, (), use_normalized_coordinates, fontsize )