Repository: chanyn/Reasoning-RCNN
Branch: master
Commit: 9bd7c7ab0ffd
Files: 179
Total size: 138.1 MB
Directory structure:
gitextract_155bakyx/
├── .gitignore
├── .travis.yml
├── INSTALL.md
├── LICENSE
├── MODEL_ZOO.md
├── README.md
├── TECHNICAL_DETAILS.md
├── compile.sh
├── configs/
│ ├── ade_faster_rcnn_r101_fpn_1x.py
│ ├── coco_faster_rcnn_r101_fpn_1x.py
│ ├── coco_sgrb_fpn_ms.py
│ ├── hkrm/
│ │ ├── ade_faster_rcnn_r50_fpn_1x.py
│ │ ├── coco_faster_rcnn_r101_fpn_1x.py
│ │ └── vg_faster_rcnn_r101_fpn_1x.py
│ ├── pascal_voc/
│ │ ├── faster_rcnn_r50_fpn_1x_voc0712.py
│ │ ├── ssd300_voc.py
│ │ └── ssd512_voc.py
│ ├── rrcnn/
│ │ ├── ade_reasoning_rcnn_r101_fpn_1x.py
│ │ ├── coco_reasoning_rcnn_r101_fpn_1x.py
│ │ └── vg_reasoning_rcnn_r101_fpn_1x.py
│ ├── vg_faster_rcnn_r101_fpn_1x.py
│ └── vgbig_faster_rcnn_r101_fpn_1x.py
├── mmdet/
│ ├── __init__.py
│ ├── apis/
│ │ ├── __init__.py
│ │ ├── env.py
│ │ ├── inference.py
│ │ └── train.py
│ ├── core/
│ │ ├── __init__.py
│ │ ├── anchor/
│ │ │ ├── __init__.py
│ │ │ ├── anchor_generator.py
│ │ │ └── anchor_target.py
│ │ ├── bbox/
│ │ │ ├── __init__.py
│ │ │ ├── assign_sampling.py
│ │ │ ├── assigners/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── assign_result.py
│ │ │ │ ├── base_assigner.py
│ │ │ │ └── max_iou_assigner.py
│ │ │ ├── bbox_target.py
│ │ │ ├── geometry.py
│ │ │ ├── samplers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base_sampler.py
│ │ │ │ ├── combined_sampler.py
│ │ │ │ ├── instance_balanced_pos_sampler.py
│ │ │ │ ├── iou_balanced_neg_sampler.py
│ │ │ │ ├── ohem_sampler.py
│ │ │ │ ├── pseudo_sampler.py
│ │ │ │ ├── random_sampler.py
│ │ │ │ ├── random_sampler_fixnum.py
│ │ │ │ └── sampling_result.py
│ │ │ └── transforms.py
│ │ ├── evaluation/
│ │ │ ├── __init__.py
│ │ │ ├── bbox_overlaps.py
│ │ │ ├── class_names.py
│ │ │ ├── coco_utils.py
│ │ │ ├── eval_hooks.py
│ │ │ ├── mean_ap.py
│ │ │ └── recall.py
│ │ ├── loss/
│ │ │ ├── __init__.py
│ │ │ └── losses.py
│ │ ├── mask/
│ │ │ ├── __init__.py
│ │ │ ├── mask_target.py
│ │ │ └── utils.py
│ │ ├── post_processing/
│ │ │ ├── __init__.py
│ │ │ ├── bbox_nms.py
│ │ │ └── merge_augs.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── dist_utils.py
│ │ └── misc.py
│ ├── datasets/
│ │ ├── __init__.py
│ │ ├── coco.py
│ │ ├── concat_dataset.py
│ │ ├── custom.py
│ │ ├── extra_aug.py
│ │ ├── loader/
│ │ │ ├── __init__.py
│ │ │ ├── build_loader.py
│ │ │ └── sampler.py
│ │ ├── repeat_dataset.py
│ │ ├── transforms.py
│ │ ├── utils.py
│ │ ├── voc.py
│ │ └── xml_style.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── anchor_heads/
│ │ │ ├── __init__.py
│ │ │ ├── anchor_head.py
│ │ │ ├── retina_head.py
│ │ │ ├── rpn_head.py
│ │ │ └── ssd_head.py
│ │ ├── backbones/
│ │ │ ├── __init__.py
│ │ │ ├── resnet.py
│ │ │ ├── resnext.py
│ │ │ └── ssd_vgg.py
│ │ ├── bbox_heads/
│ │ │ ├── __init__.py
│ │ │ ├── bbox_head.py
│ │ │ ├── convfc_bbox_head.py
│ │ │ ├── convfc_bbox_head_enhanced.py
│ │ │ └── graph_bbox_head.py
│ │ ├── builder.py
│ │ ├── detectors/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── cascade_rcnn.py
│ │ │ ├── fast_rcnn.py
│ │ │ ├── faster_rcnn.py
│ │ │ ├── hkrm_rcnn.py
│ │ │ ├── mask_rcnn.py
│ │ │ ├── reasoning_rcnn.py
│ │ │ ├── retinanet.py
│ │ │ ├── rpn.py
│ │ │ ├── sgrn.py
│ │ │ ├── single_stage.py
│ │ │ ├── test_mixins.py
│ │ │ └── two_stage.py
│ │ ├── mask_heads/
│ │ │ ├── __init__.py
│ │ │ └── fcn_mask_head.py
│ │ ├── necks/
│ │ │ ├── __init__.py
│ │ │ └── fpn.py
│ │ ├── registry.py
│ │ ├── roi_extractors/
│ │ │ ├── __init__.py
│ │ │ └── single_level.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── conv_module.py
│ │ ├── norm.py
│ │ └── weight_init.py
│ └── ops/
│ ├── __init__.py
│ ├── dcn/
│ │ ├── __init__.py
│ │ ├── functions/
│ │ │ ├── __init__.py
│ │ │ ├── deform_conv.py
│ │ │ └── deform_pool.py
│ │ ├── modules/
│ │ │ ├── __init__.py
│ │ │ ├── deform_conv.py
│ │ │ └── deform_pool.py
│ │ ├── setup.py
│ │ └── src/
│ │ ├── deform_conv_cuda.cpp
│ │ ├── deform_conv_cuda_kernel.cu
│ │ ├── deform_pool_cuda.cpp
│ │ └── deform_pool_cuda_kernel.cu
│ ├── nms/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── __init__.py
│ │ ├── cpu_nms.pyx
│ │ ├── cpu_soft_nms.pyx
│ │ ├── gpu_nms.hpp
│ │ ├── gpu_nms.pyx
│ │ ├── nms_kernel.cu
│ │ ├── nms_wrapper.py
│ │ └── setup.py
│ ├── roi_align/
│ │ ├── __init__.py
│ │ ├── functions/
│ │ │ ├── __init__.py
│ │ │ └── roi_align.py
│ │ ├── gradcheck.py
│ │ ├── modules/
│ │ │ ├── __init__.py
│ │ │ └── roi_align.py
│ │ ├── setup.py
│ │ └── src/
│ │ ├── roi_align_cuda.cpp
│ │ └── roi_align_kernel.cu
│ └── roi_pool/
│ ├── __init__.py
│ ├── functions/
│ │ ├── __init__.py
│ │ └── roi_pool.py
│ ├── gradcheck.py
│ ├── modules/
│ │ ├── __init__.py
│ │ └── roi_pool.py
│ ├── setup.py
│ └── src/
│ ├── roi_pool_cuda.cpp
│ └── roi_pool_kernel.cu
├── setup.py
└── tools/
├── coco_eval.py
├── convert_datasets/
│ └── pascal_voc.py
├── dist_train.sh
├── graph/
│ ├── new_COCO_graph_a.pkl
│ ├── new_COCO_graph_r.pkl
│ ├── new_ade_graph_a.pkl
│ ├── new_ade_graph_r.pkl
│ ├── new_vg_big_graph_a.pkl
│ ├── new_vg_big_graph_r.pkl
│ ├── new_vg_graph_a.pkl
│ └── new_vg_graph_r.pkl
├── test.py
├── train.py
├── vis_subgraph.py
└── voc_eval.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
tools/work_dirs/
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# cython generated cpp
mmdet/ops/nms/*.cpp
mmdet/version.py
data
.vscode
.idea
================================================
FILE: .travis.yml
================================================
dist: trusty
language: python
install:
- pip install flake8
python:
- "3.5"
- "3.6"
script:
- flake8
================================================
FILE: INSTALL.md
================================================
## Installation
### Requirements
- Linux (tested on Ubuntu 16.04 and CentOS 7.2)
- Python 3.4+
- PyTorch 0.4.1
- Cython
- [mmcv](https://github.com/open-mmlab/mmcv)
### Install mmdetection
a. Install PyTorch 0.4.1 and torchvision following the [official instructions](https://pytorch.org/).
b. Clone the mmdetection repository.
```shell
git clone https://github.com/open-mmlab/mmdetection.git
```
c. Compile cuda extensions.
```shell
cd mmdetection
pip install cython # or "conda install cython" if you prefer conda
./compile.sh # or "PYTHON=python3 ./compile.sh" if you use system python3 without virtual environments
```
d. Install mmdetection (other dependencies will be installed automatically).
```shell
python(3) setup.py install # add --user if you want to install it locally
# or "pip install ."
```
Note: You need to run the last step each time you pull updates from github.
The git commit id will be written to the version number and also saved in trained models.
### Prepare COCO dataset.
It is recommended to symlink the dataset root to `$MMDETECTION/data`.
```
mmdetection
├── mmdet
├── tools
├── configs
├── data
│ ├── coco
│ │ ├── annotations
│ │ ├── train2017
│ │ ├── val2017
│ │ ├── test2017
│ ├── VOCdevkit
│ │ ├── VOC2007
│ │ ├── VOC2012
```
### Scripts
Just for reference, [Here](https://gist.github.com/hellock/bf23cd7348c727d69d48682cb6909047) is
a script for setting up mmdetection with conda.
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MODEL_ZOO.md
================================================
# Benchmark and Model Zoo
## Environment
### Hardware
- 8 NVIDIA Tesla V100 GPUs
- Intel Xeon 4114 CPU @ 2.20GHz
### Software environment
- Python 3.6 / 3.7
- PyTorch 0.4.1
- CUDA 9.0.176
- CUDNN 7.0.4
- NCCL 2.1.15
## Common settings
- All baselines were trained using 8 GPU with a batch size of 16 (2 images per GPU).
- All models were trained on `coco_2017_train`, and tested on the `coco_2017_val`.
- We use distributed training and BN layer stats are fixed.
- We adopt the same training schedules as Detectron. 1x indicates 12 epochs and 2x indicates 24 epochs, which corresponds to slightly less iterations than Detectron and the difference can be ignored.
- All pytorch-style pretrained backbones on ImageNet are from PyTorch model zoo.
- We report the training GPU memory as the maximum value of `torch.cuda.max_memory_cached()`
for all 8 GPUs. Note that this value is usually less than what `nvidia-smi` shows, but
closer to the actual requirements.
- We report the inference time as the overall time including data loading,
network forwarding and post processing.
- The training memory and time of 2x schedule is simply copied from 1x.
It should be very close to the actual memory and time.
## Baselines
We released RPN, Faster R-CNN and Mask R-CNN models in the first version. More models with different backbones will be added to the model zoo.
### RPN
| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | AR1000 | Download |
|:--------:|:-------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:--------:|
| R-50-FPN | caffe | 1x | 4.5 | 0.379 | 14.4 | 58.2 | - |
| R-50-FPN | pytorch | 1x | 4.8 | 0.407 | 14.5 | 57.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_fpn_1x_20181010-4a9c0712.pth) |
| R-50-FPN | pytorch | 2x | 4.8 | 0.407 | 14.5 | 57.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_fpn_2x_20181010-88a4a471.pth) |
| R-101-FPN | caffe | 1x | 7.4 | 0.513 | 11.1 | 59.4 | - |
| R-101-FPN | pytorch | 1x | 8.0 | 0.552 | 11.1 | 58.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r101_fpn_1x_20181129-f50da4bd.pth) |
| R-101-FPN | pytorch | 2x | 8.0 | 0.552 | 11.1 | 59.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r101_fpn_2x_20181129-e42c6c9a.pth) |
| X-101-32x4d-FPN | pytorch |1x | 9.9 | 0.691 | 8.3 | 59.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_32x4d_fpn_1x_20181218-7e379d26.pth)
| X-101-32x4d-FPN | pytorch |2x | 9.9 | 0.691 | 8.3 | 59.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_32x4d_fpn_2x_20181218-0510af40.pth)
| X-101-64x4d-FPN | pytorch |1x | 14.6 | 1.032 | 6.2 | 59.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_64x4d_fpn_1x_20181218-c1a24f1f.pth)
| X-101-64x4d-FPN | pytorch |2x | 14.6 | 1.032 | 6.2 | 60.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_64x4d_fpn_2x_20181218-c22bdd70.pth)
### Faster R-CNN
| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download |
|:--------:|:-------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:--------:|
| R-50-FPN | caffe | 1x | 4.9 | 0.525 | 10.0 | 36.7 | - |
| R-50-FPN | pytorch | 1x | 5.1 | 0.554 | 9.9 | 36.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth) |
| R-50-FPN | pytorch | 2x | 5.1 | 0.554 | 9.9 | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_fpn_2x_20181010-443129e1.pth) |
| R-101-FPN | caffe | 1x | 7.4 | 0.663 | 8.4 | 38.8 | - |
| R-101-FPN | pytorch | 1x | 8.0 | 0.698 | 8.3 | 38.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r101_fpn_1x_20181129-d1468807.pth) |
| R-101-FPN | pytorch | 2x | 8.0 | 0.698 | 8.3 | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r101_fpn_2x_20181129-73e7ade7.pth) |
| X-101-32x4d-FPN | pytorch | 1x| 9.9 | 0.842 | 7.0 | 40.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_32x4d_fpn_1x_20181218-ad81c133.pth)
| X-101-32x4d-FPN | pytorch | 2x| 9.9 | 0.842 | 7.0 | 40.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_32x4d_fpn_2x_20181218-0ed58946.pth)
| X-101-64x4d-FPN | pytorch | 1x| 14.1 | 1.181 | 5.2 | 41.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_64x4d_fpn_1x_20181218-c9c69c8f.pth)
| X-101-64x4d-FPN | pytorch | 2x| 14.1 | 1.181 | 5.2 | 40.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_64x4d_fpn_2x_20181218-fe94f9b8.pth)
### Mask R-CNN
| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download |
|:--------:|:-------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:-------:|:--------:|
| R-50-FPN | caffe | 1x | 5.9 | 0.658 | 7.7 | 37.5 | 34.4 | - |
| R-50-FPN | pytorch | 1x | 5.8 | 0.690 | 7.7 | 37.3 | 34.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth) |
| R-50-FPN | pytorch | 2x | 5.8 | 0.690 | 7.7 | 38.6 | 35.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_2x_20181010-41d35c05.pth) |
| R-101-FPN | caffe | 1x | 8.8 | 0.791 | 7.0 | 39.9 | 36.1 | - |
| R-101-FPN | pytorch | 1x | 9.1 | 0.825 | 6.7 | 39.4 | 35.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r101_fpn_1x_20181129-34ad1961.pth) |
| R-101-FPN | pytorch | 2x | 9.1 | 0.825 | 6.7 | 40.4 | 36.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r101_fpn_2x_20181129-a254bdfc.pth) |
| X-101-32x4d-FPN | pytorch | 1x| 10.9 | 0.972 | 5.8 | 41.2 | 37.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_32x4d_fpn_1x_20181218-44e635cc.pth)
| X-101-64x4d-FPN | pytorch | 2x| 10.9 | 0.972 | 5.8 | 41.4 | 37.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_32x4d_fpn_2x_20181218-f023dffa.pth)
| X-101-32x4d-FPN | pytorch | 1x| 14.1 | 1.302 | 4.7 | 42.2 | 38.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_64x4d_fpn_1x_20181218-cb159987.pth)
| X-101-64x4d-FPN | pytorch | 2x| 14.1 | 1.302 | 4.7 | 42.0 | 37.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_64x4d_fpn_2x_20181218-ea936e44.pth)
### Fast R-CNN (with pre-computed proposals)
| Backbone | Style | Type | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download |
|:--------:|:-------:|:------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:-------:|:--------:|
| R-50-FPN | caffe | Faster | 1x | 3.5 | 0.348 | 14.6 | 36.6 | - | - |
| R-50-FPN | pytorch | Faster | 1x | 4.0 | 0.375 | 14.5 | 35.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_fpn_1x_20181010-08160859.pth) |
| R-50-FPN | pytorch | Faster | 2x | 4.0 | 0.375 | 14.5 | 37.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_fpn_2x_20181010-d263ada5.pth) |
| R-101-FPN| caffe | Faster | 1x | 7.1 | 0.484 | 11.9 | 38.4 | - | - |
| R-101-FPN| pytorch | Faster | 1x | 7.6 | 0.540 | 11.8 | 38.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r101_fpn_1x_20181129-ffaa2eb0.pth) |
| R-101-FPN| pytorch | Faster | 2x | 7.6 | 0.540 | 11.8 | 38.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r101_fpn_2x_20181129-9dba92ce.pth) |
| R-50-FPN | caffe | Mask | 1x | 5.4 | 0.473 | 10.7 | 37.3 | 34.5 | - |
| R-50-FPN | pytorch | Mask | 1x | 5.3 | 0.504 | 10.6 | 36.8 | 34.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_fpn_1x_20181010-e030a38f.pth) |
| R-50-FPN | pytorch | Mask | 2x | 5.3 | 0.504 | 10.6 | 37.9 | 34.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_fpn_2x_20181010-5048cb03.pth) |
| R-101-FPN| caffe | Mask | 1x | 8.6 | 0.607 | 9.5 | 39.4 | 36.1 | - |
| R-101-FPN| pytorch | Mask | 1x | 9.0 | 0.656 | 9.3 | 38.9 | 35.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r101_fpn_1x_20181129-2273fa9b.pth) |
| R-101-FPN| pytorch | Mask | 2x | 9.0 | 0.656 | 9.3 | 39.9 | 36.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r101_fpn_2x_20181129-bf63ec5e.pth) |
### RetinaNet
| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download |
|:--------:|:-------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:--------:|
| R-50-FPN | caffe | 1x | 6.7 | 0.468 | 9.4 | 35.8 | - |
| R-50-FPN | pytorch | 1x | 6.9 | 0.496 | 9.1 | 35.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r50_fpn_1x_20181125-3d3c2142.pth) |
| R-50-FPN | pytorch | 2x | 6.9 | 0.496 | 9.1 | 36.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r50_fpn_2x_20181125-e0dbec97.pth) |
| R-101-FPN | caffe | 1x | 9.2 | 0.614 | 8.2 | 37.8 | - |
| R-101-FPN | pytorch | 1x | 9.6 | 0.643 | 8.1 | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r101_fpn_1x_20181129-f738a02f.pth) |
| R-101-FPN | pytorch | 2x | 9.6 | 0.643 | 8.1 | 38.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r101_fpn_2x_20181129-f654534b.pth) |
| X-101-32x4d-FPN | pytorch | 1x| 10.8 | 0.792 | 6.7 | 38.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_32x4d_fpn_1x_20181218-c140fb82.pth)
| X-101-32x4d-FPN | pytorch | 2x| 10.8 | 0.792 | 6.7 | 39.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_32x4d_fpn_2x_20181218-605dcd0a.pth)
| X-101-64x4d-FPN | pytorch | 1x| 14.6 | 1.128 | 5.3 | 40.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_64x4d_fpn_1x_20181218-2f6f778b.pth)
| X-101-64x4d-FPN | pytorch | 2x| 14.6 | 1.128 | 5.3 | 39.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_64x4d_fpn_2x_20181218-2f598dc5.pth)
### Cascade R-CNN
| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download |
|:--------:|:-------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:--------:|
| R-50-FPN | caffe | 1x | 5.0 | 0.592 | 8.1 | 40.3 | - |
| R-50-FPN | pytorch | 1x | 5.5 | 0.622 | 8.0 | 40.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r50_fpn_1x_20181123-b1987c4a.pth) |
| R-50-FPN | pytorch | 20e | 5.5 | 0.622 | 8.0 | 41.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r50_fpn_20e_20181123-db483a09.pth) |
| R-101-FPN | caffe | 1x | 8.5 | 0.731 | 7.0 | 42.2 | - |
| R-101-FPN | pytorch | 1x | 8.7 | 0.766 | 6.9 | 42.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r101_fpn_1x_20181129-d64ebac7.pth) |
| R-101-FPN | pytorch | 20e | 8.7 | 0.766 | 6.9 | 42.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r101_fpn_20e_20181129-b46dcede.pth) |
| X-101-32x4d-FPN | pytorch | 1x| 10.6 | 0.902 | 5.7 | 43.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_32x4d_fpn_1x_20181218-941c0925.pth)
| X-101-32x4d-FPN | pytorch |20e| 10.6 | 0.902 | 5.7 | 44.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_32x4d_fpn_2x_20181218-28f73c4c.pth)
| X-101-64x4d-FPN | pytorch | 1x| 14.1 | 1.251 | 4.6 | 44.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_64x4d_fpn_1x_20181218-e2dc376a.pth)
| X-101-64x4d-FPN | pytorch |20e| 14.1 | 1.251 | 4.6 | 44.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_64x4d_fpn_2x_20181218-5add321e.pth)
### Cascade Mask R-CNN
| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download |
|:--------:|:-------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:-------:|:--------:|
| R-50-FPN | caffe | 1x | 7.5 | 0.880 | 5.8 | 41.0 | 35.6 | - |
| R-50-FPN | pytorch | 1x | 7.6 | 0.910 | 5.7 | 41.3 | 35.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_fpn_1x_20181123-88b170c9.pth) |
| R-50-FPN | pytorch | 20e | 7.6 | 0.910 | 5.7 | 42.4 | 36.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_fpn_20e_20181123-6e0c9713.pth) |
| R-101-FPN | caffe | 1x | 10.5 | 1.024 | 5.3 | 43.1 | 37.3 | - |
| R-101-FPN | pytorch | 1x | 10.9 | 1.055 | 5.2 | 42.7 | 37.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r101_fpn_1x_20181129-64f00602.pth) |
| R-101-FPN | pytorch | 20e | 10.9 | 1.055 | 5.2 | 43.4 | 37.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r101_fpn_20e_20181129-cb85151d.pth) |
| X-101-32x4d-FPN | pytorch | 1x| 12.67 | 1.181 | 4.2 | 44.4 | 38.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_32x4d_fpn_1x_20181218-1d944c89.pth)
| X-101-32x4d-FPN | pytorch |20e| 12.67 | 1.181 | 4.2 | 44.9 | 38.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_32x4d_fpn_20e_20181218-761a3473.pth)
| X-101-64x4d-FPN | pytorch | 1x| 10.87 | 1.125 | 3.6 | 45.5 | 39.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_64x4d_fpn_1x_20181218-85953a91.pth)
| X-101-64x4d-FPN | pytorch |20e| 10.87 | 1.125 | 3.6 | 45.8 | 39.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_64x4d_fpn_20e_20181218-630773a7.pth)
**Notes:**
- The `20e` schedule in Cascade (Mask) R-CNN indicates decreasing the lr at 16 and 19 epochs, with a total of 20 epochs.
- Cascade Mask R-CNN with X-101-64x4d-FPN was trained using 16 GPU with a batch size of 16 (1 images per GPU).
### SSD
| Backbone | Size | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download |
|:--------:|:----:|:------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:--------:|
| VGG16 | 300 | caffe | 120e | 3.5 | 0.286 | 22.9 / 29.2 | 25.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd300_coco_vgg16_caffe_120e_20181221-84d7110b.pth) |
| VGG16 | 512 | caffe | 120e | 6.3 | 0.458 | 17.3 / 21.2 | 29.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd512_coco_vgg16_caffe_120e_20181221-d48b0be8.pth) |
### SSD (PASCAL VOC)
| Backbone | Size | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download |
|:--------:|:----:|:------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:--------:|
| VGG16 | 300 | caffe | 240e | 1.2 | 0.189 | 40.1 / 58.0 | 77.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd300_voc_vgg16_caffe_240e_20181221-2f05dd40.pth) |
| VGG16 | 512 | caffe | 240e | 2.9 | 0.261 | 28.1 / 36.2 | 80.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd512_voc_vgg16_caffe_240e_20181221-7652ee18.pth) |
**Notes:**
- `cudnn.benchmark` is set as `True` for SSD training and testing.
- Inference time is reported for batch size = 1 and batch size = 8.
- The speed difference between VOC and COCO is caused by model parameters and nms.
### Group Normalization (GN)
| Backbone | model | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download |
|:-------------:|:----------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:-------:|:--------:|
| R-50-FPN (d) | Mask R-CNN | 2x | 7.2 | 0.806 | 5.4 | 39.9 | 36.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_gn_2x_20180113-86832cf2.pth) |
| R-50-FPN (d) | Mask R-CNN | 3x | 7.2 | 0.806 | 5.4 | 40.2 | 36.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_gn_3x_20180113-8e82f48d.pth) |
| R-101-FPN (d) | Mask R-CNN | 2x | 9.9 | 0.970 | 4.8 | 41.6 | 37.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r101_fpn_gn_2x_20180113-9598649c.pth) |
| R-101-FPN (d) | Mask R-CNN | 3x | 9.9 | 0.970 | 4.8 | 41.7 | 37.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r101_fpn_gn_3x_20180113-a14ffb96.pth) |
| R-50-FPN (c) | Mask R-CNN | 2x | 7.2 | 0.806 | 5.4 | 39.7 | 35.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_gn_contrib_2x_20180113-ec93305c.pth) |
| R-50-FPN (c) | Mask R-CNN | 3x | 7.2 | 0.806 | 5.4 | 40.1 | 36.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_gn_contrib_3x_20180113-9d230cab.pth) |
**Notes:**
- (d) means pretrained model converted from Detectron, and (c) means the contributed model pretrained by [@thangvubk](https://github.com/thangvubk).
- The `3x` schedule is epoch [28, 34, 36].
- The memory is measured with `torch.cuda.max_memory_allocated()` instead of `torch.cuda.max_memory_cached()`. We will update the memory usage of other models in the future.
### Deformable Convolution v2
| Backbone | Model | Style | Conv | Pool | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download |
|:---------:|:------------:|:-------:|:-------------:|:------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:-------:|:--------:|
| R-50-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 3.9 | 0.594 | 10.2 | 40.0 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_dconv_c3-c5_r50_fpn_1x_20190125-e41688c9.pth) |
| R-50-FPN | Faster | pytorch | mdconv(c3-c5) | - | 1x | 3.7 | 0.598 | 10.0 | 40.3 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_mdconv_c3-c5_r50_fpn_1x_20190125-1b768045.pth) |
| R-50-FPN | Faster | pytorch | - | dpool | 1x | 4.6 | 0.714 | 8.7 | 37.9 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_dpool_r50_fpn_1x_20190125-f4fc1d70.pth) |
| R-50-FPN | Faster | pytorch | - | mdpool | 1x | 5.2 | 0.769 | 8.2 | 38.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_mdpool_r50_fpn_1x_20190125-473d0f3d.pth) |
| R-101-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 5.8 | 0.811 | 8.0 | 42.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_dconv_c3-c5_r101_fpn_1x_20190125-a7e31b65.pth) |
| X-101-32x4d-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 7.1 | 1.126 | 6.6 | 43.5 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/faster_rcnn_dconv_c3-c5_x101_32x4d_fpn_1x_20190201-6d46376f.pth) |
| R-50-FPN | Mask | pytorch | dconv(c3-c5) | - | 1x | 4.5 | 0.712 | 7.7 | 41.1 | 37.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/mask_rcnn_dconv_c3-c5_r50_fpn_1x_20190125-4f94ff79.pth) |
| R-50-FPN | Mask | pytorch | mdconv(c3-c5) | - | 1x | 4.5 | 0.712 | 7.7 | 41.4 | 37.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/mask_rcnn_mdconv_c3-c5_r50_fpn_1x_20190125-c5601dc3.pth) |
| R-101-FPN | Mask | pytorch | dconv(c3-c5) | - | 1x | 6.4 | 0.939 | 6.5 | 43.2 | 38.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/mask_rcnn_dconv_c3-c5_r101_fpn_1x_20190125-decb6db5.pth) |
| R-50-FPN | Cascade | pytorch | dconv(c3-c5) | - | 1x | 4.4 | 0.660 | 7.6 | 44.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/cascade_rcnn_dconv_c3-c5_r50_fpn_1x_20190125-dfa53166.pth) |
| R-101-FPN | Cascade | pytorch | dconv(c3-c5) | - | 1x | 6.3 | 0.881 | 6.8 | 45.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/cascade_rcnn_dconv_c3-c5_r101_fpn_1x_20190125-aaa877cc.pth) |
| R-50-FPN | Cascade Mask | pytorch | dconv(c3-c5) | - | 1x | 6.6 | 0.942 | 5.7 | 44.5 | 38.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/cascade_mask_rcnn_dconv_c3-c5_r50_fpn_1x_20190125-09d8a443.pth) |
| R-101-FPN | Cascade Mask | pytorch | dconv(c3-c5) | - | 1x | 8.5 | 1.156 | 5.1 | 45.8 | 39.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/dcn/cascade_mask_rcnn_dconv_c3-c5_r101_fpn_1x_20190125-0d62c190.pth) |
**Notes:**
- `dconv` and `mdconv` denote (modulated) deformable convolution, `c3-c5` means adding dconv in resnet stage 3 to 5. `dpool` and `mdpool` denote (modulated) deformable roi pooling.
- The memory is measured with `torch.cuda.max_memory_allocated()`. The batch size is 16 (2 images per GPU).
- The dcn ops are modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch, which should be more memory efficient and slightly faster.
## Comparison with Detectron
We compare mmdetection with [Detectron](https://github.com/facebookresearch/Detectron)
and [Detectron.pytorch](https://github.com/roytseng-tw/Detectron.pytorch),
a third-party port of Detectron to Pytorch. The backbone used is R-50-FPN.
In general, mmdetection has 3 advantages over Detectron.
- **Higher performance** (especially in terms of mask AP)
- **Faster training speed**
- **Memory efficient**
### Performance
Detectron and Detectron.pytorch use caffe-style ResNet as the backbone.
In order to utilize the PyTorch model zoo, we use pytorch-style ResNet in our experiments.
In the meanwhile, we train models with caffe-style ResNet in 1x experiments for comparison.
We find that pytorch-style ResNet usually converges slower than caffe-style ResNet,
thus leading to slightly lower results in 1x schedule, but the final results
of 2x schedule is higher.
We report results using both caffe-style (weights converted from
[here](https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md#imagenet-pretrained-models))
and pytorch-style (weights from the official model zoo) ResNet backbone,
indicated as *pytorch-style results* / *caffe-style results*.
| Type |
Lr schd |
Detectron |
Detectron.pytorch |
mmdetection |
| RPN |
1x |
57.2 |
- |
57.1 / 58.2 |
| 2x |
- |
- |
57.6 / - |
| Faster R-CNN |
1x |
36.7 |
37.1 |
36.4 / 36.7 |
| 2x |
37.9 |
- |
37.7 / - |
| Mask R-CNN |
1x |
37.7 & 33.9 |
37.7 & 33.7 |
37.3 & 34.2 / 37.5 & 34.4 |
| 2x |
38.6 & 34.5 |
- |
38.6 & 35.1 / - |
| Fast R-CNN |
1x |
36.4 |
- |
35.8 / 36.6 |
| 2x |
36.8 |
- |
37.1 / - |
| Fast R-CNN (w/mask) |
1x |
37.3 & 33.7 |
- |
36.8 & 34.1 / 37.3 & 34.5 |
| 2x |
37.7 & 34.0 |
- |
37.9 & 34.8 / - |
### Training Speed
The training speed is measure with s/iter. The lower, the better.
| Type |
Detectron (P1001) |
Detectron.pytorch (XP2) |
mmdetection3 (V1004 / XP) |
| RPN |
0.416 |
- |
0.407 / 0.413 |
| Faster R-CNN |
0.544 |
1.015 |
0.554 / 0.579 |
| Mask R-CNN |
0.889 |
1.435 |
0.690 / 0.732 |
| Fast R-CNN |
0.285 |
- |
0.375 / 0.398 |
| Fast R-CNN (w/mask) |
0.377 |
- |
0.504 / 0.574 |
\*1. Detectron reports the speed on Facebook's Big Basin servers (P100),
on our V100 servers it is slower so we use the official reported values.
\*2. Detectron.pytorch does not report the runtime and we encountered some issue to
run it on V100, so we report the speed on TITAN XP.
\*3. The speed of pytorch-style ResNet is approximately 5% slower than caffe-style,
and we report the pytorch-style results here.
\*4. We also run the models on a DGX-1 server (P100) and the speed is almost the same as our V100 servers.
### Inference Speed
The inference speed is measured with fps (img/s) on a single GPU. The higher, the better.
| Type |
Detectron (P100) |
Detectron.pytorch (XP) |
mmdetection (V100 / XP) |
| RPN |
12.5 |
- |
14.5 / 15.4 |
| Faster R-CNN |
10.3 |
|
9.9 / 9.8 |
| Mask R-CNN |
8.5 |
|
7.7 / 7.4 |
| Fast R-CNN |
12.5 |
|
14.5 / 14.1 |
| Fast R-CNN (w/mask) |
9.9 |
|
10.6 / 10.3 |
### Training memory
We perform various tests and there is no doubt that mmdetection is more memory
efficient than Detectron, and the main cause is the deep learning framework itself, not our efforts.
Besides, Caffe2 and PyTorch have different apis to obtain memory usage
whose implementation is not exactly the same.
`nvidia-smi` shows a larger memory usage for both detectron and mmdetection, e.g.,
we observe a much higher memory usage when we train Mask R-CNN with 2 images per GPU using detectron (10.6G) and mmdetection (9.3G), which is obviously more than actually required.
> With mmdetection, we can train R-50 FPN Mask R-CNN with **4** images per GPU (TITAN XP, 12G),
which is a promising result.
================================================
FILE: README.md
================================================
# Environments
- pytorch 0.3.0/0.4.1
- oldest mmdetection
# Reasoning-RCNN
Reasoning-RCNN: Unifying Adaptive Global Reasoning into Large-scale Object Detection (CVPR2019 Oral)
```
# core files
configs/rrcnn/*
mmdet/models/detectors/reasoning_rcnn.py
mmdet/models/bbox_heads/graph_bbox_head.py
```
# SGRN
Spatial-Aware Graph Relation Network for Large-Scale Object Detection (CVPR2019)
```
# core files
configs/coco_sgrb_fpn_ms.py
mmdet/models/detectors/sgrn.py
mmdet/models/bbox_heads/convfc_bbox_head_enhanced.py
```
================================================
FILE: TECHNICAL_DETAILS.md
================================================
## Overview
In this section, we will introduce the main units of training a detector:
data loading, model and iteration pipeline.
## Data loading
Following typical conventions, we use `Dataset` and `DataLoader` for data loading
with multiple workers. `Dataset` returns a dict of data items corresponding
the arguments of models' forward method.
Since the data in object detection may not be the same size (image size, gt bbox size, etc.),
we introduce a new `DataContainer` type in `mmcv` to help collect and distribute
data of different size.
See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details.
## Model
In mmdetection, model components are basically categorized as 4 types.
- backbone: usually a FCN network to extract feature maps, e.g., ResNet.
- neck: the part between backbones and heads, e.g., FPN, ASPP.
- head: the part for specific tasks, e.g., bbox prediction and mask prediction.
- roi extractor: the part for extracting features from feature maps, e.g., RoI Align.
We also write implement some general detection pipelines with the above components,
such as `SingleStageDetector` and `TwoStageDetector`.
### Build a model with basic components
Following some basic pipelines (e.g., two-stage detectors), the model structure
can be customized through config files with no pains.
If we want to implement some new components, e.g, the path aggregation
FPN structure in [Path Aggregation Network for Instance Segmentation](https://arxiv.org/abs/1803.01534), there are two things to do.
1. create a new file in `mmdet/models/necks/pafpn.py`.
```python
class PAFPN(nn.Module):
def __init__(self,
in_channels,
out_channels,
num_outs,
start_level=0,
end_level=-1,
add_extra_convs=False):
pass
def forward(self, inputs):
# implementation is ignored
pass
```
2. modify the config file from
```python
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5)
```
to
```python
neck=dict(
type='PAFPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5)
```
We will release more components (backbones, necks, heads) for research purpose.
### Write a new model
To write a new detection pipeline, you need to inherit from `BaseDetector`,
which defines the following abstract methods.
- `extract_feat()`: given an image batch of shape (n, c, h, w), extract the feature map(s).
- `forward_train()`: forward method of the training mode
- `simple_test()`: single scale testing without augmentation
- `aug_test()`: testing with augmentation (multi-scale, flip, etc.)
[TwoStageDetector](https://github.com/hellock/mmdetection/blob/master/mmdet/models/detectors/two_stage.py)
is a good example which shows how to do that.
## Iteration pipeline
We adopt distributed training for both single machine and multiple machines.
Supposing that the server has 8 GPUs, 8 processes will be started and each process runs on a single GPU.
Each process keeps an isolated model, data loader, and optimizer.
Model parameters are only synchronized once at the begining.
After a forward and backward pass, gradients will be allreduced among all GPUs,
and the optimizer will update model parameters.
Since the gradients are allreduced, the model parameter stays the same for all processes after the iteration.
================================================
FILE: compile.sh
================================================
#!/usr/bin/env bash
PYTHON=${PYTHON:-"python"}
echo "Building roi align op..."
cd mmdet/ops/roi_align
if [ -d "build" ]; then
rm -r build
fi
$PYTHON setup.py build_ext --inplace
echo "Building roi pool op..."
cd ../roi_pool
if [ -d "build" ]; then
rm -r build
fi
$PYTHON setup.py build_ext --inplace
echo "Building nms op..."
cd ../nms
make clean
make PYTHON=${PYTHON}
echo "Building dcn..."
cd ../dcn
if [ -d "build" ]; then
rm -r build
fi
$PYTHON setup.py build_ext --inplace
================================================
FILE: configs/ade_faster_rcnn_r101_fpn_1x.py
================================================
# model settings
model = dict(
type='FasterRCNN',
pretrained='modelzoo://resnet101',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='SharedFCBBoxHead',
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=446,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False))
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
)
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/home/cyan/data/Detection/ADE_new/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'train.json',
img_prefix=data_root + 'train/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'val.json',
img_prefix=data_root + 'val/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'val.json',
img_prefix=data_root + 'val/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/faster_rcnn_r101_fpn_1x'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/coco_faster_rcnn_r101_fpn_1x.py
================================================
# model settings
model = dict(
type='FasterRCNN',
pretrained='modelzoo://resnet101',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='SharedFCBBoxHead',
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=81,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False))
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSamplerFixnum',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
)
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/home/cyan/data/Detection/coco2017/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'images/train2017/',
img_scale=[(1333,600),(1333,1000)],
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'images/val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/faster_rcnn_r101_fpn_1x'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/coco_sgrb_fpn_ms.py
================================================
# model settings
model = dict(
type='ThreeStageGraphDetector',
pretrained='modelzoo://resnet101',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=[dict(
type='SharedFCRoIHead',
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=81,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False),
dict(
type='ConvFCRoIHeadEnhance',
enhance_channels=256,
num_shared_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=81,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False)
],
graph_convolution=dict(
latent_graph_channel=256,
n_kernels_gc=8,
n_graph_node=512,
neigh_size=32)
)
# model training and testing settings
train_cfg = dict(
rpn=dict(
pos_fraction=0.5,
pos_balance_sampling=False,
neg_pos_ub=256,
allowed_border=0,
crowd_thr=1.1,
anchor_batch_size=256,
pos_iou_thr=0.7,
neg_iou_thr=0.3,
neg_balance_thr=0,
min_pos_iou=0.3,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=dict(
pos_iou_thr=0.5,
neg_iou_thr=0.5,
crowd_thr=1.1,
roi_batch_size=512,
add_gt_as_proposals=False,
pos_fraction=0.25,
pos_balance_sampling=False,
neg_pos_ub=512,
neg_balance_thr=0,
min_pos_iou=0.5,
pos_weight=-1,
debug=False),
rcnn2=dict(
pos_iou_thr=0.6,
neg_iou_thr=0.6,
crowd_thr=1.1,
roi_batch_size=512,
add_gt_as_proposals=False,
pos_fraction=0.25,
pos_balance_sampling=False,
neg_pos_ub=512,
neg_balance_thr=0,
min_pos_iou=0.5,
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(score_thr=0.001, max_per_img=150, nms_thr=0.55))
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/home/xuhang/data/detection_data/COCO2017/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=1,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=[(1333, 600),(1333, 1000)],
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[4])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 5
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/faster_rcnn_r101_fpn_1x'
#load_from = None
#resume_from = './exps/coco_three_stage_graph_fpn_ms/epoch_12.pth'
load_from = './tools/transfer_domian/model/vg_transfer_coco.pth'
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/hkrm/ade_faster_rcnn_r50_fpn_1x.py
================================================
# model settings
model = dict(
type='HKRMRCNN',
pretrained='modelzoo://resnet50',
adja_gt='/home/cyan/code/mmdetection/tools/graph/new_ade_graph_a.pkl',
adjr_gt='/home/cyan/code/mmdetection/tools/graph/new_ade_graph_r.pkl',
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='GraphBBoxHead',
roi_feat_size=7,
num_shared_fcs=2,
in_channels=256,
fc_out_channels=1024,
num_classes=446,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False,
num_attr_conv=4,
num_rela_conv=4,
num_spat_conv=2,
with_attr=True,
with_rela=True,
with_spat=True,))
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSamplerFixnum',
num=256,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
)
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/home/cyan/data/Detection/ADE_new/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'train.json',
img_prefix=data_root + 'train/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'val.json',
img_prefix=data_root + 'val/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'val.json',
img_prefix=data_root + 'val/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/hkrm_r101_fpn_1x'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/hkrm/coco_faster_rcnn_r101_fpn_1x.py
================================================
# model settings
model = dict(
type='HKRMRCNN',
adja_gt= './graph/new_ade_graph_a.pkl',
adjr_gt= './graph/new_ade_graph_r.pkl',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='GraphBBoxHead',
roi_feat_size=7,
num_shared_fcs=2,
in_channels=256,
fc_out_channels=1024,
num_classes=81,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False,
num_attr_conv=4,
num_rela_conv=4,
num_spat_conv=2,
with_attr=True,
with_rela=True,
with_spat=True,))
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSamplerFixnum',
num=256,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
)
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/home/cyan/data/Detection/coco2017/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'images/train2017/',
img_scale=[(1333,600),(1333,1000)],
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'images/val2017/',
img_scale=(800, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/hkrm_r101_fpn_1x'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/hkrm/vg_faster_rcnn_r101_fpn_1x.py
================================================
# model settings
model = dict(
type='HKRMRCNN',
#pretrained='modelzoo://resnet101',
adja_gt='./graph/new_vg_graph_a.pkl',
adjr_gt='./graph/new_vg_graph_r.pkl',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='GraphBBoxHead',
roi_feat_size=7,
num_shared_fcs=2,
in_channels=256,
fc_out_channels=1024,
num_classes=1001,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False,
num_attr_conv=4,
num_rela_conv=4,
num_spat_conv=2,
with_attr=True,
with_rela=True,
with_spat=True, ))
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSamplerFixnum',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=512,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
)
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/home/cyan/data/Detection/vg/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'train.json',
img_prefix=data_root + 'train/',
img_scale=(1333, 200),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'val.json',
img_prefix=data_root + 'val/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/val.json',
img_prefix=data_root + 'VG/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/hkrm_r101_fpn_1x'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py
================================================
# model settings
model = dict(
type='FasterRCNN',
pretrained='modelzoo://resnet50',
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='SharedFCBBoxHead',
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=21,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False))
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
)
# dataset settings
dataset_type = 'VOCDataset'
data_root = 'data/VOCdevkit/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type='RepeatDataset', # to avoid reloading datasets frequently
times=3,
dataset=dict(
type=dataset_type,
ann_file=[
data_root + 'VOC2007/ImageSets/Main/trainval.txt',
data_root + 'VOC2012/ImageSets/Main/trainval.txt'
],
img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'],
img_scale=(1000, 600),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True)),
val=dict(
type=dataset_type,
ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
img_prefix=data_root + 'VOC2007/',
img_scale=(1000, 600),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
img_prefix=data_root + 'VOC2007/',
img_scale=(1000, 600),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[3]) # actual epoch = 3 * 3 = 9
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 4 # actual epoch = 4 * 3 = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/faster_rcnn_r50_fpn_1x_voc0712'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/pascal_voc/ssd300_voc.py
================================================
# model settings
input_size = 300
model = dict(
type='SingleStageDetector',
pretrained='open-mmlab://vgg16_caffe',
backbone=dict(
type='SSDVGG',
input_size=input_size,
depth=16,
with_last_pool=False,
ceil_mode=True,
out_indices=(3, 4),
out_feature_indices=(22, 34),
l2_norm_scale=20),
neck=None,
bbox_head=dict(
type='SSDHead',
input_size=input_size,
in_channels=(512, 1024, 512, 256, 256, 256),
num_classes=21,
anchor_strides=(8, 16, 32, 64, 100, 300),
basesize_ratio_range=(0.2, 0.9),
anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
target_means=(.0, .0, .0, .0),
target_stds=(0.1, 0.1, 0.2, 0.2)))
cudnn_benchmark = True
train_cfg = dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.,
ignore_iof_thr=-1,
gt_max_assign_all=False),
smoothl1_beta=1.,
allowed_border=-1,
pos_weight=-1,
neg_pos_ratio=3,
debug=False)
test_cfg = dict(
nms=dict(type='nms', iou_thr=0.45),
min_bbox_size=0,
score_thr=0.02,
max_per_img=200)
# model training and testing settings
# dataset settings
dataset_type = 'VOCDataset'
data_root = 'data/VOCdevkit/'
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
data = dict(
imgs_per_gpu=4,
workers_per_gpu=2,
train=dict(
type='RepeatDataset',
times=10,
dataset=dict(
type=dataset_type,
ann_file=[
data_root + 'VOC2007/ImageSets/Main/trainval.txt',
data_root + 'VOC2012/ImageSets/Main/trainval.txt'
],
img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'],
img_scale=(300, 300),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=True,
test_mode=False,
extra_aug=dict(
photo_metric_distortion=dict(
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18),
expand=dict(
mean=img_norm_cfg['mean'],
to_rgb=img_norm_cfg['to_rgb'],
ratio_range=(1, 4)),
random_crop=dict(
min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)),
resize_keep_ratio=False)),
val=dict(
type=dataset_type,
ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
img_prefix=data_root + 'VOC2007/',
img_scale=(300, 300),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True,
resize_keep_ratio=False),
test=dict(
type=dataset_type,
ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
img_prefix=data_root + 'VOC2007/',
img_scale=(300, 300),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True,
resize_keep_ratio=False))
# optimizer
optimizer = dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4)
optimizer_config = dict()
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[16, 20])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 24
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/ssd300_voc'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/pascal_voc/ssd512_voc.py
================================================
# model settings
input_size = 512
model = dict(
type='SingleStageDetector',
pretrained='open-mmlab://vgg16_caffe',
backbone=dict(
type='SSDVGG',
input_size=input_size,
depth=16,
with_last_pool=False,
ceil_mode=True,
out_indices=(3, 4),
out_feature_indices=(22, 34),
l2_norm_scale=20),
neck=None,
bbox_head=dict(
type='SSDHead',
input_size=input_size,
in_channels=(512, 1024, 512, 256, 256, 256, 256),
num_classes=21,
anchor_strides=(8, 16, 32, 64, 128, 256, 512),
basesize_ratio_range=(0.15, 0.9),
anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]),
target_means=(.0, .0, .0, .0),
target_stds=(0.1, 0.1, 0.2, 0.2)))
cudnn_benchmark = True
train_cfg = dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.,
ignore_iof_thr=-1,
gt_max_assign_all=False),
smoothl1_beta=1.,
allowed_border=-1,
pos_weight=-1,
neg_pos_ratio=3,
debug=False)
test_cfg = dict(
nms=dict(type='nms', iou_thr=0.45),
min_bbox_size=0,
score_thr=0.02,
max_per_img=200)
# model training and testing settings
# dataset settings
dataset_type = 'VOCDataset'
data_root = 'data/VOCdevkit/'
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
data = dict(
imgs_per_gpu=4,
workers_per_gpu=2,
train=dict(
type='RepeatDataset',
times=10,
dataset=dict(
type=dataset_type,
ann_file=[
data_root + 'VOC2007/ImageSets/Main/trainval.txt',
data_root + 'VOC2012/ImageSets/Main/trainval.txt'
],
img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'],
img_scale=(512, 512),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=True,
test_mode=False,
extra_aug=dict(
photo_metric_distortion=dict(
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18),
expand=dict(
mean=img_norm_cfg['mean'],
to_rgb=img_norm_cfg['to_rgb'],
ratio_range=(1, 4)),
random_crop=dict(
min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)),
resize_keep_ratio=False)),
val=dict(
type=dataset_type,
ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
img_prefix=data_root + 'VOC2007/',
img_scale=(512, 512),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True,
resize_keep_ratio=False),
test=dict(
type=dataset_type,
ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
img_prefix=data_root + 'VOC2007/',
img_scale=(512, 512),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True,
resize_keep_ratio=False))
# optimizer
optimizer = dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4)
optimizer_config = dict()
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[16, 20])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 24
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/ssd512_voc'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/rrcnn/ade_reasoning_rcnn_r101_fpn_1x.py
================================================
# model settings
model = dict(
type='ReasoningRCNN',
num_stages=2,
adj_gt='./graph/new_ade_graph_r.pkl',
graph_out_channels=256,
pretrained='modelzoo://resnet101',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=[
dict(
type='BBoxHead',
with_avg_pool=False,
in_channels=1024,
roi_feat_size=1,
num_classes=446,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=True),
dict(
type='BBoxHead',
with_avg_pool=False,
in_channels=1280,
roi_feat_size=1,
num_classes=446,
target_means=[0., 0., 0., 0.],
target_stds=[0.05, 0.05, 0.1, 0.1],
reg_class_agnostic=True)
])
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=[
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSamplerFixnum',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.6,
neg_iou_thr=0.6,
min_pos_iou=0.6,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSamplerFixnum',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False)
],
stage_loss_weights=[1, 0.5])
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.6,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100),
keep_all_stages=True)
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/home/cyan/data/Detection/ADE_new/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'train.json',
img_prefix=data_root + 'train/',
img_scale=(1333, 200),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'val.json',
img_prefix=data_root + 'val/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'val.json',
img_prefix=data_root + 'val/',
img_scale=(800, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/rrcnn_r101_fpn_1x'
load_from = './work_dirs/ade_fpn_r101/pretrained_model.pth'
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/rrcnn/coco_reasoning_rcnn_r101_fpn_1x.py
================================================
# model settings
model = dict(
type='ReasoningRCNN',
num_stages=2,
adj_gt='./graph/new_COCO_graph_r.pkl', # relation graph: './graph/new_ade_graph_r.pkl'
graph_out_channels=256,
pretrained='modelzoo://resnet101',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=[
dict(
type='BBoxHead',
with_avg_pool=False,
in_channels=1024,
roi_feat_size=1,
num_classes=81,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=True),
dict(
type='BBoxHead',
with_avg_pool=False,
in_channels=1280,
roi_feat_size=1,
num_classes=81,
target_means=[0., 0., 0., 0.],
target_stds=[0.05, 0.05, 0.1, 0.1],
reg_class_agnostic=True)
])
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=[
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSamplerFixnum',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.6,
neg_iou_thr=0.6,
min_pos_iou=0.6,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSamplerFixnum',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False)
],
stage_loss_weights=[1, 0.5])
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.6,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100),
keep_all_stages=True)
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/home/cyan/data/Detection/coco2017/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'images/train2017/',
img_scale=(1333, 200),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'images/val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'images/val2017',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/rrcnn_r101_fpn_1x'
load_from = './work_dirs/coco_fpn_r101/pretrained_model.pth'
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/rrcnn/vg_reasoning_rcnn_r101_fpn_1x.py
================================================
# model settings
model = dict(
type='ReasoningRCNN',
num_stages=2,
adj_gt='./graph/new_COCO_graph_r.pkl',
graph_out_channels=256,
pretrained='modelzoo://resnet101',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=[
dict(
type='BBoxHead',
with_avg_pool=False,
in_channels=1024,
roi_feat_size=1,
num_classes=3001,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=True),
dict(
type='BBoxHead',
with_avg_pool=False,
in_channels=1280,
roi_feat_size=1,
num_classes=3001,
target_means=[0., 0., 0., 0.],
target_stds=[0.05, 0.05, 0.1, 0.1],
reg_class_agnostic=True)
])
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=[
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSamplerFixnum',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.6,
neg_iou_thr=0.6,
min_pos_iou=0.6,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSamplerFixnum',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False)
],
stage_loss_weights=[1, 0.5])
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.6,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100),
keep_all_stages=True)
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/home/cyan/data/Detection/vg/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'images/train2017/',
img_scale=(1333, 200),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'images/val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/val_big.json',
img_prefix=data_root + 'VG',
img_scale=(800, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/rrcnn_r101_fpn_1x'
load_from = './work_dirs/vg_fpn_r101/pretrained_model.pth'
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/vg_faster_rcnn_r101_fpn_1x.py
================================================
# model settings
model = dict(
type='FasterRCNN',
pretrained='modelzoo://resnet101',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='SharedFCBBoxHead',
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=1001,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False))
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
)
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/home/cyan/data/Detection/vg/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'train.json',
img_prefix=data_root + 'train/',
img_scale=(1333, 200),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'val.json',
img_prefix=data_root + 'val/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/val.json',
img_prefix=data_root + 'VG/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/faster_rcnn_r101_fpn_1x'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/vgbig_faster_rcnn_r101_fpn_1x.py
================================================
# model settings
model = dict(
type='FasterRCNN',
pretrained='modelzoo://resnet101',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
use_sigmoid_cls=True),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='SharedFCBBoxHead',
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=1001,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False))
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
smoothl1_beta=1 / 9.0,
debug=False),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
)
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/home/cyan/data/Detection/vg/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/train_big.json',
img_prefix=data_root + 'VG/',
img_scale=(1333, 200),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/val_big.json',
img_prefix=data_root + 'VG/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/val_big.json',
img_prefix=data_root + 'VG/',
img_scale=(1333, 400),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=True,
test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/faster_rcnn_r50_fpn_1x'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: mmdet/__init__.py
================================================
from .version import __version__, short_version
__all__ = ['__version__', 'short_version']
================================================
FILE: mmdet/apis/__init__.py
================================================
from .env import init_dist, get_root_logger, set_random_seed
from .train import train_detector
from .inference import inference_detector, show_result
__all__ = [
'init_dist', 'get_root_logger', 'set_random_seed', 'train_detector',
'inference_detector', 'show_result'
]
================================================
FILE: mmdet/apis/env.py
================================================
import logging
import os
import random
import numpy as np
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from mmcv.runner import get_dist_info
def init_dist(launcher, backend='nccl', **kwargs):
if mp.get_start_method(allow_none=True) is None:
mp.set_start_method('spawn')
if launcher == 'pytorch':
_init_dist_pytorch(backend, **kwargs)
elif launcher == 'mpi':
_init_dist_mpi(backend, **kwargs)
elif launcher == 'slurm':
_init_dist_slurm(backend, **kwargs)
else:
raise ValueError('Invalid launcher type: {}'.format(launcher))
def _init_dist_pytorch(backend, **kwargs):
# TODO: use local_rank instead of rank % num_gpus
rank = int(os.environ['RANK'])
num_gpus = torch.cuda.device_count()
torch.cuda.set_device(rank % num_gpus)
dist.init_process_group(backend=backend, **kwargs)
def _init_dist_mpi(backend, **kwargs):
raise NotImplementedError
def _init_dist_slurm(backend, **kwargs):
raise NotImplementedError
def set_random_seed(seed):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
def get_root_logger(log_level=logging.INFO):
logger = logging.getLogger()
if not logger.hasHandlers():
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s',
level=log_level)
rank, _ = get_dist_info()
if rank != 0:
logger.setLevel('ERROR')
return logger
================================================
FILE: mmdet/apis/inference.py
================================================
import mmcv
import numpy as np
import torch
from mmdet.datasets import to_tensor
from mmdet.datasets.transforms import ImageTransform
from mmdet.core import get_classes
def _prepare_data(img, img_transform, cfg, device):
ori_shape = img.shape
img, img_shape, pad_shape, scale_factor = img_transform(
img, scale=cfg.data.test.img_scale)
img = to_tensor(img).to(device).unsqueeze(0)
img_meta = [
dict(
ori_shape=ori_shape,
img_shape=img_shape,
pad_shape=pad_shape,
scale_factor=scale_factor,
flip=False)
]
return dict(img=[img], img_meta=[img_meta])
def _inference_single(model, img, img_transform, cfg, device):
img = mmcv.imread(img)
data = _prepare_data(img, img_transform, cfg, device)
with torch.no_grad():
result = model(return_loss=False, rescale=True, **data)
return result
def _inference_generator(model, imgs, img_transform, cfg, device):
for img in imgs:
yield _inference_single(model, img, img_transform, cfg, device)
def inference_detector(model, imgs, cfg, device='cuda:0'):
img_transform = ImageTransform(
size_divisor=cfg.data.test.size_divisor, **cfg.img_norm_cfg)
model = model.to(device)
model.eval()
if not isinstance(imgs, list):
return _inference_single(model, imgs, img_transform, cfg, device)
else:
return _inference_generator(model, imgs, img_transform, cfg, device)
def show_result(img, result, dataset='coco', score_thr=0.3):
class_names = get_classes(dataset)
labels = [
np.full(bbox.shape[0], i, dtype=np.int32)
for i, bbox in enumerate(result)
]
labels = np.concatenate(labels)
bboxes = np.vstack(result)
img = mmcv.imread(img)
mmcv.imshow_det_bboxes(
img.copy(),
bboxes,
labels,
class_names=class_names,
score_thr=score_thr)
================================================
FILE: mmdet/apis/train.py
================================================
from __future__ import division
from collections import OrderedDict
import torch
from mmcv.runner import Runner, DistSamplerSeedHook
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmdet.core import (DistOptimizerHook, DistEvalmAPHook,
CocoDistEvalRecallHook, CocoDistEvalmAPHook)
from mmdet.datasets import build_dataloader
from mmdet.models import RPN
from .env import get_root_logger
def parse_losses(losses):
log_vars = OrderedDict()
for loss_name, loss_value in losses.items():
if isinstance(loss_value, torch.Tensor):
log_vars[loss_name] = loss_value.mean()
elif isinstance(loss_value, list):
log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
else:
raise TypeError(
'{} is not a tensor or list of tensors'.format(loss_name))
loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key)
log_vars['loss'] = loss
for name in log_vars:
log_vars[name] = log_vars[name].item()
return loss, log_vars
def batch_processor(model, data, train_mode):
losses = model(**data)
loss, log_vars = parse_losses(losses)
outputs = dict(
loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
return outputs
def train_detector(model,
dataset,
cfg,
distributed=False,
validate=False,
logger=None):
if logger is None:
logger = get_root_logger(cfg.log_level)
# start training
if distributed:
_dist_train(model, dataset, cfg, validate=validate)
else:
_non_dist_train(model, dataset, cfg, validate=validate)
def _dist_train(model, dataset, cfg, validate=False):
# prepare data loaders
data_loaders = [
build_dataloader(
dataset,
cfg.data.imgs_per_gpu,
cfg.data.workers_per_gpu,
dist=True)
]
# put model on gpus
model = MMDistributedDataParallel(model.cuda())
# build runner
runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
cfg.log_level)
# register hooks
optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
runner.register_training_hooks(cfg.lr_config, optimizer_config,
cfg.checkpoint_config, cfg.log_config)
runner.register_hook(DistSamplerSeedHook())
# register eval hooks
if validate:
if isinstance(model.module, RPN):
# TODO: implement recall hooks for other datasets
runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
else:
if cfg.data.val.type == 'CocoDataset':
runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
else:
runner.register_hook(DistEvalmAPHook(cfg.data.val))
if cfg.resume_from:
runner.resume(cfg.resume_from)
elif cfg.load_from:
runner.load_checkpoint(cfg.load_from)
runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
# prepare data loaders
data_loaders = [
build_dataloader(
dataset,
cfg.data.imgs_per_gpu,
cfg.data.workers_per_gpu,
cfg.gpus,
dist=False)
]
# put model on gpus
model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
# build runner
runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
cfg.log_level)
runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
cfg.checkpoint_config, cfg.log_config)
if cfg.resume_from:
runner.resume(cfg.resume_from)
elif cfg.load_from:
runner.load_checkpoint(cfg.load_from)
runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
================================================
FILE: mmdet/core/__init__.py
================================================
from .anchor import * # noqa: F401, F403
from .bbox import * # noqa: F401, F403
from .mask import * # noqa: F401, F403
from .loss import * # noqa: F401, F403
from .evaluation import * # noqa: F401, F403
from .post_processing import * # noqa: F401, F403
from .utils import * # noqa: F401, F403
================================================
FILE: mmdet/core/anchor/__init__.py
================================================
from .anchor_generator import AnchorGenerator
from .anchor_target import anchor_target
__all__ = ['AnchorGenerator', 'anchor_target']
================================================
FILE: mmdet/core/anchor/anchor_generator.py
================================================
import torch
class AnchorGenerator(object):
def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None):
self.base_size = base_size
self.scales = torch.Tensor(scales)
self.ratios = torch.Tensor(ratios)
self.scale_major = scale_major
self.ctr = ctr
self.base_anchors = self.gen_base_anchors()
@property
def num_base_anchors(self):
return self.base_anchors.size(0)
def gen_base_anchors(self):
w = self.base_size
h = self.base_size
if self.ctr is None:
x_ctr = 0.5 * (w - 1)
y_ctr = 0.5 * (h - 1)
else:
x_ctr, y_ctr = self.ctr
h_ratios = torch.sqrt(self.ratios)
w_ratios = 1 / h_ratios
if self.scale_major:
ws = (w * w_ratios[:, None] * self.scales[None, :]).view(-1)
hs = (h * h_ratios[:, None] * self.scales[None, :]).view(-1)
else:
ws = (w * self.scales[:, None] * w_ratios[None, :]).view(-1)
hs = (h * self.scales[:, None] * h_ratios[None, :]).view(-1)
base_anchors = torch.stack(
[
x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1),
x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1)
],
dim=-1).round()
return base_anchors
def _meshgrid(self, x, y, row_major=True):
xx = x.repeat(len(y))
yy = y.view(-1, 1).repeat(1, len(x)).view(-1)
if row_major:
return xx, yy
else:
return yy, xx
def grid_anchors(self, featmap_size, stride=16, device='cuda'):
base_anchors = self.base_anchors.to(device)
feat_h, feat_w = featmap_size
shift_x = torch.arange(0, feat_w, device=device) * stride
shift_y = torch.arange(0, feat_h, device=device) * stride
shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1)
shifts = shifts.type_as(base_anchors)
# first feat_w elements correspond to the first row of shifts
# add A anchors (1, A, 4) to K shifts (K, 1, 4) to get
# shifted anchors (K, A, 4), reshape to (K*A, 4)
all_anchors = base_anchors[None, :, :] + shifts[:, None, :]
all_anchors = all_anchors.view(-1, 4)
# first A rows correspond to A anchors of (0, 0) in feature map,
# then (0, 1), (0, 2), ...
return all_anchors
def valid_flags(self, featmap_size, valid_size, device='cuda'):
feat_h, feat_w = featmap_size
valid_h, valid_w = valid_size
assert valid_h <= feat_h and valid_w <= feat_w
valid_x = torch.zeros(feat_w, dtype=torch.uint8, device=device)
valid_y = torch.zeros(feat_h, dtype=torch.uint8, device=device)
valid_x[:valid_w] = 1
valid_y[:valid_h] = 1
valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)
valid = valid_xx & valid_yy
valid = valid[:, None].expand(
valid.size(0), self.num_base_anchors).contiguous().view(-1)
return valid
================================================
FILE: mmdet/core/anchor/anchor_target.py
================================================
import torch
from ..bbox import assign_and_sample, build_assigner, PseudoSampler, bbox2delta
from ..utils import multi_apply
def anchor_target(anchor_list,
valid_flag_list,
gt_bboxes_list,
img_metas,
target_means,
target_stds,
cfg,
gt_labels_list=None,
label_channels=1,
sampling=True,
unmap_outputs=True):
"""Compute regression and classification targets for anchors.
Args:
anchor_list (list[list]): Multi level anchors of each image.
valid_flag_list (list[list]): Multi level valid flags of each image.
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
img_metas (list[dict]): Meta info of each image.
target_means (Iterable): Mean value of regression targets.
target_stds (Iterable): Std value of regression targets.
cfg (dict): RPN train configs.
Returns:
tuple
"""
num_imgs = len(img_metas)
assert len(anchor_list) == len(valid_flag_list) == num_imgs
# anchor number of multi levels
num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
# concat all level anchors and flags to a single tensor
for i in range(num_imgs):
assert len(anchor_list[i]) == len(valid_flag_list[i])
anchor_list[i] = torch.cat(anchor_list[i])
valid_flag_list[i] = torch.cat(valid_flag_list[i])
# compute targets for each image
if gt_labels_list is None:
gt_labels_list = [None for _ in range(num_imgs)]
(all_labels, all_label_weights, all_bbox_targets, all_bbox_weights,
pos_inds_list, neg_inds_list) = multi_apply(
anchor_target_single,
anchor_list,
valid_flag_list,
gt_bboxes_list,
gt_labels_list,
img_metas,
target_means=target_means,
target_stds=target_stds,
cfg=cfg,
label_channels=label_channels,
sampling=sampling,
unmap_outputs=unmap_outputs)
# no valid anchors
if any([labels is None for labels in all_labels]):
return None
# sampled anchors of all images
num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
# split targets to a list w.r.t. multiple levels
labels_list = images_to_levels(all_labels, num_level_anchors)
label_weights_list = images_to_levels(all_label_weights, num_level_anchors)
bbox_targets_list = images_to_levels(all_bbox_targets, num_level_anchors)
bbox_weights_list = images_to_levels(all_bbox_weights, num_level_anchors)
return (labels_list, label_weights_list, bbox_targets_list,
bbox_weights_list, num_total_pos, num_total_neg)
def images_to_levels(target, num_level_anchors):
"""Convert targets by image to targets by feature level.
[target_img0, target_img1] -> [target_level0, target_level1, ...]
"""
target = torch.stack(target, 0)
level_targets = []
start = 0
for n in num_level_anchors:
end = start + n
level_targets.append(target[:, start:end].squeeze(0))
start = end
return level_targets
def anchor_target_single(flat_anchors,
valid_flags,
gt_bboxes,
gt_labels,
img_meta,
target_means,
target_stds,
cfg,
label_channels=1,
sampling=True,
unmap_outputs=True):
inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
img_meta['img_shape'][:2],
cfg.allowed_border)
if not inside_flags.any():
return (None, ) * 6
# assign gt and sample anchors
anchors = flat_anchors[inside_flags, :]
if sampling:
assign_result, sampling_result = assign_and_sample(
anchors, gt_bboxes, None, None, cfg)
else:
bbox_assigner = build_assigner(cfg.assigner)
assign_result = bbox_assigner.assign(anchors, gt_bboxes, None,
gt_labels)
bbox_sampler = PseudoSampler()
sampling_result = bbox_sampler.sample(assign_result, anchors,
gt_bboxes)
num_valid_anchors = anchors.shape[0]
bbox_targets = torch.zeros_like(anchors)
bbox_weights = torch.zeros_like(anchors)
labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long)
label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
if len(pos_inds) > 0:
pos_bbox_targets = bbox2delta(sampling_result.pos_bboxes,
sampling_result.pos_gt_bboxes,
target_means, target_stds)
bbox_targets[pos_inds, :] = pos_bbox_targets
bbox_weights[pos_inds, :] = 1.0
if gt_labels is None:
labels[pos_inds] = 1
else:
labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
if cfg.pos_weight <= 0:
label_weights[pos_inds] = 1.0
else:
label_weights[pos_inds] = cfg.pos_weight
if len(neg_inds) > 0:
label_weights[neg_inds] = 1.0
# map up to original set of anchors
if unmap_outputs:
num_total_anchors = flat_anchors.size(0)
labels = unmap(labels, num_total_anchors, inside_flags)
label_weights = unmap(label_weights, num_total_anchors, inside_flags)
if label_channels > 1:
labels, label_weights = expand_binary_labels(
labels, label_weights, label_channels)
bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
neg_inds)
def expand_binary_labels(labels, label_weights, label_channels):
bin_labels = labels.new_full((labels.size(0), label_channels), 0)
inds = torch.nonzero(labels >= 1).squeeze()
if inds.numel() > 0:
bin_labels[inds, labels[inds] - 1] = 1
bin_label_weights = label_weights.view(-1, 1).expand(
label_weights.size(0), label_channels)
return bin_labels, bin_label_weights
def anchor_inside_flags(flat_anchors, valid_flags, img_shape,
allowed_border=0):
img_h, img_w = img_shape[:2]
if allowed_border >= 0:
inside_flags = valid_flags & \
(flat_anchors[:, 0] >= -allowed_border) & \
(flat_anchors[:, 1] >= -allowed_border) & \
(flat_anchors[:, 2] < img_w + allowed_border) & \
(flat_anchors[:, 3] < img_h + allowed_border)
else:
inside_flags = valid_flags
return inside_flags
def unmap(data, count, inds, fill=0):
""" Unmap a subset of item (data) back to the original set of items (of
size count) """
if data.dim() == 1:
ret = data.new_full((count, ), fill)
ret[inds] = data
else:
new_size = (count, ) + data.size()[1:]
ret = data.new_full(new_size, fill)
ret[inds, :] = data
return ret
================================================
FILE: mmdet/core/bbox/__init__.py
================================================
from .geometry import bbox_overlaps
from .assigners import BaseAssigner, MaxIoUAssigner, AssignResult
from .samplers import (BaseSampler, PseudoSampler, RandomSampler,
InstanceBalancedPosSampler, IoUBalancedNegSampler,
CombinedSampler, SamplingResult, RandomSamplerFixnum)
from .assign_sampling import build_assigner, build_sampler, assign_and_sample
from .transforms import (bbox2delta, delta2bbox, bbox_flip, bbox_mapping,
bbox_mapping_back, bbox2roi, roi2bbox, bbox2result)
from .bbox_target import bbox_target
__all__ = [
'bbox_overlaps', 'BaseAssigner', 'MaxIoUAssigner', 'AssignResult',
'BaseSampler', 'PseudoSampler', 'RandomSampler',
'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler',
'SamplingResult', 'build_assigner', 'build_sampler', 'assign_and_sample',
'bbox2delta', 'delta2bbox', 'bbox_flip', 'bbox_mapping',
'bbox_mapping_back', 'bbox2roi', 'roi2bbox', 'bbox2result', 'bbox_target',
'RandomSamplerFixnum'
]
================================================
FILE: mmdet/core/bbox/assign_sampling.py
================================================
import mmcv
from . import assigners, samplers
def build_assigner(cfg, **kwargs):
if isinstance(cfg, assigners.BaseAssigner):
return cfg
elif isinstance(cfg, dict):
return mmcv.runner.obj_from_dict(
cfg, assigners, default_args=kwargs)
else:
raise TypeError('Invalid type {} for building a sampler'.format(
type(cfg)))
def build_sampler(cfg, **kwargs):
if isinstance(cfg, samplers.BaseSampler):
return cfg
elif isinstance(cfg, dict):
return mmcv.runner.obj_from_dict(
cfg, samplers, default_args=kwargs)
else:
raise TypeError('Invalid type {} for building a sampler'.format(
type(cfg)))
def assign_and_sample(bboxes, gt_bboxes, gt_bboxes_ignore, gt_labels, cfg):
bbox_assigner = build_assigner(cfg.assigner)
bbox_sampler = build_sampler(cfg.sampler)
assign_result = bbox_assigner.assign(bboxes, gt_bboxes, gt_bboxes_ignore,
gt_labels)
sampling_result = bbox_sampler.sample(assign_result, bboxes, gt_bboxes,
gt_labels)
return assign_result, sampling_result
================================================
FILE: mmdet/core/bbox/assigners/__init__.py
================================================
from .base_assigner import BaseAssigner
from .max_iou_assigner import MaxIoUAssigner
from .assign_result import AssignResult
__all__ = ['BaseAssigner', 'MaxIoUAssigner', 'AssignResult']
================================================
FILE: mmdet/core/bbox/assigners/assign_result.py
================================================
import torch
class AssignResult(object):
def __init__(self, num_gts, gt_inds, max_overlaps, labels=None):
self.num_gts = num_gts
self.gt_inds = gt_inds
self.max_overlaps = max_overlaps
self.labels = labels
def add_gt_(self, gt_labels):
self_inds = torch.arange(
1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device)
self.gt_inds = torch.cat([self_inds, self.gt_inds])
self.max_overlaps = torch.cat(
[self.max_overlaps.new_ones(self.num_gts), self.max_overlaps])
if self.labels is not None:
self.labels = torch.cat([gt_labels, self.labels])
================================================
FILE: mmdet/core/bbox/assigners/base_assigner.py
================================================
from abc import ABCMeta, abstractmethod
class BaseAssigner(metaclass=ABCMeta):
@abstractmethod
def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
pass
================================================
FILE: mmdet/core/bbox/assigners/max_iou_assigner.py
================================================
import torch
from .base_assigner import BaseAssigner
from .assign_result import AssignResult
from ..geometry import bbox_overlaps
class MaxIoUAssigner(BaseAssigner):
"""Assign a corresponding gt bbox or background to each bbox.
Each proposals will be assigned with `-1`, `0`, or a positive integer
indicating the ground truth index.
- -1: don't care
- 0: negative sample, no assigned gt
- positive integer: positive sample, index (1-based) of assigned gt
Args:
pos_iou_thr (float): IoU threshold for positive bboxes.
neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
min_pos_iou (float): Minimum iou for a bbox to be considered as a
positive bbox. Positive samples can have smaller IoU than
pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
gt_max_assign_all (bool): Whether to assign all bboxes with the same
highest overlap with some gt to that gt.
ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
`gt_bboxes_ignore` is specified). Negative values mean not
ignoring any bboxes.
"""
def __init__(self,
pos_iou_thr,
neg_iou_thr,
min_pos_iou=.0,
gt_max_assign_all=True,
ignore_iof_thr=-1):
self.pos_iou_thr = pos_iou_thr
self.neg_iou_thr = neg_iou_thr
self.min_pos_iou = min_pos_iou
self.gt_max_assign_all = gt_max_assign_all
self.ignore_iof_thr = ignore_iof_thr
def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
"""Assign gt to bboxes.
This method assign a gt bbox to every bbox (proposal/anchor), each bbox
will be assigned with -1, 0, or a positive number. -1 means don't care,
0 means negative sample, positive number is the index (1-based) of
assigned gt.
The assignment is done in following steps, the order matters.
1. assign every bbox to -1
2. assign proposals whose iou with all gts < neg_iou_thr to 0
3. for each bbox, if the iou with its nearest gt >= pos_iou_thr,
assign it to that bbox
4. for each gt bbox, assign its nearest proposals (may be more than
one) to itself
Args:
bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4).
gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
labelled as `ignored`, e.g., crowd boxes in COCO.
gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
Returns:
:obj:`AssignResult`: The assign result.
"""
if bboxes.shape[0] == 0 or gt_bboxes.shape[0] == 0:
raise ValueError('No gt or bboxes')
bboxes = bboxes[:, :4]
overlaps = bbox_overlaps(gt_bboxes, bboxes)
if (self.ignore_iof_thr > 0) and (gt_bboxes_ignore is not None) and (
gt_bboxes_ignore.numel() > 0):
ignore_overlaps = bbox_overlaps(
bboxes, gt_bboxes_ignore, mode='iof')
ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
ignore_bboxes_inds = torch.nonzero(
ignore_max_overlaps > self.ignore_iof_thr).squeeze()
if ignore_bboxes_inds.numel() > 0:
overlaps[ignore_bboxes_inds[:, 0], :] = -1
assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
return assign_result
def assign_wrt_overlaps(self, overlaps, gt_labels=None):
"""Assign w.r.t. the overlaps of bboxes with gts.
Args:
overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes,
shape(k, n).
gt_labels (Tensor, optional): Labels of k gt_bboxes, shape (k, ).
Returns:
:obj:`AssignResult`: The assign result.
"""
if overlaps.numel() == 0:
raise ValueError('No gt or proposals')
num_gts, num_bboxes = overlaps.size(0), overlaps.size(1)
# 1. assign -1 by default
assigned_gt_inds = overlaps.new_full(
(num_bboxes, ), -1, dtype=torch.long)
# for each anchor, which gt best overlaps with it
# for each anchor, the max iou of all gts
max_overlaps, argmax_overlaps = overlaps.max(dim=0)
# for each gt, which anchor best overlaps with it
# for each gt, the max iou of all proposals
gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1)
# 2. assign negative: below
if isinstance(self.neg_iou_thr, float):
assigned_gt_inds[(max_overlaps >= 0)
& (max_overlaps < self.neg_iou_thr)] = 0
elif isinstance(self.neg_iou_thr, tuple):
assert len(self.neg_iou_thr) == 2
assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0])
& (max_overlaps < self.neg_iou_thr[1])] = 0
# 3. assign positive: above positive IoU threshold
pos_inds = max_overlaps >= self.pos_iou_thr
assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1
# 4. assign fg: for each gt, proposals with highest IoU
for i in range(num_gts):
if gt_max_overlaps[i] >= self.min_pos_iou:
if self.gt_max_assign_all:
max_iou_inds = overlaps[i, :] == gt_max_overlaps[i]
assigned_gt_inds[max_iou_inds] = i + 1
else:
assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1
if gt_labels is not None:
assigned_labels = assigned_gt_inds.new_zeros((num_bboxes, ))
pos_inds = torch.nonzero(assigned_gt_inds > 0).squeeze()
if pos_inds.numel() > 0:
assigned_labels[pos_inds] = gt_labels[
assigned_gt_inds[pos_inds] - 1]
else:
assigned_labels = None
return AssignResult(
num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
================================================
FILE: mmdet/core/bbox/bbox_target.py
================================================
import torch
from .transforms import bbox2delta
from ..utils import multi_apply
def bbox_target(pos_bboxes_list,
neg_bboxes_list,
pos_gt_bboxes_list,
pos_gt_labels_list,
cfg,
reg_classes=1,
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
concat=True):
labels, label_weights, bbox_targets, bbox_weights = multi_apply(
bbox_target_single,
pos_bboxes_list,
neg_bboxes_list,
pos_gt_bboxes_list,
pos_gt_labels_list,
cfg=cfg,
reg_classes=reg_classes,
target_means=target_means,
target_stds=target_stds)
if concat:
labels = torch.cat(labels, 0)
label_weights = torch.cat(label_weights, 0)
bbox_targets = torch.cat(bbox_targets, 0)
bbox_weights = torch.cat(bbox_weights, 0)
return labels, label_weights, bbox_targets, bbox_weights
def bbox_target_single(pos_bboxes,
neg_bboxes,
pos_gt_bboxes,
pos_gt_labels,
cfg,
reg_classes=1,
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]):
num_pos = pos_bboxes.size(0)
num_neg = neg_bboxes.size(0)
num_samples = num_pos + num_neg
labels = pos_bboxes.new_zeros(num_samples, dtype=torch.long)
label_weights = pos_bboxes.new_zeros(num_samples)
bbox_targets = pos_bboxes.new_zeros(num_samples, 4)
bbox_weights = pos_bboxes.new_zeros(num_samples, 4)
if num_pos > 0:
labels[:num_pos] = pos_gt_labels
pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
label_weights[:num_pos] = pos_weight
pos_bbox_targets = bbox2delta(pos_bboxes, pos_gt_bboxes, target_means,
target_stds)
bbox_targets[:num_pos, :] = pos_bbox_targets
bbox_weights[:num_pos, :] = 1
if num_neg > 0:
label_weights[-num_neg:] = 1.0
if reg_classes > 1:
bbox_targets, bbox_weights = expand_target(bbox_targets, bbox_weights,
labels, reg_classes)
return labels, label_weights, bbox_targets, bbox_weights
def expand_target(bbox_targets, bbox_weights, labels, num_classes):
bbox_targets_expand = bbox_targets.new_zeros((bbox_targets.size(0),
4 * num_classes))
bbox_weights_expand = bbox_weights.new_zeros((bbox_weights.size(0),
4 * num_classes))
for i in torch.nonzero(labels > 0).squeeze(-1):
start, end = labels[i] * 4, (labels[i] + 1) * 4
bbox_targets_expand[i, start:end] = bbox_targets[i, :]
bbox_weights_expand[i, start:end] = bbox_weights[i, :]
return bbox_targets_expand, bbox_weights_expand
================================================
FILE: mmdet/core/bbox/geometry.py
================================================
import torch
def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False):
"""Calculate overlap between two set of bboxes.
If ``is_aligned`` is ``False``, then calculate the ious between each bbox
of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
bboxes1 and bboxes2.
Args:
bboxes1 (Tensor): shape (m, 4)
bboxes2 (Tensor): shape (n, 4), if is_aligned is ``True``, then m and n
must be equal.
mode (str): "iou" (intersection over union) or iof (intersection over
foreground).
Returns:
ious(Tensor): shape (m, n) if is_aligned == False else shape (m, 1)
"""
assert mode in ['iou', 'iof']
rows = bboxes1.size(0)
cols = bboxes2.size(0)
if is_aligned:
assert rows == cols
if rows * cols == 0:
return bboxes1.new(rows, 1) if is_aligned else bboxes1.new(rows, cols)
if is_aligned:
lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2]
rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2]
wh = (rb - lt + 1).clamp(min=0) # [rows, 2]
overlap = wh[:, 0] * wh[:, 1]
area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
bboxes1[:, 3] - bboxes1[:, 1] + 1)
if mode == 'iou':
area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
bboxes2[:, 3] - bboxes2[:, 1] + 1)
ious = overlap / (area1 + area2 - overlap)
else:
ious = overlap / area1
else:
lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2]
rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2]
wh = (rb - lt + 1).clamp(min=0) # [rows, cols, 2]
overlap = wh[:, :, 0] * wh[:, :, 1]
area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
bboxes1[:, 3] - bboxes1[:, 1] + 1)
if mode == 'iou':
area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
bboxes2[:, 3] - bboxes2[:, 1] + 1)
ious = overlap / (area1[:, None] + area2 - overlap)
else:
ious = overlap / (area1[:, None])
return ious
================================================
FILE: mmdet/core/bbox/samplers/__init__.py
================================================
from .base_sampler import BaseSampler
from .pseudo_sampler import PseudoSampler
from .random_sampler import RandomSampler
from .instance_balanced_pos_sampler import InstanceBalancedPosSampler
from .iou_balanced_neg_sampler import IoUBalancedNegSampler
from .combined_sampler import CombinedSampler
from .ohem_sampler import OHEMSampler
from .sampling_result import SamplingResult
from .random_sampler_fixnum import RandomSamplerFixnum
__all__ = [
'BaseSampler', 'PseudoSampler', 'RandomSampler',
'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler',
'OHEMSampler', 'SamplingResult', 'RandomSamplerFixnum'
]
================================================
FILE: mmdet/core/bbox/samplers/base_sampler.py
================================================
from abc import ABCMeta, abstractmethod
import torch
from .sampling_result import SamplingResult
class BaseSampler(metaclass=ABCMeta):
def __init__(self,
num,
pos_fraction,
neg_pos_ub=-1,
add_gt_as_proposals=True,
**kwargs):
self.num = num
self.pos_fraction = pos_fraction
self.neg_pos_ub = neg_pos_ub
self.add_gt_as_proposals = add_gt_as_proposals
self.pos_sampler = self
self.neg_sampler = self
@abstractmethod
def _sample_pos(self, assign_result, num_expected, **kwargs):
pass
@abstractmethod
def _sample_neg(self, assign_result, num_expected, **kwargs):
pass
def sample(self,
assign_result,
bboxes,
gt_bboxes,
gt_labels=None,
**kwargs):
"""Sample positive and negative bboxes.
This is a simple implementation of bbox sampling given candidates,
assigning results and ground truth bboxes.
Args:
assign_result (:obj:`AssignResult`): Bbox assigning results.
bboxes (Tensor): Boxes to be sampled from.
gt_bboxes (Tensor): Ground truth bboxes.
gt_labels (Tensor, optional): Class labels of ground truth bboxes.
Returns:
:obj:`SamplingResult`: Sampling result.
"""
bboxes = bboxes[:, :4]
gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8)
if self.add_gt_as_proposals:
bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
assign_result.add_gt_(gt_labels)
gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8)
gt_flags = torch.cat([gt_ones, gt_flags])
num_expected_pos = int(self.num * self.pos_fraction)
pos_inds = self.pos_sampler._sample_pos(
assign_result, num_expected_pos, bboxes=bboxes, **kwargs)
# We found that sampled indices have duplicated items occasionally.
# (may be a bug of PyTorch)
pos_inds = pos_inds.unique()
num_sampled_pos = pos_inds.numel()
num_expected_neg = self.num - num_sampled_pos
if self.neg_pos_ub >= 0:
_pos = max(1, num_sampled_pos)
neg_upper_bound = int(self.neg_pos_ub * _pos)
if num_expected_neg > neg_upper_bound:
num_expected_neg = neg_upper_bound
neg_inds = self.neg_sampler._sample_neg(
assign_result, num_expected_neg, bboxes=bboxes, **kwargs)
neg_inds = neg_inds.unique()
return SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
assign_result, gt_flags)
================================================
FILE: mmdet/core/bbox/samplers/combined_sampler.py
================================================
from .base_sampler import BaseSampler
from ..assign_sampling import build_sampler
class CombinedSampler(BaseSampler):
def __init__(self, pos_sampler, neg_sampler, **kwargs):
super(CombinedSampler, self).__init__(**kwargs)
self.pos_sampler = build_sampler(pos_sampler, **kwargs)
self.neg_sampler = build_sampler(neg_sampler, **kwargs)
def _sample_pos(self, **kwargs):
raise NotImplementedError
def _sample_neg(self, **kwargs):
raise NotImplementedError
================================================
FILE: mmdet/core/bbox/samplers/instance_balanced_pos_sampler.py
================================================
import numpy as np
import torch
from .random_sampler import RandomSampler
class InstanceBalancedPosSampler(RandomSampler):
def _sample_pos(self, assign_result, num_expected, **kwargs):
pos_inds = torch.nonzero(assign_result.gt_inds > 0)
if pos_inds.numel() != 0:
pos_inds = pos_inds.squeeze(1)
if pos_inds.numel() <= num_expected:
return pos_inds
else:
unique_gt_inds = assign_result.gt_inds[pos_inds].unique()
num_gts = len(unique_gt_inds)
num_per_gt = int(round(num_expected / float(num_gts)) + 1)
sampled_inds = []
for i in unique_gt_inds:
inds = torch.nonzero(assign_result.gt_inds == i.item())
if inds.numel() != 0:
inds = inds.squeeze(1)
else:
continue
if len(inds) > num_per_gt:
inds = self.random_choice(inds, num_per_gt)
sampled_inds.append(inds)
sampled_inds = torch.cat(sampled_inds)
if len(sampled_inds) < num_expected:
num_extra = num_expected - len(sampled_inds)
extra_inds = np.array(
list(set(pos_inds.cpu()) - set(sampled_inds.cpu())))
if len(extra_inds) > num_extra:
extra_inds = self.random_choice(extra_inds, num_extra)
extra_inds = torch.from_numpy(extra_inds).to(
assign_result.gt_inds.device).long()
sampled_inds = torch.cat([sampled_inds, extra_inds])
elif len(sampled_inds) > num_expected:
sampled_inds = self.random_choice(sampled_inds, num_expected)
return sampled_inds
================================================
FILE: mmdet/core/bbox/samplers/iou_balanced_neg_sampler.py
================================================
import numpy as np
import torch
from .random_sampler import RandomSampler
class IoUBalancedNegSampler(RandomSampler):
def __init__(self,
num,
pos_fraction,
hard_thr=0.1,
hard_fraction=0.5,
**kwargs):
super(IoUBalancedNegSampler, self).__init__(num, pos_fraction,
**kwargs)
assert hard_thr > 0
assert 0 < hard_fraction < 1
self.hard_thr = hard_thr
self.hard_fraction = hard_fraction
def _sample_neg(self, assign_result, num_expected, **kwargs):
neg_inds = torch.nonzero(assign_result.gt_inds == 0)
if neg_inds.numel() != 0:
neg_inds = neg_inds.squeeze(1)
if len(neg_inds) <= num_expected:
return neg_inds
else:
max_overlaps = assign_result.max_overlaps.cpu().numpy()
# balance sampling for negative samples
neg_set = set(neg_inds.cpu().numpy())
easy_set = set(
np.where(
np.logical_and(max_overlaps >= 0,
max_overlaps < self.hard_thr))[0])
hard_set = set(np.where(max_overlaps >= self.hard_thr)[0])
easy_neg_inds = list(easy_set & neg_set)
hard_neg_inds = list(hard_set & neg_set)
num_expected_hard = int(num_expected * self.hard_fraction)
if len(hard_neg_inds) > num_expected_hard:
sampled_hard_inds = self.random_choice(hard_neg_inds,
num_expected_hard)
else:
sampled_hard_inds = np.array(hard_neg_inds, dtype=np.int)
num_expected_easy = num_expected - len(sampled_hard_inds)
if len(easy_neg_inds) > num_expected_easy:
sampled_easy_inds = self.random_choice(easy_neg_inds,
num_expected_easy)
else:
sampled_easy_inds = np.array(easy_neg_inds, dtype=np.int)
sampled_inds = np.concatenate((sampled_easy_inds,
sampled_hard_inds))
if len(sampled_inds) < num_expected:
num_extra = num_expected - len(sampled_inds)
extra_inds = np.array(list(neg_set - set(sampled_inds)))
if len(extra_inds) > num_extra:
extra_inds = self.random_choice(extra_inds, num_extra)
sampled_inds = np.concatenate((sampled_inds, extra_inds))
sampled_inds = torch.from_numpy(sampled_inds).long().to(
assign_result.gt_inds.device)
return sampled_inds
================================================
FILE: mmdet/core/bbox/samplers/ohem_sampler.py
================================================
import torch
from .base_sampler import BaseSampler
from ..transforms import bbox2roi
class OHEMSampler(BaseSampler):
def __init__(self,
num,
pos_fraction,
context,
neg_pos_ub=-1,
add_gt_as_proposals=True,
**kwargs):
super(OHEMSampler, self).__init__(num, pos_fraction, neg_pos_ub,
add_gt_as_proposals)
self.bbox_roi_extractor = context.bbox_roi_extractor
self.bbox_head = context.bbox_head
def hard_mining(self, inds, num_expected, bboxes, labels, feats):
with torch.no_grad():
rois = bbox2roi([bboxes])
bbox_feats = self.bbox_roi_extractor(
feats[:self.bbox_roi_extractor.num_inputs], rois)
cls_score, _ = self.bbox_head(bbox_feats)
loss = self.bbox_head.loss(
cls_score=cls_score,
bbox_pred=None,
labels=labels,
label_weights=cls_score.new_ones(cls_score.size(0)),
bbox_targets=None,
bbox_weights=None,
reduce=False)['loss_cls']
_, topk_loss_inds = loss.topk(num_expected)
return inds[topk_loss_inds]
def _sample_pos(self,
assign_result,
num_expected,
bboxes=None,
feats=None,
**kwargs):
# Sample some hard positive samples
pos_inds = torch.nonzero(assign_result.gt_inds > 0)
if pos_inds.numel() != 0:
pos_inds = pos_inds.squeeze(1)
if pos_inds.numel() <= num_expected:
return pos_inds
else:
return self.hard_mining(pos_inds, num_expected, bboxes[pos_inds],
assign_result.labels[pos_inds], feats)
def _sample_neg(self,
assign_result,
num_expected,
bboxes=None,
feats=None,
**kwargs):
# Sample some hard negative samples
neg_inds = torch.nonzero(assign_result.gt_inds == 0)
if neg_inds.numel() != 0:
neg_inds = neg_inds.squeeze(1)
if len(neg_inds) <= num_expected:
return neg_inds
else:
return self.hard_mining(neg_inds, num_expected, bboxes[neg_inds],
assign_result.labels[neg_inds], feats)
================================================
FILE: mmdet/core/bbox/samplers/pseudo_sampler.py
================================================
import torch
from .base_sampler import BaseSampler
from .sampling_result import SamplingResult
class PseudoSampler(BaseSampler):
def __init__(self, **kwargs):
pass
def _sample_pos(self, **kwargs):
raise NotImplementedError
def _sample_neg(self, **kwargs):
raise NotImplementedError
def sample(self, assign_result, bboxes, gt_bboxes, **kwargs):
pos_inds = torch.nonzero(
assign_result.gt_inds > 0).squeeze(-1).unique()
neg_inds = torch.nonzero(
assign_result.gt_inds == 0).squeeze(-1).unique()
gt_flags = bboxes.new_zeros(bboxes.shape[0], dtype=torch.uint8)
sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
assign_result, gt_flags)
return sampling_result
================================================
FILE: mmdet/core/bbox/samplers/random_sampler.py
================================================
import numpy as np
import torch
from .base_sampler import BaseSampler
class RandomSampler(BaseSampler):
def __init__(self,
num,
pos_fraction,
neg_pos_ub=-1,
add_gt_as_proposals=True,
**kwargs):
super(RandomSampler, self).__init__(num, pos_fraction, neg_pos_ub,
add_gt_as_proposals)
@staticmethod
def random_choice(gallery, num):
"""Random select some elements from the gallery.
It seems that Pytorch's implementation is slower than numpy so we use
numpy to randperm the indices.
"""
assert len(gallery) >= num
if isinstance(gallery, list):
gallery = np.array(gallery)
cands = np.arange(len(gallery))
np.random.shuffle(cands)
rand_inds = cands[:num]
if not isinstance(gallery, np.ndarray):
rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device)
return gallery[rand_inds]
def _sample_pos(self, assign_result, num_expected, **kwargs):
"""Randomly sample some positive samples."""
pos_inds = torch.nonzero(assign_result.gt_inds > 0)
if pos_inds.numel() != 0:
pos_inds = pos_inds.squeeze(1)
if pos_inds.numel() <= num_expected:
return pos_inds
else:
return self.random_choice(pos_inds, num_expected)
def _sample_neg(self, assign_result, num_expected, **kwargs):
"""Randomly sample some negative samples."""
neg_inds = torch.nonzero(assign_result.gt_inds == 0)
if neg_inds.numel() != 0:
neg_inds = neg_inds.squeeze(1)
if len(neg_inds) <= num_expected:
return neg_inds
else:
return self.random_choice(neg_inds, num_expected)
================================================
FILE: mmdet/core/bbox/samplers/random_sampler_fixnum.py
================================================
import numpy as np
import torch
from .base_sampler import BaseSampler
from .sampling_result import SamplingResult
class RandomSamplerFixnum(BaseSampler):
def __init__(self,
num,
pos_fraction,
neg_pos_ub=-1,
add_gt_as_proposals=True,
**kwargs):
super(RandomSamplerFixnum, self).__init__(num, pos_fraction, neg_pos_ub,
add_gt_as_proposals)
@staticmethod
def random_choice(gallery, num):
"""Random select some elements from the gallery.
It seems that Pytorch's implementation is slower than numpy so we use
numpy to randperm the indices.
"""
assert len(gallery) >= num
if isinstance(gallery, list):
gallery = np.array(gallery)
cands = np.arange(len(gallery))
np.random.shuffle(cands)
rand_inds = cands[:num]
if not isinstance(gallery, np.ndarray):
rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device)
return gallery[rand_inds]
# def _sample_pos(self, assign_result, num_expected, **kwargs):
# """Randomly sample some positive samples."""
# pos_inds = torch.nonzero(assign_result.gt_inds > 0)
# if pos_inds.numel() != 0:
# pos_inds = pos_inds.squeeze(1)
# if pos_inds.numel() <= num_expected:
# return pos_inds
# else:
# return self.random_choice(pos_inds, num_expected)
def _sample_pos(self, assign_result, num_expected, **kwargs):
"""Balance sampling for positive bboxes/anchors.
1. calculate average positive num for each gt: num_per_gt
2. sample at most num_per_gt positives for each gt
3. random sampling from rest anchors if not enough fg
"""
pos_inds = torch.nonzero(assign_result.gt_inds > 0)
if pos_inds.numel() != 0:
pos_inds = pos_inds.squeeze(1)
if pos_inds.numel() <= num_expected:
repeat_ = num_expected // pos_inds.numel()
return torch.cat((pos_inds.repeat(repeat_), self.random_choice(pos_inds, num_expected % pos_inds.numel())))
else:
return self.random_choice(pos_inds, num_expected)
# def _sample_neg(self, assign_result, num_expected, **kwargs):
# """Randomly sample some negative samples."""
# neg_inds = torch.nonzero(assign_result.gt_inds == 0)
# if neg_inds.numel() != 0:
# neg_inds = neg_inds.squeeze(1)
# if len(neg_inds) <= num_expected:
# return neg_inds
# else:
# return self.random_choice(neg_inds, num_expected)
def _sample_neg(self, assign_result, num_expected, **kwargs):
"""Balance sampling for negative bboxes/anchors.
Negative samples are split into 2 set: hard (balance_thr <= iou <
neg_iou_thr) and easy(iou < balance_thr). The sampling ratio is controlled
by `hard_fraction`.
"""
neg_inds = torch.nonzero(assign_result.gt_inds == 0)
if neg_inds.numel() != 0:
neg_inds = neg_inds.squeeze(1)
if len(neg_inds) <= num_expected:
repeat_ = num_expected // neg_inds.numel()
return torch.cat((neg_inds.repeat(repeat_), self.random_choice(neg_inds, num_expected % neg_inds.numel())))
else:
return self.random_choice(neg_inds, num_expected)
def sample(self,
assign_result,
bboxes,
gt_bboxes,
gt_labels=None,
has_roi_score=False,
**kwargs):
"""Sample positive and negative bboxes.
This is a simple implementation of bbox sampling given candidates,
assigning results and ground truth bboxes.
Args:
assign_result (:obj:`AssignResult`): Bbox assigning results.
bboxes (Tensor): Boxes to be sampled from.
gt_bboxes (Tensor): Ground truth bboxes.
gt_labels (Tensor, optional): Class labels of ground truth bboxes.
Returns:
:obj:`SamplingResult`: Sampling result.
"""
if has_roi_score:
gt_bboxes_new = gt_bboxes.new_ones((gt_bboxes.shape[0], 5))
gt_bboxes_new[:, :4] = gt_bboxes
gt_bboxes = gt_bboxes_new
else:
bboxes = bboxes[:, :4]
gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8)
if self.add_gt_as_proposals:
bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
assign_result.add_gt_(gt_labels)
gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8)
gt_flags = torch.cat([gt_ones, gt_flags])
num_expected_pos = int(self.num * self.pos_fraction)
# sample pos inds must be fixed
pos_inds = self.pos_sampler._sample_pos(
assign_result, num_expected_pos, bboxes=bboxes, **kwargs)
# We found that sampled indices have duplicated items occasionally.
# (may be a bug of PyTorch)
# pos_inds = pos_inds.unique()
num_sampled_pos = pos_inds.numel()
num_expected_neg = self.num - num_sampled_pos
if self.neg_pos_ub >= 0:
_pos = max(1, num_sampled_pos)
neg_upper_bound = int(self.neg_pos_ub * _pos)
if num_expected_neg > neg_upper_bound:
num_expected_neg = neg_upper_bound
neg_inds = self.neg_sampler._sample_neg(
assign_result, num_expected_neg, bboxes=bboxes, **kwargs)
# neg_inds = neg_inds.unique()
return SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
assign_result, gt_flags)
================================================
FILE: mmdet/core/bbox/samplers/sampling_result.py
================================================
import torch
class SamplingResult(object):
def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result,
gt_flags):
self.pos_inds = pos_inds
self.neg_inds = neg_inds
self.pos_bboxes = bboxes[pos_inds]
self.neg_bboxes = bboxes[neg_inds]
self.pos_is_gt = gt_flags[pos_inds]
self.num_gts = gt_bboxes.shape[0]
self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds, :]
if assign_result.labels is not None:
self.pos_gt_labels = assign_result.labels[pos_inds]
else:
self.pos_gt_labels = None
@property
def bboxes(self):
return torch.cat([self.pos_bboxes, self.neg_bboxes])
================================================
FILE: mmdet/core/bbox/transforms.py
================================================
import mmcv
import numpy as np
import torch
def bbox2delta(proposals, gt, means=[0, 0, 0, 0], stds=[1, 1, 1, 1]):
assert proposals.size() == gt.size()
proposals = proposals.float()
gt = gt.float()
px = (proposals[..., 0] + proposals[..., 2]) * 0.5
py = (proposals[..., 1] + proposals[..., 3]) * 0.5
pw = proposals[..., 2] - proposals[..., 0] + 1.0
ph = proposals[..., 3] - proposals[..., 1] + 1.0
gx = (gt[..., 0] + gt[..., 2]) * 0.5
gy = (gt[..., 1] + gt[..., 3]) * 0.5
gw = gt[..., 2] - gt[..., 0] + 1.0
gh = gt[..., 3] - gt[..., 1] + 1.0
dx = (gx - px) / pw
dy = (gy - py) / ph
dw = torch.log(gw / pw)
dh = torch.log(gh / ph)
deltas = torch.stack([dx, dy, dw, dh], dim=-1)
means = deltas.new_tensor(means).unsqueeze(0)
stds = deltas.new_tensor(stds).unsqueeze(0)
deltas = deltas.sub_(means).div_(stds)
return deltas
def delta2bbox(rois,
deltas,
means=[0, 0, 0, 0],
stds=[1, 1, 1, 1],
max_shape=None,
wh_ratio_clip=16 / 1000):
means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4)
stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4)
denorm_deltas = deltas * stds + means
dx = denorm_deltas[:, 0::4]
dy = denorm_deltas[:, 1::4]
dw = denorm_deltas[:, 2::4]
dh = denorm_deltas[:, 3::4]
max_ratio = np.abs(np.log(wh_ratio_clip))
dw = dw.clamp(min=-max_ratio, max=max_ratio)
dh = dh.clamp(min=-max_ratio, max=max_ratio)
px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)
py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)
pw = (rois[:, 2] - rois[:, 0] + 1.0).unsqueeze(1).expand_as(dw)
ph = (rois[:, 3] - rois[:, 1] + 1.0).unsqueeze(1).expand_as(dh)
gw = pw * dw.exp()
gh = ph * dh.exp()
gx = torch.addcmul(px, 1, pw, dx) # gx = px + pw * dx
gy = torch.addcmul(py, 1, ph, dy) # gy = py + ph * dy
x1 = gx - gw * 0.5 + 0.5
y1 = gy - gh * 0.5 + 0.5
x2 = gx + gw * 0.5 - 0.5
y2 = gy + gh * 0.5 - 0.5
if max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1] - 1)
y1 = y1.clamp(min=0, max=max_shape[0] - 1)
x2 = x2.clamp(min=0, max=max_shape[1] - 1)
y2 = y2.clamp(min=0, max=max_shape[0] - 1)
bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas)
return bboxes
def bbox_flip(bboxes, img_shape):
"""Flip bboxes horizontally.
Args:
bboxes(Tensor or ndarray): Shape (..., 4*k)
img_shape(tuple): Image shape.
Returns:
Same type as `bboxes`: Flipped bboxes.
"""
if isinstance(bboxes, torch.Tensor):
assert bboxes.shape[-1] % 4 == 0
flipped = bboxes.clone()
flipped[:, 0::4] = img_shape[1] - bboxes[:, 2::4] - 1
flipped[:, 2::4] = img_shape[1] - bboxes[:, 0::4] - 1
return flipped
elif isinstance(bboxes, np.ndarray):
return mmcv.bbox_flip(bboxes, img_shape)
def bbox_mapping(bboxes, img_shape, scale_factor, flip):
"""Map bboxes from the original image scale to testing scale"""
new_bboxes = bboxes * scale_factor
if flip:
new_bboxes = bbox_flip(new_bboxes, img_shape)
return new_bboxes
def bbox_mapping_back(bboxes, img_shape, scale_factor, flip):
"""Map bboxes from testing scale to original image scale"""
new_bboxes = bbox_flip(bboxes, img_shape) if flip else bboxes
new_bboxes = new_bboxes / scale_factor
return new_bboxes
def bbox2roi(bbox_list):
"""Convert a list of bboxes to roi format.
Args:
bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
of images.
Returns:
Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
"""
rois_list = []
for img_id, bboxes in enumerate(bbox_list):
if bboxes.size(0) > 0:
img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1)
else:
rois = bboxes.new_zeros((0, 5))
rois_list.append(rois)
rois = torch.cat(rois_list, 0)
return rois
def roi2bbox(rois):
bbox_list = []
img_ids = torch.unique(rois[:, 0].cpu(), sorted=True)
for img_id in img_ids:
inds = (rois[:, 0] == img_id.item())
bbox = rois[inds, 1:]
bbox_list.append(bbox)
return bbox_list
def bbox2result(bboxes, labels, num_classes):
"""Convert detection results to a list of numpy arrays.
Args:
bboxes (Tensor): shape (n, 5)
labels (Tensor): shape (n, )
num_classes (int): class number, including background class
Returns:
list(ndarray): bbox results of each class
"""
if bboxes.shape[0] == 0:
return [
np.zeros((0, 5), dtype=np.float32) for i in range(num_classes - 1)
]
else:
bboxes = bboxes.cpu().numpy()
labels = labels.cpu().numpy()
return [bboxes[labels == i, :] for i in range(num_classes - 1)]
================================================
FILE: mmdet/core/evaluation/__init__.py
================================================
from .class_names import (voc_classes, imagenet_det_classes,
imagenet_vid_classes, coco_classes, dataset_aliases,
get_classes)
from .coco_utils import coco_eval, fast_eval_recall, results2json
from .eval_hooks import (DistEvalHook, DistEvalmAPHook, CocoDistEvalRecallHook,
CocoDistEvalmAPHook)
from .mean_ap import average_precision, eval_map, print_map_summary
from .recall import (eval_recalls, print_recall_summary, plot_num_recall,
plot_iou_recall)
__all__ = [
'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes',
'coco_classes', 'dataset_aliases', 'get_classes', 'coco_eval',
'fast_eval_recall', 'results2json', 'DistEvalHook', 'DistEvalmAPHook',
'CocoDistEvalRecallHook', 'CocoDistEvalmAPHook', 'average_precision',
'eval_map', 'print_map_summary', 'eval_recalls', 'print_recall_summary',
'plot_num_recall', 'plot_iou_recall'
]
================================================
FILE: mmdet/core/evaluation/bbox_overlaps.py
================================================
import numpy as np
def bbox_overlaps(bboxes1, bboxes2, mode='iou'):
"""Calculate the ious between each bbox of bboxes1 and bboxes2.
Args:
bboxes1(ndarray): shape (n, 4)
bboxes2(ndarray): shape (k, 4)
mode(str): iou (intersection over union) or iof (intersection
over foreground)
Returns:
ious(ndarray): shape (n, k)
"""
assert mode in ['iou', 'iof']
bboxes1 = bboxes1.astype(np.float32)
bboxes2 = bboxes2.astype(np.float32)
rows = bboxes1.shape[0]
cols = bboxes2.shape[0]
ious = np.zeros((rows, cols), dtype=np.float32)
if rows * cols == 0:
return ious
exchange = False
if bboxes1.shape[0] > bboxes2.shape[0]:
bboxes1, bboxes2 = bboxes2, bboxes1
ious = np.zeros((cols, rows), dtype=np.float32)
exchange = True
area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
bboxes1[:, 3] - bboxes1[:, 1] + 1)
area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
bboxes2[:, 3] - bboxes2[:, 1] + 1)
for i in range(bboxes1.shape[0]):
x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0])
y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1])
x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2])
y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3])
overlap = np.maximum(x_end - x_start + 1, 0) * np.maximum(
y_end - y_start + 1, 0)
if mode == 'iou':
union = area1[i] + area2 - overlap
else:
union = area1[i] if not exchange else area2
ious[i, :] = overlap / union
if exchange:
ious = ious.T
return ious
================================================
FILE: mmdet/core/evaluation/class_names.py
================================================
import mmcv
def voc_classes():
return [
'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
]
def imagenet_det_classes():
return [
'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo',
'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam',
'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap',
'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder',
'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito',
'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle',
'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker',
'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew',
'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper',
'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly',
'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig',
'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog',
'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart',
'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger',
'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim',
'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse',
'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle',
'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard',
'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can',
'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace',
'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume',
'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza',
'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine',
'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse',
'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator',
'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler',
'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver',
'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile',
'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula',
'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer',
'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine',
'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie',
'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet',
'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin',
'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft',
'whale', 'wine_bottle', 'zebra'
]
def imagenet_vid_classes():
return [
'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car',
'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda',
'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit',
'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle',
'watercraft', 'whale', 'zebra'
]
def coco_classes():
return [
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign',
'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard',
'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork',
'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair',
'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv',
'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave',
'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
'scissors', 'teddy_bear', 'hair_drier', 'toothbrush'
]
dataset_aliases = {
'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'],
'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'],
'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'],
'coco': ['coco', 'mscoco', 'ms_coco']
}
def get_classes(dataset):
"""Get class names of a dataset."""
alias2name = {}
for name, aliases in dataset_aliases.items():
for alias in aliases:
alias2name[alias] = name
if mmcv.is_str(dataset):
if dataset in alias2name:
labels = eval(alias2name[dataset] + '_classes()')
else:
raise ValueError('Unrecognized dataset: {}'.format(dataset))
else:
raise TypeError('dataset must a str, but got {}'.format(type(dataset)))
return labels
================================================
FILE: mmdet/core/evaluation/coco_utils.py
================================================
import mmcv
import numpy as np
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from .recall import eval_recalls
def coco_eval(result_file, result_types, coco, max_dets=(100, 300, 1000)):
for res_type in result_types:
assert res_type in [
'proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'
]
if mmcv.is_str(coco):
coco = COCO(coco)
assert isinstance(coco, COCO)
if result_types == ['proposal_fast']:
ar = fast_eval_recall(result_file, coco, np.array(max_dets))
for i, num in enumerate(max_dets):
print('AR@{}\t= {:.4f}'.format(num, ar[i]))
return
assert result_file.endswith('.json')
coco_dets = coco.loadRes(result_file)
img_ids = coco.getImgIds()
for res_type in result_types:
iou_type = 'bbox' if res_type == 'proposal' else res_type
cocoEval = COCOeval(coco, coco_dets, iou_type)
cocoEval.params.imgIds = img_ids
if res_type == 'proposal':
cocoEval.params.useCats = 0
cocoEval.params.maxDets = list(max_dets)
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
def fast_eval_recall(results,
coco,
max_dets,
iou_thrs=np.arange(0.5, 0.96, 0.05)):
if mmcv.is_str(results):
assert results.endswith('.pkl')
results = mmcv.load(results)
elif not isinstance(results, list):
raise TypeError(
'results must be a list of numpy arrays or a filename, not {}'.
format(type(results)))
gt_bboxes = []
img_ids = coco.getImgIds()
for i in range(len(img_ids)):
ann_ids = coco.getAnnIds(imgIds=img_ids[i])
ann_info = coco.loadAnns(ann_ids)
if len(ann_info) == 0:
gt_bboxes.append(np.zeros((0, 4)))
continue
bboxes = []
for ann in ann_info:
if ann.get('ignore', False) or ann['iscrowd']:
continue
x1, y1, w, h = ann['bbox']
bboxes.append([x1, y1, x1 + w - 1, y1 + h - 1])
bboxes = np.array(bboxes, dtype=np.float32)
if bboxes.shape[0] == 0:
bboxes = np.zeros((0, 4))
gt_bboxes.append(bboxes)
recalls = eval_recalls(
gt_bboxes, results, max_dets, iou_thrs, print_summary=False)
ar = recalls.mean(axis=1)
return ar
def xyxy2xywh(bbox):
_bbox = bbox.tolist()
return [
_bbox[0],
_bbox[1],
_bbox[2] - _bbox[0] + 1,
_bbox[3] - _bbox[1] + 1,
]
def proposal2json(dataset, results):
json_results = []
for idx in range(len(dataset)):
img_id = dataset.img_ids[idx]
bboxes = results[idx]
for i in range(bboxes.shape[0]):
data = dict()
data['image_id'] = img_id
data['bbox'] = xyxy2xywh(bboxes[i])
data['score'] = float(bboxes[i][4])
data['category_id'] = 1
json_results.append(data)
return json_results
def det2json(dataset, results):
json_results = []
for idx in range(len(dataset)):
img_id = dataset.img_ids[idx]
result = results[idx]
for label in range(len(result)):
bboxes = result[label]
for i in range(bboxes.shape[0]):
data = dict()
data['image_id'] = img_id
data['bbox'] = xyxy2xywh(bboxes[i])
data['score'] = float(bboxes[i][4])
data['category_id'] = dataset.cat_ids[label]
json_results.append(data)
return json_results
def segm2json(dataset, results):
json_results = []
for idx in range(len(dataset)):
img_id = dataset.img_ids[idx]
det, seg = results[idx]
for label in range(len(det)):
bboxes = det[label]
segms = seg[label]
for i in range(bboxes.shape[0]):
data = dict()
data['image_id'] = img_id
data['bbox'] = xyxy2xywh(bboxes[i])
data['score'] = float(bboxes[i][4])
data['category_id'] = dataset.cat_ids[label]
segms[i]['counts'] = segms[i]['counts'].decode()
data['segmentation'] = segms[i]
json_results.append(data)
return json_results
def results2json(dataset, results, out_file):
if isinstance(results[0], list):
json_results = det2json(dataset, results)
elif isinstance(results[0], tuple):
json_results = segm2json(dataset, results)
elif isinstance(results[0], np.ndarray):
json_results = proposal2json(dataset, results)
else:
raise TypeError('invalid type of results')
mmcv.dump(json_results, out_file)
================================================
FILE: mmdet/core/evaluation/eval_hooks.py
================================================
import os
import os.path as osp
import shutil
import time
import mmcv
import numpy as np
import torch
from mmcv.runner import Hook, obj_from_dict
from mmcv.parallel import scatter, collate
from pycocotools.cocoeval import COCOeval
from torch.utils.data import Dataset
from .coco_utils import results2json, fast_eval_recall
from .mean_ap import eval_map
from mmdet import datasets
class DistEvalHook(Hook):
def __init__(self, dataset, interval=1):
if isinstance(dataset, Dataset):
self.dataset = dataset
elif isinstance(dataset, dict):
self.dataset = obj_from_dict(dataset, datasets,
{'test_mode': True})
else:
raise TypeError(
'dataset must be a Dataset object or a dict, not {}'.format(
type(dataset)))
self.interval = interval
self.lock_dir = None
def _barrier(self, rank, world_size):
"""Due to some issues with `torch.distributed.barrier()`, we have to
implement this ugly barrier function.
"""
if rank == 0:
for i in range(1, world_size):
tmp = osp.join(self.lock_dir, '{}.pkl'.format(i))
while not (osp.exists(tmp)):
time.sleep(1)
for i in range(1, world_size):
tmp = osp.join(self.lock_dir, '{}.pkl'.format(i))
os.remove(tmp)
else:
tmp = osp.join(self.lock_dir, '{}.pkl'.format(rank))
mmcv.dump([], tmp)
while osp.exists(tmp):
time.sleep(1)
def before_run(self, runner):
self.lock_dir = osp.join(runner.work_dir, '.lock_map_hook')
if runner.rank == 0:
if osp.exists(self.lock_dir):
shutil.rmtree(self.lock_dir)
mmcv.mkdir_or_exist(self.lock_dir)
def after_run(self, runner):
if runner.rank == 0:
shutil.rmtree(self.lock_dir)
def after_train_epoch(self, runner):
if not self.every_n_epochs(runner, self.interval):
return
runner.model.eval()
results = [None for _ in range(len(self.dataset))]
prog_bar = mmcv.ProgressBar(len(self.dataset))
for idx in range(runner.rank, len(self.dataset), runner.world_size):
data = self.dataset[idx]
data_gpu = scatter(
collate([data], samples_per_gpu=1),
[torch.cuda.current_device()])[0]
# compute output
with torch.no_grad():
result = runner.model(
return_loss=False, rescale=True, **data_gpu)
results[idx] = result
batch_size = runner.world_size
for _ in range(batch_size):
prog_bar.update()
if runner.rank == 0:
print('\n')
self._barrier(runner.rank, runner.world_size)
for i in range(1, runner.world_size):
tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i))
tmp_results = mmcv.load(tmp_file)
for idx in range(i, len(results), runner.world_size):
results[idx] = tmp_results[idx]
os.remove(tmp_file)
self.evaluate(runner, results)
else:
tmp_file = osp.join(runner.work_dir,
'temp_{}.pkl'.format(runner.rank))
mmcv.dump(results, tmp_file)
self._barrier(runner.rank, runner.world_size)
self._barrier(runner.rank, runner.world_size)
def evaluate(self):
raise NotImplementedError
class DistEvalmAPHook(DistEvalHook):
def evaluate(self, runner, results):
gt_bboxes = []
gt_labels = []
gt_ignore = [] if self.dataset.with_crowd else None
for i in range(len(self.dataset)):
ann = self.dataset.get_ann_info(i)
bboxes = ann['bboxes']
labels = ann['labels']
if gt_ignore is not None:
ignore = np.concatenate([
np.zeros(bboxes.shape[0], dtype=np.bool),
np.ones(ann['bboxes_ignore'].shape[0], dtype=np.bool)
])
gt_ignore.append(ignore)
bboxes = np.vstack([bboxes, ann['bboxes_ignore']])
labels = np.concatenate([labels, ann['labels_ignore']])
gt_bboxes.append(bboxes)
gt_labels.append(labels)
# If the dataset is VOC2007, then use 11 points mAP evaluation.
if hasattr(self.dataset, 'year') and self.dataset.year == 2007:
ds_name = 'voc07'
else:
ds_name = self.dataset.CLASSES
mean_ap, eval_results = eval_map(
results,
gt_bboxes,
gt_labels,
gt_ignore=gt_ignore,
scale_ranges=None,
iou_thr=0.5,
dataset=ds_name,
print_summary=True)
runner.log_buffer.output['mAP'] = mean_ap
runner.log_buffer.ready = True
class CocoDistEvalRecallHook(DistEvalHook):
def __init__(self,
dataset,
proposal_nums=(100, 300, 1000),
iou_thrs=np.arange(0.5, 0.96, 0.05)):
super(CocoDistEvalRecallHook, self).__init__(dataset)
self.proposal_nums = np.array(proposal_nums, dtype=np.int32)
self.iou_thrs = np.array(iou_thrs, dtype=np.float32)
def evaluate(self, runner, results):
# the official coco evaluation is too slow, here we use our own
# implementation instead, which may get slightly different results
ar = fast_eval_recall(results, self.dataset.coco, self.proposal_nums,
self.iou_thrs)
for i, num in enumerate(self.proposal_nums):
runner.log_buffer.output['AR@{}'.format(num)] = ar[i]
runner.log_buffer.ready = True
class CocoDistEvalmAPHook(DistEvalHook):
def evaluate(self, runner, results):
tmp_file = osp.join(runner.work_dir, 'temp_0.json')
results2json(self.dataset, results, tmp_file)
res_types = ['bbox',
'segm'] if runner.model.module.with_mask else ['bbox']
cocoGt = self.dataset.coco
cocoDt = cocoGt.loadRes(tmp_file)
imgIds = cocoGt.getImgIds()
for res_type in res_types:
iou_type = res_type
cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
cocoEval.params.imgIds = imgIds
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
field = '{}_mAP'.format(res_type)
runner.log_buffer.output[field] = cocoEval.stats[0]
runner.log_buffer.ready = True
os.remove(tmp_file)
================================================
FILE: mmdet/core/evaluation/mean_ap.py
================================================
import mmcv
import numpy as np
from terminaltables import AsciiTable
from .bbox_overlaps import bbox_overlaps
from .class_names import get_classes
def average_precision(recalls, precisions, mode='area'):
"""Calculate average precision (for single or multiple scales).
Args:
recalls (ndarray): shape (num_scales, num_dets) or (num_dets, )
precisions (ndarray): shape (num_scales, num_dets) or (num_dets, )
mode (str): 'area' or '11points', 'area' means calculating the area
under precision-recall curve, '11points' means calculating
the average precision of recalls at [0, 0.1, ..., 1]
Returns:
float or ndarray: calculated average precision
"""
no_scale = False
if recalls.ndim == 1:
no_scale = True
recalls = recalls[np.newaxis, :]
precisions = precisions[np.newaxis, :]
assert recalls.shape == precisions.shape and recalls.ndim == 2
num_scales = recalls.shape[0]
ap = np.zeros(num_scales, dtype=np.float32)
if mode == 'area':
zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
ones = np.ones((num_scales, 1), dtype=recalls.dtype)
mrec = np.hstack((zeros, recalls, ones))
mpre = np.hstack((zeros, precisions, zeros))
for i in range(mpre.shape[1] - 1, 0, -1):
mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
for i in range(num_scales):
ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
ap[i] = np.sum(
(mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
elif mode == '11points':
for i in range(num_scales):
for thr in np.arange(0, 1 + 1e-3, 0.1):
precs = precisions[i, recalls[i, :] >= thr]
prec = precs.max() if precs.size > 0 else 0
ap[i] += prec
ap /= 11
else:
raise ValueError(
'Unrecognized mode, only "area" and "11points" are supported')
if no_scale:
ap = ap[0]
return ap
def tpfp_imagenet(det_bboxes,
gt_bboxes,
gt_ignore,
default_iou_thr,
area_ranges=None):
"""Check if detected bboxes are true positive or false positive.
Args:
det_bbox (ndarray): the detected bbox
gt_bboxes (ndarray): ground truth bboxes of this image
gt_ignore (ndarray): indicate if gts are ignored for evaluation or not
default_iou_thr (float): the iou thresholds for medium and large bboxes
area_ranges (list or None): gt bbox area ranges
Returns:
tuple: two arrays (tp, fp) whose elements are 0 and 1
"""
num_dets = det_bboxes.shape[0]
num_gts = gt_bboxes.shape[0]
if area_ranges is None:
area_ranges = [(None, None)]
num_scales = len(area_ranges)
# tp and fp are of shape (num_scales, num_gts), each row is tp or fp
# of a certain scale.
tp = np.zeros((num_scales, num_dets), dtype=np.float32)
fp = np.zeros((num_scales, num_dets), dtype=np.float32)
if gt_bboxes.shape[0] == 0:
if area_ranges == [(None, None)]:
fp[...] = 1
else:
det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0] + 1) * (
det_bboxes[:, 3] - det_bboxes[:, 1] + 1)
for i, (min_area, max_area) in enumerate(area_ranges):
fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
return tp, fp
ious = bbox_overlaps(det_bboxes, gt_bboxes - 1)
gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1
gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1
iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)),
default_iou_thr)
# sort all detections by scores in descending order
sort_inds = np.argsort(-det_bboxes[:, -1])
for k, (min_area, max_area) in enumerate(area_ranges):
gt_covered = np.zeros(num_gts, dtype=bool)
# if no area range is specified, gt_area_ignore is all False
if min_area is None:
gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool)
else:
gt_areas = gt_w * gt_h
gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
for i in sort_inds:
max_iou = -1
matched_gt = -1
# find best overlapped available gt
for j in range(num_gts):
# different from PASCAL VOC: allow finding other gts if the
# best overlaped ones are already matched by other det bboxes
if gt_covered[j]:
continue
elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou:
max_iou = ious[i, j]
matched_gt = j
# there are 4 cases for a det bbox:
# 1. it matches a gt, tp = 1, fp = 0
# 2. it matches an ignored gt, tp = 0, fp = 0
# 3. it matches no gt and within area range, tp = 0, fp = 1
# 4. it matches no gt but is beyond area range, tp = 0, fp = 0
if matched_gt >= 0:
gt_covered[matched_gt] = 1
if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]):
tp[k, i] = 1
elif min_area is None:
fp[k, i] = 1
else:
bbox = det_bboxes[i, :4]
area = (bbox[2] - bbox[0] + 1) * (bbox[3] - bbox[1] + 1)
if area >= min_area and area < max_area:
fp[k, i] = 1
return tp, fp
def tpfp_default(det_bboxes, gt_bboxes, gt_ignore, iou_thr, area_ranges=None):
"""Check if detected bboxes are true positive or false positive.
Args:
det_bbox (ndarray): the detected bbox
gt_bboxes (ndarray): ground truth bboxes of this image
gt_ignore (ndarray): indicate if gts are ignored for evaluation or not
iou_thr (float): the iou thresholds
Returns:
tuple: (tp, fp), two arrays whose elements are 0 and 1
"""
num_dets = det_bboxes.shape[0]
num_gts = gt_bboxes.shape[0]
if area_ranges is None:
area_ranges = [(None, None)]
num_scales = len(area_ranges)
# tp and fp are of shape (num_scales, num_gts), each row is tp or fp of
# a certain scale
tp = np.zeros((num_scales, num_dets), dtype=np.float32)
fp = np.zeros((num_scales, num_dets), dtype=np.float32)
# if there is no gt bboxes in this image, then all det bboxes
# within area range are false positives
if gt_bboxes.shape[0] == 0:
if area_ranges == [(None, None)]:
fp[...] = 1
else:
det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0] + 1) * (
det_bboxes[:, 3] - det_bboxes[:, 1] + 1)
for i, (min_area, max_area) in enumerate(area_ranges):
fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
return tp, fp
ious = bbox_overlaps(det_bboxes, gt_bboxes)
ious_max = ious.max(axis=1)
ious_argmax = ious.argmax(axis=1)
sort_inds = np.argsort(-det_bboxes[:, -1])
for k, (min_area, max_area) in enumerate(area_ranges):
gt_covered = np.zeros(num_gts, dtype=bool)
# if no area range is specified, gt_area_ignore is all False
if min_area is None:
gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool)
else:
gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) * (
gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1)
gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
for i in sort_inds:
if ious_max[i] >= iou_thr:
matched_gt = ious_argmax[i]
if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]):
if not gt_covered[matched_gt]:
gt_covered[matched_gt] = True
tp[k, i] = 1
else:
fp[k, i] = 1
# otherwise ignore this detected bbox, tp = 0, fp = 0
elif min_area is None:
fp[k, i] = 1
else:
bbox = det_bboxes[i, :4]
area = (bbox[2] - bbox[0] + 1) * (bbox[3] - bbox[1] + 1)
if area >= min_area and area < max_area:
fp[k, i] = 1
return tp, fp
def get_cls_results(det_results, gt_bboxes, gt_labels, gt_ignore, class_id):
"""Get det results and gt information of a certain class."""
cls_dets = [det[class_id]
for det in det_results] # det bboxes of this class
cls_gts = [] # gt bboxes of this class
cls_gt_ignore = []
for j in range(len(gt_bboxes)):
gt_bbox = gt_bboxes[j]
cls_inds = (gt_labels[j] == class_id + 1)
cls_gt = gt_bbox[cls_inds, :] if gt_bbox.shape[0] > 0 else gt_bbox
cls_gts.append(cls_gt)
if gt_ignore is None:
cls_gt_ignore.append(np.zeros(cls_gt.shape[0], dtype=np.int32))
else:
cls_gt_ignore.append(gt_ignore[j][cls_inds])
return cls_dets, cls_gts, cls_gt_ignore
def eval_map(det_results,
gt_bboxes,
gt_labels,
gt_ignore=None,
scale_ranges=None,
iou_thr=0.5,
dataset=None,
print_summary=True):
"""Evaluate mAP of a dataset.
Args:
det_results (list): a list of list, [[cls1_det, cls2_det, ...], ...]
gt_bboxes (list): ground truth bboxes of each image, a list of K*4
array.
gt_labels (list): ground truth labels of each image, a list of K array
gt_ignore (list): gt ignore indicators of each image, a list of K array
scale_ranges (list, optional): [(min1, max1), (min2, max2), ...]
iou_thr (float): IoU threshold
dataset (None or str or list): dataset name or dataset classes, there
are minor differences in metrics for different datsets, e.g.
"voc07", "imagenet_det", etc.
print_summary (bool): whether to print the mAP summary
Returns:
tuple: (mAP, [dict, dict, ...])
"""
assert len(det_results) == len(gt_bboxes) == len(gt_labels)
if gt_ignore is not None:
assert len(gt_ignore) == len(gt_labels)
for i in range(len(gt_ignore)):
assert len(gt_labels[i]) == len(gt_ignore[i])
area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges]
if scale_ranges is not None else None)
num_scales = len(scale_ranges) if scale_ranges is not None else 1
eval_results = []
num_classes = len(det_results[0]) # positive class num
gt_labels = [
label if label.ndim == 1 else label[:, 0] for label in gt_labels
]
for i in range(num_classes):
# get gt and det bboxes of this class
cls_dets, cls_gts, cls_gt_ignore = get_cls_results(
det_results, gt_bboxes, gt_labels, gt_ignore, i)
# calculate tp and fp for each image
tpfp_func = (tpfp_imagenet
if dataset in ['det', 'vid'] else tpfp_default)
tpfp = [
tpfp_func(cls_dets[j], cls_gts[j], cls_gt_ignore[j], iou_thr,
area_ranges) for j in range(len(cls_dets))
]
tp, fp = tuple(zip(*tpfp))
# calculate gt number of each scale, gts ignored or beyond scale
# are not counted
num_gts = np.zeros(num_scales, dtype=int)
for j, bbox in enumerate(cls_gts):
if area_ranges is None:
num_gts[0] += np.sum(np.logical_not(cls_gt_ignore[j]))
else:
gt_areas = (bbox[:, 2] - bbox[:, 0] + 1) * (
bbox[:, 3] - bbox[:, 1] + 1)
for k, (min_area, max_area) in enumerate(area_ranges):
num_gts[k] += np.sum(
np.logical_not(cls_gt_ignore[j]) &
(gt_areas >= min_area) & (gt_areas < max_area))
# sort all det bboxes by score, also sort tp and fp
cls_dets = np.vstack(cls_dets)
num_dets = cls_dets.shape[0]
sort_inds = np.argsort(-cls_dets[:, -1])
tp = np.hstack(tp)[:, sort_inds]
fp = np.hstack(fp)[:, sort_inds]
# calculate recall and precision with tp and fp
tp = np.cumsum(tp, axis=1)
fp = np.cumsum(fp, axis=1)
eps = np.finfo(np.float32).eps
recalls = tp / np.maximum(num_gts[:, np.newaxis], eps)
precisions = tp / np.maximum((tp + fp), eps)
# calculate AP
if scale_ranges is None:
recalls = recalls[0, :]
precisions = precisions[0, :]
num_gts = num_gts.item()
mode = 'area' if dataset != 'voc07' else '11points'
ap = average_precision(recalls, precisions, mode)
eval_results.append({
'num_gts': num_gts,
'num_dets': num_dets,
'recall': recalls,
'precision': precisions,
'ap': ap
})
if scale_ranges is not None:
# shape (num_classes, num_scales)
all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results])
all_num_gts = np.vstack(
[cls_result['num_gts'] for cls_result in eval_results])
mean_ap = [
all_ap[all_num_gts[:, i] > 0, i].mean()
if np.any(all_num_gts[:, i] > 0) else 0.0
for i in range(num_scales)
]
else:
aps = []
for cls_result in eval_results:
if cls_result['num_gts'] > 0:
aps.append(cls_result['ap'])
mean_ap = np.array(aps).mean().item() if aps else 0.0
if print_summary:
print_map_summary(mean_ap, eval_results, dataset)
return mean_ap, eval_results
def print_map_summary(mean_ap, results, dataset=None):
"""Print mAP and results of each class.
Args:
mean_ap(float): calculated from `eval_map`
results(list): calculated from `eval_map`
dataset(None or str or list): dataset name or dataset classes.
"""
num_scales = len(results[0]['ap']) if isinstance(results[0]['ap'],
np.ndarray) else 1
num_classes = len(results)
recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
precisions = np.zeros((num_scales, num_classes), dtype=np.float32)
aps = np.zeros((num_scales, num_classes), dtype=np.float32)
num_gts = np.zeros((num_scales, num_classes), dtype=int)
for i, cls_result in enumerate(results):
if cls_result['recall'].size > 0:
recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1]
precisions[:, i] = np.array(
cls_result['precision'], ndmin=2)[:, -1]
aps[:, i] = cls_result['ap']
num_gts[:, i] = cls_result['num_gts']
if dataset is None:
label_names = [str(i) for i in range(1, num_classes + 1)]
elif mmcv.is_str(dataset):
label_names = get_classes(dataset)
else:
label_names = dataset
if not isinstance(mean_ap, list):
mean_ap = [mean_ap]
header = ['class', 'gts', 'dets', 'recall', 'precision', 'ap']
for i in range(num_scales):
table_data = [header]
for j in range(num_classes):
row_data = [
label_names[j], num_gts[i, j], results[j]['num_dets'],
'{:.3f}'.format(recalls[i, j]), '{:.3f}'.format(
precisions[i, j]), '{:.3f}'.format(aps[i, j])
]
table_data.append(row_data)
table_data.append(['mAP', '', '', '', '', '{:.3f}'.format(mean_ap[i])])
table = AsciiTable(table_data)
table.inner_footing_row_border = True
print(table.table)
================================================
FILE: mmdet/core/evaluation/recall.py
================================================
import numpy as np
from terminaltables import AsciiTable
from .bbox_overlaps import bbox_overlaps
def _recalls(all_ious, proposal_nums, thrs):
img_num = all_ious.shape[0]
total_gt_num = sum([ious.shape[0] for ious in all_ious])
_ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32)
for k, proposal_num in enumerate(proposal_nums):
tmp_ious = np.zeros(0)
for i in range(img_num):
ious = all_ious[i][:, :proposal_num].copy()
gt_ious = np.zeros((ious.shape[0]))
if ious.size == 0:
tmp_ious = np.hstack((tmp_ious, gt_ious))
continue
for j in range(ious.shape[0]):
gt_max_overlaps = ious.argmax(axis=1)
max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps]
gt_idx = max_ious.argmax()
gt_ious[j] = max_ious[gt_idx]
box_idx = gt_max_overlaps[gt_idx]
ious[gt_idx, :] = -1
ious[:, box_idx] = -1
tmp_ious = np.hstack((tmp_ious, gt_ious))
_ious[k, :] = tmp_ious
_ious = np.fliplr(np.sort(_ious, axis=1))
recalls = np.zeros((proposal_nums.size, thrs.size))
for i, thr in enumerate(thrs):
recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num)
return recalls
def set_recall_param(proposal_nums, iou_thrs):
"""Check proposal_nums and iou_thrs and set correct format.
"""
if isinstance(proposal_nums, list):
_proposal_nums = np.array(proposal_nums)
elif isinstance(proposal_nums, int):
_proposal_nums = np.array([proposal_nums])
else:
_proposal_nums = proposal_nums
if iou_thrs is None:
_iou_thrs = np.array([0.5])
elif isinstance(iou_thrs, list):
_iou_thrs = np.array(iou_thrs)
elif isinstance(iou_thrs, float):
_iou_thrs = np.array([iou_thrs])
else:
_iou_thrs = iou_thrs
return _proposal_nums, _iou_thrs
def eval_recalls(gts,
proposals,
proposal_nums=None,
iou_thrs=None,
print_summary=True):
"""Calculate recalls.
Args:
gts(list or ndarray): a list of arrays of shape (n, 4)
proposals(list or ndarray): a list of arrays of shape (k, 4) or (k, 5)
proposal_nums(int or list of int or ndarray): top N proposals
thrs(float or list or ndarray): iou thresholds
Returns:
ndarray: recalls of different ious and proposal nums
"""
img_num = len(gts)
assert img_num == len(proposals)
proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs)
all_ious = []
for i in range(img_num):
if proposals[i].ndim == 2 and proposals[i].shape[1] == 5:
scores = proposals[i][:, 4]
sort_idx = np.argsort(scores)[::-1]
img_proposal = proposals[i][sort_idx, :]
else:
img_proposal = proposals[i]
prop_num = min(img_proposal.shape[0], proposal_nums[-1])
if gts[i] is None or gts[i].shape[0] == 0:
ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32)
else:
ious = bbox_overlaps(gts[i], img_proposal[:prop_num, :4])
all_ious.append(ious)
all_ious = np.array(all_ious)
recalls = _recalls(all_ious, proposal_nums, iou_thrs)
if print_summary:
print_recall_summary(recalls, proposal_nums, iou_thrs)
return recalls
def print_recall_summary(recalls,
proposal_nums,
iou_thrs,
row_idxs=None,
col_idxs=None):
"""Print recalls in a table.
Args:
recalls(ndarray): calculated from `bbox_recalls`
proposal_nums(ndarray or list): top N proposals
iou_thrs(ndarray or list): iou thresholds
row_idxs(ndarray): which rows(proposal nums) to print
col_idxs(ndarray): which cols(iou thresholds) to print
"""
proposal_nums = np.array(proposal_nums, dtype=np.int32)
iou_thrs = np.array(iou_thrs)
if row_idxs is None:
row_idxs = np.arange(proposal_nums.size)
if col_idxs is None:
col_idxs = np.arange(iou_thrs.size)
row_header = [''] + iou_thrs[col_idxs].tolist()
table_data = [row_header]
for i, num in enumerate(proposal_nums[row_idxs]):
row = [
'{:.3f}'.format(val)
for val in recalls[row_idxs[i], col_idxs].tolist()
]
row.insert(0, num)
table_data.append(row)
table = AsciiTable(table_data)
print(table.table)
def plot_num_recall(recalls, proposal_nums):
"""Plot Proposal_num-Recalls curve.
Args:
recalls(ndarray or list): shape (k,)
proposal_nums(ndarray or list): same shape as `recalls`
"""
if isinstance(proposal_nums, np.ndarray):
_proposal_nums = proposal_nums.tolist()
else:
_proposal_nums = proposal_nums
if isinstance(recalls, np.ndarray):
_recalls = recalls.tolist()
else:
_recalls = recalls
import matplotlib.pyplot as plt
f = plt.figure()
plt.plot([0] + _proposal_nums, [0] + _recalls)
plt.xlabel('Proposal num')
plt.ylabel('Recall')
plt.axis([0, proposal_nums.max(), 0, 1])
f.show()
def plot_iou_recall(recalls, iou_thrs):
"""Plot IoU-Recalls curve.
Args:
recalls(ndarray or list): shape (k,)
iou_thrs(ndarray or list): same shape as `recalls`
"""
if isinstance(iou_thrs, np.ndarray):
_iou_thrs = iou_thrs.tolist()
else:
_iou_thrs = iou_thrs
if isinstance(recalls, np.ndarray):
_recalls = recalls.tolist()
else:
_recalls = recalls
import matplotlib.pyplot as plt
f = plt.figure()
plt.plot(_iou_thrs + [1.0], _recalls + [0.])
plt.xlabel('IoU')
plt.ylabel('Recall')
plt.axis([iou_thrs.min(), 1, 0, 1])
f.show()
================================================
FILE: mmdet/core/loss/__init__.py
================================================
from .losses import (weighted_nll_loss, weighted_cross_entropy,
weighted_binary_cross_entropy, sigmoid_focal_loss,
weighted_sigmoid_focal_loss, mask_cross_entropy,
smooth_l1_loss, weighted_smoothl1, accuracy)
__all__ = [
'weighted_nll_loss', 'weighted_cross_entropy',
'weighted_binary_cross_entropy', 'sigmoid_focal_loss',
'weighted_sigmoid_focal_loss', 'mask_cross_entropy', 'smooth_l1_loss',
'weighted_smoothl1', 'accuracy'
]
================================================
FILE: mmdet/core/loss/losses.py
================================================
# TODO merge naive and weighted loss.
import torch
import torch.nn.functional as F
def weighted_nll_loss(pred, label, weight, avg_factor=None):
if avg_factor is None:
avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
raw = F.nll_loss(pred, label, reduction='none')
return torch.sum(raw * weight)[None] / avg_factor
def weighted_cross_entropy(pred, label, weight, avg_factor=None, reduce=True):
if avg_factor is None:
avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
raw = F.cross_entropy(pred, label, reduction='none')
if reduce:
return torch.sum(raw * weight)[None] / avg_factor
else:
return raw * weight / avg_factor
def weighted_binary_cross_entropy(pred, label, weight, avg_factor=None):
if avg_factor is None:
avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
return F.binary_cross_entropy_with_logits(
pred, label.float(), weight.float(),
reduction='sum')[None] / avg_factor
def sigmoid_focal_loss(pred,
target,
weight,
gamma=2.0,
alpha=0.25,
reduction='elementwise_mean'):
pred_sigmoid = pred.sigmoid()
target = target.type_as(pred)
pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target)
weight = (alpha * target + (1 - alpha) * (1 - target)) * weight
weight = weight * pt.pow(gamma)
return F.binary_cross_entropy_with_logits(
pred, target, weight, reduction=reduction)
def weighted_sigmoid_focal_loss(pred,
target,
weight,
gamma=2.0,
alpha=0.25,
avg_factor=None,
num_classes=80):
if avg_factor is None:
avg_factor = torch.sum(weight > 0).float().item() / num_classes + 1e-6
return sigmoid_focal_loss(
pred, target, weight, gamma=gamma, alpha=alpha,
reduction='sum')[None] / avg_factor
def mask_cross_entropy(pred, target, label):
num_rois = pred.size()[0]
inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device)
pred_slice = pred[inds, label].squeeze(1)
return F.binary_cross_entropy_with_logits(
pred_slice, target, reduction='elementwise_mean')[None]
def smooth_l1_loss(pred, target, beta=1.0, reduction='elementwise_mean'):
assert beta > 0
assert pred.size() == target.size() and target.numel() > 0
diff = torch.abs(pred - target)
loss = torch.where(diff < beta, 0.5 * diff * diff / beta,
diff - 0.5 * beta)
reduction = F._Reduction.get_enum(reduction)
# none: 0, elementwise_mean:1, sum: 2
if reduction == 0:
return loss
elif reduction == 1:
return loss.sum() / pred.numel()
elif reduction == 2:
return loss.sum()
def weighted_smoothl1(pred, target, weight, beta=1.0, avg_factor=None):
if avg_factor is None:
avg_factor = torch.sum(weight > 0).float().item() / 4 + 1e-6
loss = smooth_l1_loss(pred, target, beta, reduction='none')
return torch.sum(loss * weight)[None] / avg_factor
def accuracy(pred, target, topk=1):
if isinstance(topk, int):
topk = (topk, )
return_single = True
else:
return_single = False
maxk = max(topk)
_, pred_label = pred.topk(maxk, 1, True, True)
pred_label = pred_label.t()
correct = pred_label.eq(target.view(1, -1).expand_as(pred_label))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / pred.size(0)))
return res[0] if return_single else res
================================================
FILE: mmdet/core/mask/__init__.py
================================================
from .utils import split_combined_polys
from .mask_target import mask_target
__all__ = ['split_combined_polys', 'mask_target']
================================================
FILE: mmdet/core/mask/mask_target.py
================================================
import torch
import numpy as np
import mmcv
def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list,
cfg):
cfg_list = [cfg for _ in range(len(pos_proposals_list))]
mask_targets = map(mask_target_single, pos_proposals_list,
pos_assigned_gt_inds_list, gt_masks_list, cfg_list)
mask_targets = torch.cat(list(mask_targets))
return mask_targets
def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg):
mask_size = cfg.mask_size
num_pos = pos_proposals.size(0)
mask_targets = []
if num_pos > 0:
proposals_np = pos_proposals.cpu().numpy()
pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy()
for i in range(num_pos):
gt_mask = gt_masks[pos_assigned_gt_inds[i]]
bbox = proposals_np[i, :].astype(np.int32)
x1, y1, x2, y2 = bbox
w = np.maximum(x2 - x1 + 1, 1)
h = np.maximum(y2 - y1 + 1, 1)
# mask is uint8 both before and after resizing
target = mmcv.imresize(gt_mask[y1:y1 + h, x1:x1 + w],
(mask_size, mask_size))
mask_targets.append(target)
mask_targets = torch.from_numpy(np.stack(mask_targets)).float().to(
pos_proposals.device)
else:
mask_targets = pos_proposals.new_zeros((0, mask_size, mask_size))
return mask_targets
================================================
FILE: mmdet/core/mask/utils.py
================================================
import mmcv
def split_combined_polys(polys, poly_lens, polys_per_mask):
"""Split the combined 1-D polys into masks.
A mask is represented as a list of polys, and a poly is represented as
a 1-D array. In dataset, all masks are concatenated into a single 1-D
tensor. Here we need to split the tensor into original representations.
Args:
polys (list): a list (length = image num) of 1-D tensors
poly_lens (list): a list (length = image num) of poly length
polys_per_mask (list): a list (length = image num) of poly number
of each mask
Returns:
list: a list (length = image num) of list (length = mask num) of
list (length = poly num) of numpy array
"""
mask_polys_list = []
for img_id in range(len(polys)):
polys_single = polys[img_id]
polys_lens_single = poly_lens[img_id].tolist()
polys_per_mask_single = polys_per_mask[img_id].tolist()
split_polys = mmcv.slice_list(polys_single, polys_lens_single)
mask_polys = mmcv.slice_list(split_polys, polys_per_mask_single)
mask_polys_list.append(mask_polys)
return mask_polys_list
================================================
FILE: mmdet/core/post_processing/__init__.py
================================================
from .bbox_nms import multiclass_nms
from .merge_augs import (merge_aug_proposals, merge_aug_bboxes,
merge_aug_scores, merge_aug_masks)
__all__ = [
'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes',
'merge_aug_scores', 'merge_aug_masks'
]
================================================
FILE: mmdet/core/post_processing/bbox_nms.py
================================================
import torch
from mmdet.ops.nms import nms_wrapper
def multiclass_nms(multi_bboxes, multi_scores, score_thr, nms_cfg, max_num=-1):
"""NMS for multi-class bboxes.
Args:
multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
multi_scores (Tensor): shape (n, #class)
score_thr (float): bbox threshold, bboxes with scores lower than it
will not be considered.
nms_thr (float): NMS IoU threshold
max_num (int): if there are more than max_num bboxes after NMS,
only top max_num will be kept.
Returns:
tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels
are 0-based.
"""
num_classes = multi_scores.shape[1]
bboxes, labels = [], []
nms_cfg_ = nms_cfg.copy()
nms_type = nms_cfg_.pop('type', 'nms')
nms_op = getattr(nms_wrapper, nms_type)
for i in range(1, num_classes):
cls_inds = multi_scores[:, i] > score_thr
if not cls_inds.any():
continue
# get bboxes and scores of this class
if multi_bboxes.shape[1] == 4:
_bboxes = multi_bboxes[cls_inds, :]
else:
_bboxes = multi_bboxes[cls_inds, i * 4:(i + 1) * 4]
_scores = multi_scores[cls_inds, i]
cls_dets = torch.cat([_bboxes, _scores[:, None]], dim=1)
cls_dets, _ = nms_op(cls_dets, **nms_cfg_)
cls_labels = multi_bboxes.new_full(
(cls_dets.shape[0], ), i - 1, dtype=torch.long)
bboxes.append(cls_dets)
labels.append(cls_labels)
if bboxes:
bboxes = torch.cat(bboxes)
labels = torch.cat(labels)
if bboxes.shape[0] > max_num:
_, inds = bboxes[:, -1].sort(descending=True)
inds = inds[:max_num]
bboxes = bboxes[inds]
labels = labels[inds]
else:
bboxes = multi_bboxes.new_zeros((0, 5))
labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
return bboxes, labels
================================================
FILE: mmdet/core/post_processing/merge_augs.py
================================================
import torch
import numpy as np
from mmdet.ops import nms
from ..bbox import bbox_mapping_back
def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg):
"""Merge augmented proposals (multiscale, flip, etc.)
Args:
aug_proposals (list[Tensor]): proposals from different testing
schemes, shape (n, 5). Note that they are not rescaled to the
original image size.
img_metas (list[dict]): image info including "shape_scale" and "flip".
rpn_test_cfg (dict): rpn test config.
Returns:
Tensor: shape (n, 4), proposals corresponding to original image scale.
"""
recovered_proposals = []
for proposals, img_info in zip(aug_proposals, img_metas):
img_shape = img_info['img_shape']
scale_factor = img_info['scale_factor']
flip = img_info['flip']
_proposals = proposals.clone()
_proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape,
scale_factor, flip)
recovered_proposals.append(_proposals)
aug_proposals = torch.cat(recovered_proposals, dim=0)
merged_proposals, _ = nms(aug_proposals, rpn_test_cfg.nms_thr)
scores = merged_proposals[:, 4]
_, order = scores.sort(0, descending=True)
num = min(rpn_test_cfg.max_num, merged_proposals.shape[0])
order = order[:num]
merged_proposals = merged_proposals[order, :]
return merged_proposals
def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg):
"""Merge augmented detection bboxes and scores.
Args:
aug_bboxes (list[Tensor]): shape (n, 4*#class)
aug_scores (list[Tensor] or None): shape (n, #class)
img_shapes (list[Tensor]): shape (3, ).
rcnn_test_cfg (dict): rcnn test config.
Returns:
tuple: (bboxes, scores)
"""
recovered_bboxes = []
for bboxes, img_info in zip(aug_bboxes, img_metas):
img_shape = img_info[0]['img_shape']
scale_factor = img_info[0]['scale_factor']
flip = img_info[0]['flip']
bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip)
recovered_bboxes.append(bboxes)
bboxes = torch.stack(recovered_bboxes).mean(dim=0)
if aug_scores is None:
return bboxes
else:
scores = torch.stack(aug_scores).mean(dim=0)
return bboxes, scores
def merge_aug_scores(aug_scores):
"""Merge augmented bbox scores."""
if isinstance(aug_scores[0], torch.Tensor):
return torch.mean(torch.stack(aug_scores), dim=0)
else:
return np.mean(aug_scores, axis=0)
def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None):
"""Merge augmented mask prediction.
Args:
aug_masks (list[ndarray]): shape (n, #class, h, w)
img_shapes (list[ndarray]): shape (3, ).
rcnn_test_cfg (dict): rcnn test config.
Returns:
tuple: (bboxes, scores)
"""
recovered_masks = [
mask if not img_info[0]['flip'] else mask[..., ::-1]
for mask, img_info in zip(aug_masks, img_metas)
]
if weights is None:
merged_masks = np.mean(recovered_masks, axis=0)
else:
merged_masks = np.average(
np.array(recovered_masks), axis=0, weights=np.array(weights))
return merged_masks
================================================
FILE: mmdet/core/utils/__init__.py
================================================
from .dist_utils import allreduce_grads, DistOptimizerHook
from .misc import tensor2imgs, unmap, multi_apply
__all__ = [
'allreduce_grads', 'DistOptimizerHook', 'tensor2imgs', 'unmap',
'multi_apply'
]
================================================
FILE: mmdet/core/utils/dist_utils.py
================================================
from collections import OrderedDict
import torch.distributed as dist
from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors,
_take_tensors)
from mmcv.runner import OptimizerHook
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
if bucket_size_mb > 0:
bucket_size_bytes = bucket_size_mb * 1024 * 1024
buckets = _take_tensors(tensors, bucket_size_bytes)
else:
buckets = OrderedDict()
for tensor in tensors:
tp = tensor.type()
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(tensor)
buckets = buckets.values()
for bucket in buckets:
flat_tensors = _flatten_dense_tensors(bucket)
dist.all_reduce(flat_tensors)
flat_tensors.div_(world_size)
for tensor, synced in zip(
bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
tensor.copy_(synced)
def allreduce_grads(model, coalesce=True, bucket_size_mb=-1):
grads = [
param.grad.data for param in model.parameters()
if param.requires_grad and param.grad is not None
]
world_size = dist.get_world_size()
if coalesce:
_allreduce_coalesced(grads, world_size, bucket_size_mb)
else:
for tensor in grads:
dist.all_reduce(tensor.div_(world_size))
class DistOptimizerHook(OptimizerHook):
def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1):
self.grad_clip = grad_clip
self.coalesce = coalesce
self.bucket_size_mb = bucket_size_mb
def after_train_iter(self, runner):
runner.optimizer.zero_grad()
runner.outputs['loss'].backward()
allreduce_grads(runner.model, self.coalesce, self.bucket_size_mb)
if self.grad_clip is not None:
self.clip_grads(runner.model.parameters())
runner.optimizer.step()
================================================
FILE: mmdet/core/utils/misc.py
================================================
from functools import partial
import mmcv
import numpy as np
from six.moves import map, zip
def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True):
num_imgs = tensor.size(0)
mean = np.array(mean, dtype=np.float32)
std = np.array(std, dtype=np.float32)
imgs = []
for img_id in range(num_imgs):
img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0)
img = mmcv.imdenormalize(
img, mean, std, to_bgr=to_rgb).astype(np.uint8)
imgs.append(np.ascontiguousarray(img))
return imgs
def multi_apply(func, *args, **kwargs):
pfunc = partial(func, **kwargs) if kwargs else func
map_results = map(pfunc, *args)
return tuple(map(list, zip(*map_results)))
def unmap(data, count, inds, fill=0):
""" Unmap a subset of item (data) back to the original set of items (of
size count) """
if data.dim() == 1:
ret = data.new_full((count, ), fill)
ret[inds] = data
else:
new_size = (count, ) + data.size()[1:]
ret = data.new_full(new_size, fill)
ret[inds, :] = data
return ret
================================================
FILE: mmdet/datasets/__init__.py
================================================
from .custom import CustomDataset
from .xml_style import XMLDataset
from .coco import CocoDataset
from .voc import VOCDataset
from .loader import GroupSampler, DistributedGroupSampler, build_dataloader
from .utils import to_tensor, random_scale, show_ann, get_dataset
from .concat_dataset import ConcatDataset
from .repeat_dataset import RepeatDataset
from .extra_aug import ExtraAugmentation
__all__ = [
'CustomDataset', 'XMLDataset', 'CocoDataset', 'VOCDataset', 'GroupSampler',
'DistributedGroupSampler', 'build_dataloader', 'to_tensor', 'random_scale',
'show_ann', 'get_dataset', 'ConcatDataset', 'RepeatDataset',
'ExtraAugmentation'
]
================================================
FILE: mmdet/datasets/coco.py
================================================
import numpy as np
from pycocotools.coco import COCO
from .custom import CustomDataset
class CocoDataset(CustomDataset):
CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic_light', 'fire_hydrant',
'stop_sign', 'parking_meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports_ball', 'kite', 'baseball_bat',
'baseball_glove', 'skateboard', 'surfboard', 'tennis_racket',
'bottle', 'wine_glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'hot_dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave',
'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
'vase', 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush')
def load_annotations(self, ann_file):
self.coco = COCO(ann_file)
self.cat_ids = self.coco.getCatIds()
self.cat2label = {
cat_id: i + 1
for i, cat_id in enumerate(self.cat_ids)
}
self.img_ids = self.coco.getImgIds()
img_infos = []
for i in self.img_ids:
info = self.coco.loadImgs([i])[0]
info['filename'] = info['file_name']
img_infos.append(info)
return img_infos
def get_ann_info(self, idx):
img_id = self.img_infos[idx]['id']
ann_ids = self.coco.getAnnIds(imgIds=[img_id])
ann_info = self.coco.loadAnns(ann_ids)
return self._parse_ann_info(ann_info, self.with_mask)
def _filter_imgs(self, min_size=32):
"""Filter images too small or without ground truths."""
valid_inds = []
ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values())
for i, img_info in enumerate(self.img_infos):
if self.img_ids[i] not in ids_with_ann:
continue
if min(img_info['width'], img_info['height']) >= min_size:
valid_inds.append(i)
return valid_inds
def _parse_ann_info(self, ann_info, with_mask=True):
"""Parse bbox and mask annotation.
Args:
ann_info (list[dict]): Annotation info of an image.
with_mask (bool): Whether to parse mask annotations.
Returns:
dict: A dict containing the following keys: bboxes, bboxes_ignore,
labels, masks, mask_polys, poly_lens.
"""
gt_bboxes = []
gt_labels = []
gt_bboxes_ignore = []
# Two formats are provided.
# 1. mask: a binary map of the same size of the image.
# 2. polys: each mask consists of one or several polys, each poly is a
# list of float.
if with_mask:
gt_masks = []
gt_mask_polys = []
gt_poly_lens = []
for i, ann in enumerate(ann_info):
if ann.get('ignore', False):
continue
x1, y1, w, h = ann['bbox']
if ann['area'] <= 0 or w < 1 or h < 1:
continue
bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
if ann['iscrowd']:
gt_bboxes_ignore.append(bbox)
else:
gt_bboxes.append(bbox)
gt_labels.append(self.cat2label[ann['category_id']])
if with_mask:
gt_masks.append(self.coco.annToMask(ann))
mask_polys = [
p for p in ann['segmentation'] if len(p) >= 6
] # valid polygons have >= 3 points (6 coordinates)
poly_lens = [len(p) for p in mask_polys]
gt_mask_polys.append(mask_polys)
gt_poly_lens.extend(poly_lens)
if gt_bboxes:
gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
gt_labels = np.array(gt_labels, dtype=np.int64)
else:
gt_bboxes = np.zeros((0, 4), dtype=np.float32)
gt_labels = np.array([], dtype=np.int64)
if gt_bboxes_ignore:
gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
else:
gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
ann = dict(
bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore)
if with_mask:
ann['masks'] = gt_masks
# poly format is not used in the current implementation
ann['mask_polys'] = gt_mask_polys
ann['poly_lens'] = gt_poly_lens
return ann
================================================
FILE: mmdet/datasets/concat_dataset.py
================================================
import numpy as np
from torch.utils.data.dataset import ConcatDataset as _ConcatDataset
class ConcatDataset(_ConcatDataset):
"""A wrapper of concatenated dataset.
Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but
concat the group flag for image aspect ratio.
Args:
datasets (list[:obj:`Dataset`]): A list of datasets.
"""
def __init__(self, datasets):
super(ConcatDataset, self).__init__(datasets)
self.CLASSES = datasets[0].CLASSES
if hasattr(datasets[0], 'flag'):
flags = []
for i in range(0, len(datasets)):
flags.append(datasets[i].flag)
self.flag = np.concatenate(flags)
================================================
FILE: mmdet/datasets/custom.py
================================================
import os.path as osp
import mmcv
import numpy as np
from mmcv.parallel import DataContainer as DC
from torch.utils.data import Dataset
from .transforms import (ImageTransform, BboxTransform, MaskTransform,
Numpy2Tensor)
from .utils import to_tensor, random_scale
from .extra_aug import ExtraAugmentation
class CustomDataset(Dataset):
"""Custom dataset for detection.
Annotation format:
[
{
'filename': 'a.jpg',
'width': 1280,
'height': 720,
'ann': {
'bboxes': (n, 4),
'labels': (n, ),
'bboxes_ignore': (k, 4),
'labels_ignore': (k, 4) (optional field)
}
},
...
]
The `ann` field is optional for testing.
"""
CLASSES = None
def __init__(self,
ann_file,
img_prefix,
img_scale,
img_norm_cfg,
size_divisor=None,
proposal_file=None,
num_max_proposals=1000,
flip_ratio=0,
with_mask=True,
with_crowd=True,
with_label=True,
extra_aug=None,
resize_keep_ratio=True,
test_mode=False):
# prefix of images path
self.img_prefix = img_prefix
# load annotations (and proposals)
self.img_infos = self.load_annotations(ann_file)
if proposal_file is not None:
self.proposals = self.load_proposals(proposal_file)
else:
self.proposals = None
# filter images with no annotation during training
if not test_mode:
valid_inds = self._filter_imgs()
self.img_infos = [self.img_infos[i] for i in valid_inds]
if self.proposals is not None:
self.proposals = [self.proposals[i] for i in valid_inds]
# (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
self.img_scales = img_scale if isinstance(img_scale,
list) else [img_scale]
assert mmcv.is_list_of(self.img_scales, tuple)
# normalization configs
self.img_norm_cfg = img_norm_cfg
# max proposals per image
self.num_max_proposals = num_max_proposals
# flip ratio
self.flip_ratio = flip_ratio
assert flip_ratio >= 0 and flip_ratio <= 1
# padding border to ensure the image size can be divided by
# size_divisor (used for FPN)
self.size_divisor = size_divisor
# with mask or not (reserved field, takes no effect)
self.with_mask = with_mask
# some datasets provide bbox annotations as ignore/crowd/difficult,
# if `with_crowd` is True, then these info is returned.
self.with_crowd = with_crowd
# with label is False for RPN
self.with_label = with_label
# in test mode or not
self.test_mode = test_mode
# set group flag for the sampler
if not self.test_mode:
self._set_group_flag()
# transforms
self.img_transform = ImageTransform(
size_divisor=self.size_divisor, **self.img_norm_cfg)
self.bbox_transform = BboxTransform()
self.mask_transform = MaskTransform()
self.numpy2tensor = Numpy2Tensor()
# if use extra augmentation
if extra_aug is not None:
self.extra_aug = ExtraAugmentation(**extra_aug)
else:
self.extra_aug = None
# image rescale if keep ratio
self.resize_keep_ratio = resize_keep_ratio
def __len__(self):
return len(self.img_infos)
def load_annotations(self, ann_file):
return mmcv.load(ann_file)
def load_proposals(self, proposal_file):
return mmcv.load(proposal_file)
def get_ann_info(self, idx):
return self.img_infos[idx]['ann']
def _filter_imgs(self, min_size=32):
"""Filter images too small."""
valid_inds = []
for i, img_info in enumerate(self.img_infos):
if min(img_info['width'], img_info['height']) >= min_size:
valid_inds.append(i)
return valid_inds
def _set_group_flag(self):
"""Set flag according to image aspect ratio.
Images with aspect ratio greater than 1 will be set as group 1,
otherwise group 0.
"""
self.flag = np.zeros(len(self), dtype=np.uint8)
for i in range(len(self)):
img_info = self.img_infos[i]
if img_info['width'] / img_info['height'] > 1:
self.flag[i] = 1
def _rand_another(self, idx):
pool = np.where(self.flag == self.flag[idx])[0]
return np.random.choice(pool)
def __getitem__(self, idx):
if self.test_mode:
return self.prepare_test_img(idx)
while True:
data = self.prepare_train_img(idx)
if data is None:
idx = self._rand_another(idx)
continue
return data
def prepare_train_img(self, idx):
img_info = self.img_infos[idx]
# load image
img = mmcv.imread(osp.join(self.img_prefix, img_info['filename']))
# load proposals if necessary
if self.proposals is not None:
proposals = self.proposals[idx][:self.num_max_proposals]
# TODO: Handle empty proposals properly. Currently images with
# no proposals are just ignored, but they can be used for
# training in concept.
if len(proposals) == 0:
return None
if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
raise AssertionError(
'proposals should have shapes (n, 4) or (n, 5), '
'but found {}'.format(proposals.shape))
if proposals.shape[1] == 5:
scores = proposals[:, 4, None]
proposals = proposals[:, :4]
else:
scores = None
ann = self.get_ann_info(idx)
gt_bboxes = ann['bboxes']
gt_labels = ann['labels']
if self.with_crowd:
gt_bboxes_ignore = ann['bboxes_ignore']
# skip the image if there is no valid gt bbox
if len(gt_bboxes) == 0:
return None
# extra augmentation
if self.extra_aug is not None:
img, gt_bboxes, gt_labels = self.extra_aug(img, gt_bboxes,
gt_labels)
# apply transforms
flip = True if np.random.rand() < self.flip_ratio else False
img_scale = random_scale(self.img_scales) # sample a scale
img, img_shape, pad_shape, scale_factor = self.img_transform(
img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
img = img.copy()
if self.proposals is not None:
proposals = self.bbox_transform(proposals, img_shape, scale_factor,
flip)
proposals = np.hstack(
[proposals, scores]) if scores is not None else proposals
gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
flip)
if self.with_crowd:
gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
scale_factor, flip)
if self.with_mask:
gt_masks = self.mask_transform(ann['masks'], pad_shape,
scale_factor, flip)
ori_shape = (img_info['height'], img_info['width'], 3)
img_meta = dict(
ori_shape=ori_shape,
img_shape=img_shape,
pad_shape=pad_shape,
scale_factor=scale_factor,
flip=flip)
data = dict(
img=DC(to_tensor(img), stack=True),
img_meta=DC(img_meta, cpu_only=True),
gt_bboxes=DC(to_tensor(gt_bboxes)))
if self.proposals is not None:
data['proposals'] = DC(to_tensor(proposals))
if self.with_label:
data['gt_labels'] = DC(to_tensor(gt_labels))
if self.with_crowd:
data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
if self.with_mask:
data['gt_masks'] = DC(gt_masks, cpu_only=True)
return data
def prepare_test_img(self, idx):
"""Prepare an image for testing (multi-scale and flipping)"""
img_info = self.img_infos[idx]
img = mmcv.imread(osp.join(self.img_prefix, img_info['filename']))
if self.proposals is not None:
proposal = self.proposals[idx][:self.num_max_proposals]
if not (proposal.shape[1] == 4 or proposal.shape[1] == 5):
raise AssertionError(
'proposals should have shapes (n, 4) or (n, 5), '
'but found {}'.format(proposal.shape))
else:
proposal = None
def prepare_single(img, scale, flip, proposal=None):
_img, img_shape, pad_shape, scale_factor = self.img_transform(
img, scale, flip, keep_ratio=self.resize_keep_ratio)
_img = to_tensor(_img)
_img_meta = dict(
ori_shape=(img_info['height'], img_info['width'], 3),
img_shape=img_shape,
pad_shape=pad_shape,
scale_factor=scale_factor,
flip=flip)
if proposal is not None:
if proposal.shape[1] == 5:
score = proposal[:, 4, None]
proposal = proposal[:, :4]
else:
score = None
_proposal = self.bbox_transform(proposal, img_shape,
scale_factor, flip)
_proposal = np.hstack(
[_proposal, score]) if score is not None else _proposal
_proposal = to_tensor(_proposal)
else:
_proposal = None
return _img, _img_meta, _proposal
imgs = []
img_metas = []
proposals = []
for scale in self.img_scales:
_img, _img_meta, _proposal = prepare_single(
img, scale, False, proposal)
imgs.append(_img)
img_metas.append(DC(_img_meta, cpu_only=True))
proposals.append(_proposal)
if self.flip_ratio > 0:
_img, _img_meta, _proposal = prepare_single(
img, scale, True, proposal)
imgs.append(_img)
img_metas.append(DC(_img_meta, cpu_only=True))
proposals.append(_proposal)
data = dict(img=imgs, img_meta=img_metas)
if self.proposals is not None:
data['proposals'] = proposals
return data
================================================
FILE: mmdet/datasets/extra_aug.py
================================================
import mmcv
import numpy as np
from numpy import random
from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
class PhotoMetricDistortion(object):
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18):
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
def __call__(self, img, boxes, labels):
# random brightness
if random.randint(2):
delta = random.uniform(-self.brightness_delta,
self.brightness_delta)
img += delta
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = random.randint(2)
if mode == 1:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# convert color from BGR to HSV
img = mmcv.bgr2hsv(img)
# random saturation
if random.randint(2):
img[..., 1] *= random.uniform(self.saturation_lower,
self.saturation_upper)
# random hue
if random.randint(2):
img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
img[..., 0][img[..., 0] > 360] -= 360
img[..., 0][img[..., 0] < 0] += 360
# convert color from HSV to BGR
img = mmcv.hsv2bgr(img)
# random contrast
if mode == 0:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# randomly swap channels
if random.randint(2):
img = img[..., random.permutation(3)]
return img, boxes, labels
class Expand(object):
def __init__(self, mean=(0, 0, 0), to_rgb=True, ratio_range=(1, 4)):
if to_rgb:
self.mean = mean[::-1]
else:
self.mean = mean
self.min_ratio, self.max_ratio = ratio_range
def __call__(self, img, boxes, labels):
if random.randint(2):
return img, boxes, labels
h, w, c = img.shape
ratio = random.uniform(self.min_ratio, self.max_ratio)
expand_img = np.full((int(h * ratio), int(w * ratio), c),
self.mean).astype(img.dtype)
left = int(random.uniform(0, w * ratio - w))
top = int(random.uniform(0, h * ratio - h))
expand_img[top:top + h, left:left + w] = img
img = expand_img
boxes += np.tile((left, top), 2)
return img, boxes, labels
class RandomCrop(object):
def __init__(self,
min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
min_crop_size=0.3):
# 1: return ori img
self.sample_mode = (1, *min_ious, 0)
self.min_crop_size = min_crop_size
def __call__(self, img, boxes, labels):
h, w, c = img.shape
while True:
mode = random.choice(self.sample_mode)
if mode == 1:
return img, boxes, labels
min_iou = mode
for i in range(50):
new_w = random.uniform(self.min_crop_size * w, w)
new_h = random.uniform(self.min_crop_size * h, h)
# h / w in [0.5, 2]
if new_h / new_w < 0.5 or new_h / new_w > 2:
continue
left = random.uniform(w - new_w)
top = random.uniform(h - new_h)
patch = np.array((int(left), int(top), int(left + new_w),
int(top + new_h)))
overlaps = bbox_overlaps(
patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1)
if overlaps.min() < min_iou:
continue
# center of boxes should inside the crop img
center = (boxes[:, :2] + boxes[:, 2:]) / 2
mask = (center[:, 0] > patch[0]) * (
center[:, 1] > patch[1]) * (center[:, 0] < patch[2]) * (
center[:, 1] < patch[3])
if not mask.any():
continue
boxes = boxes[mask]
labels = labels[mask]
# adjust boxes
img = img[patch[1]:patch[3], patch[0]:patch[2]]
boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
boxes -= np.tile(patch[:2], 2)
return img, boxes, labels
class ExtraAugmentation(object):
def __init__(self,
photo_metric_distortion=None,
expand=None,
random_crop=None):
self.transforms = []
if photo_metric_distortion is not None:
self.transforms.append(
PhotoMetricDistortion(**photo_metric_distortion))
if expand is not None:
self.transforms.append(Expand(**expand))
if random_crop is not None:
self.transforms.append(RandomCrop(**random_crop))
def __call__(self, img, boxes, labels):
img = img.astype(np.float32)
for transform in self.transforms:
img, boxes, labels = transform(img, boxes, labels)
return img, boxes, labels
================================================
FILE: mmdet/datasets/loader/__init__.py
================================================
from .build_loader import build_dataloader
from .sampler import GroupSampler, DistributedGroupSampler
__all__ = [
'GroupSampler', 'DistributedGroupSampler', 'build_dataloader'
]
================================================
FILE: mmdet/datasets/loader/build_loader.py
================================================
from functools import partial
from mmcv.runner import get_dist_info
from mmcv.parallel import collate
from torch.utils.data import DataLoader
from .sampler import GroupSampler, DistributedGroupSampler
# https://github.com/pytorch/pytorch/issues/973
import resource
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))
def build_dataloader(dataset,
imgs_per_gpu,
workers_per_gpu,
num_gpus=1,
dist=True,
**kwargs):
if dist:
rank, world_size = get_dist_info()
sampler = DistributedGroupSampler(dataset, imgs_per_gpu, world_size,
rank)
batch_size = imgs_per_gpu
num_workers = workers_per_gpu
else:
if not kwargs.get('shuffle', True):
sampler = None
else:
sampler = GroupSampler(dataset, imgs_per_gpu)
batch_size = num_gpus * imgs_per_gpu
num_workers = num_gpus * workers_per_gpu
data_loader = DataLoader(
dataset,
batch_size=batch_size,
sampler=sampler,
num_workers=num_workers,
collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu),
pin_memory=False,
**kwargs)
return data_loader
================================================
FILE: mmdet/datasets/loader/sampler.py
================================================
from __future__ import division
import math
import torch
import numpy as np
from torch.distributed import get_world_size, get_rank
from torch.utils.data.sampler import Sampler
class GroupSampler(Sampler):
def __init__(self, dataset, samples_per_gpu=1):
assert hasattr(dataset, 'flag')
self.dataset = dataset
self.samples_per_gpu = samples_per_gpu
self.flag = dataset.flag.astype(np.int64)
self.group_sizes = np.bincount(self.flag)
self.num_samples = 0
for i, size in enumerate(self.group_sizes):
self.num_samples += int(np.ceil(
size / self.samples_per_gpu)) * self.samples_per_gpu
def __iter__(self):
indices = []
for i, size in enumerate(self.group_sizes):
if size == 0:
continue
indice = np.where(self.flag == i)[0]
assert len(indice) == size
np.random.shuffle(indice)
num_extra = int(np.ceil(size / self.samples_per_gpu)
) * self.samples_per_gpu - len(indice)
indice = np.concatenate([indice, indice[:num_extra]])
indices.append(indice)
indices = np.concatenate(indices)
indices = [
indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu]
for i in np.random.permutation(
range(len(indices) // self.samples_per_gpu))
]
indices = np.concatenate(indices)
indices = torch.from_numpy(indices).long()
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
class DistributedGroupSampler(Sampler):
"""Sampler that restricts data loading to a subset of the dataset.
It is especially useful in conjunction with
:class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
process can pass a DistributedSampler instance as a DataLoader sampler,
and load a subset of the original dataset that is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Arguments:
dataset: Dataset used for sampling.
num_replicas (optional): Number of processes participating in
distributed training.
rank (optional): Rank of the current process within num_replicas.
"""
def __init__(self,
dataset,
samples_per_gpu=1,
num_replicas=None,
rank=None):
if num_replicas is None:
num_replicas = get_world_size()
if rank is None:
rank = get_rank()
self.dataset = dataset
self.samples_per_gpu = samples_per_gpu
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
assert hasattr(self.dataset, 'flag')
self.flag = self.dataset.flag
self.group_sizes = np.bincount(self.flag)
self.num_samples = 0
for i, j in enumerate(self.group_sizes):
self.num_samples += int(
math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
self.num_replicas)) * self.samples_per_gpu
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = []
for i, size in enumerate(self.group_sizes):
if size > 0:
indice = np.where(self.flag == i)[0]
assert len(indice) == size
indice = indice[list(torch.randperm(int(size),
generator=g))].tolist()
extra = int(
math.ceil(
size * 1.0 / self.samples_per_gpu / self.num_replicas)
) * self.samples_per_gpu * self.num_replicas - len(indice)
indice += indice[:extra]
indices += indice
assert len(indices) == self.total_size
indices = [
indices[j] for i in list(
torch.randperm(
len(indices) // self.samples_per_gpu, generator=g))
for j in range(i * self.samples_per_gpu, (i + 1) *
self.samples_per_gpu)
]
# subsample
offset = self.num_samples * self.rank
indices = indices[offset:offset + self.num_samples]
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
================================================
FILE: mmdet/datasets/repeat_dataset.py
================================================
import numpy as np
class RepeatDataset(object):
def __init__(self, dataset, times):
self.dataset = dataset
self.times = times
self.CLASSES = dataset.CLASSES
if hasattr(self.dataset, 'flag'):
self.flag = np.tile(self.dataset.flag, times)
self._ori_len = len(self.dataset)
def __getitem__(self, idx):
return self.dataset[idx % self._ori_len]
def __len__(self):
return self.times * self._ori_len
================================================
FILE: mmdet/datasets/transforms.py
================================================
import mmcv
import numpy as np
import torch
__all__ = ['ImageTransform', 'BboxTransform', 'MaskTransform', 'Numpy2Tensor']
class ImageTransform(object):
"""Preprocess an image.
1. rescale the image to expected size
2. normalize the image
3. flip the image (if needed)
4. pad the image (if needed)
5. transpose to (c, h, w)
"""
def __init__(self,
mean=(0, 0, 0),
std=(1, 1, 1),
to_rgb=True,
size_divisor=None):
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
self.to_rgb = to_rgb
self.size_divisor = size_divisor
def __call__(self, img, scale, flip=False, keep_ratio=True):
if keep_ratio:
img, scale_factor = mmcv.imrescale(img, scale, return_scale=True)
else:
img, w_scale, h_scale = mmcv.imresize(
img, scale, return_scale=True)
scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
dtype=np.float32)
img_shape = img.shape
img = mmcv.imnormalize(img, self.mean, self.std, self.to_rgb)
if flip:
img = mmcv.imflip(img)
if self.size_divisor is not None:
img = mmcv.impad_to_multiple(img, self.size_divisor)
pad_shape = img.shape
else:
pad_shape = img_shape
img = img.transpose(2, 0, 1)
return img, img_shape, pad_shape, scale_factor
def bbox_flip(bboxes, img_shape):
"""Flip bboxes horizontally.
Args:
bboxes(ndarray): shape (..., 4*k)
img_shape(tuple): (height, width)
"""
assert bboxes.shape[-1] % 4 == 0
w = img_shape[1]
flipped = bboxes.copy()
flipped[..., 0::4] = w - bboxes[..., 2::4] - 1
flipped[..., 2::4] = w - bboxes[..., 0::4] - 1
return flipped
class BboxTransform(object):
"""Preprocess gt bboxes.
1. rescale bboxes according to image size
2. flip bboxes (if needed)
3. pad the first dimension to `max_num_gts`
"""
def __init__(self, max_num_gts=None):
self.max_num_gts = max_num_gts
def __call__(self, bboxes, img_shape, scale_factor, flip=False):
gt_bboxes = bboxes * scale_factor
if flip:
gt_bboxes = bbox_flip(gt_bboxes, img_shape)
gt_bboxes[:, 0::2] = np.clip(gt_bboxes[:, 0::2], 0, img_shape[1])
gt_bboxes[:, 1::2] = np.clip(gt_bboxes[:, 1::2], 0, img_shape[0])
if self.max_num_gts is None:
return gt_bboxes
else:
num_gts = gt_bboxes.shape[0]
padded_bboxes = np.zeros((self.max_num_gts, 4), dtype=np.float32)
padded_bboxes[:num_gts, :] = gt_bboxes
return padded_bboxes
class MaskTransform(object):
"""Preprocess masks.
1. resize masks to expected size and stack to a single array
2. flip the masks (if needed)
3. pad the masks (if needed)
"""
def __call__(self, masks, pad_shape, scale_factor, flip=False):
masks = [
mmcv.imrescale(mask, scale_factor, interpolation='nearest')
for mask in masks
]
if flip:
masks = [mask[:, ::-1] for mask in masks]
padded_masks = [
mmcv.impad(mask, pad_shape[:2], pad_val=0) for mask in masks
]
padded_masks = np.stack(padded_masks, axis=0)
return padded_masks
class Numpy2Tensor(object):
def __init__(self):
pass
def __call__(self, *args):
if len(args) == 1:
return torch.from_numpy(args[0])
else:
return tuple([torch.from_numpy(np.array(array)) for array in args])
================================================
FILE: mmdet/datasets/utils.py
================================================
import copy
from collections import Sequence
import mmcv
from mmcv.runner import obj_from_dict
import torch
import matplotlib.pyplot as plt
import numpy as np
from .concat_dataset import ConcatDataset
from .repeat_dataset import RepeatDataset
from .. import datasets
def to_tensor(data):
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not mmcv.is_str(data):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError('type {} cannot be converted to tensor.'.format(
type(data)))
def random_scale(img_scales, mode='range'):
"""Randomly select a scale from a list of scales or scale ranges.
Args:
img_scales (list[tuple]): Image scale or scale range.
mode (str): "range" or "value".
Returns:
tuple: Sampled image scale.
"""
num_scales = len(img_scales)
if num_scales == 1: # fixed scale is specified
img_scale = img_scales[0]
elif num_scales == 2: # randomly sample a scale
if mode == 'range':
img_scale_long = [max(s) for s in img_scales]
img_scale_short = [min(s) for s in img_scales]
long_edge = np.random.randint(
min(img_scale_long),
max(img_scale_long) + 1)
short_edge = np.random.randint(
min(img_scale_short),
max(img_scale_short) + 1)
img_scale = (long_edge, short_edge)
elif mode == 'value':
img_scale = img_scales[np.random.randint(num_scales)]
else:
if mode != 'value':
raise ValueError(
'Only "value" mode supports more than 2 image scales')
img_scale = img_scales[np.random.randint(num_scales)]
return img_scale
def show_ann(coco, img, ann_info):
plt.imshow(mmcv.bgr2rgb(img))
plt.axis('off')
coco.showAnns(ann_info)
plt.show()
def get_dataset(data_cfg):
if data_cfg['type'] == 'RepeatDataset':
return RepeatDataset(
get_dataset(data_cfg['dataset']), data_cfg['times'])
if isinstance(data_cfg['ann_file'], (list, tuple)):
ann_files = data_cfg['ann_file']
num_dset = len(ann_files)
else:
ann_files = [data_cfg['ann_file']]
num_dset = 1
if 'proposal_file' in data_cfg.keys():
if isinstance(data_cfg['proposal_file'], (list, tuple)):
proposal_files = data_cfg['proposal_file']
else:
proposal_files = [data_cfg['proposal_file']]
else:
proposal_files = [None] * num_dset
assert len(proposal_files) == num_dset
if isinstance(data_cfg['img_prefix'], (list, tuple)):
img_prefixes = data_cfg['img_prefix']
else:
img_prefixes = [data_cfg['img_prefix']] * num_dset
assert len(img_prefixes) == num_dset
dsets = []
for i in range(num_dset):
data_info = copy.deepcopy(data_cfg)
data_info['ann_file'] = ann_files[i]
data_info['proposal_file'] = proposal_files[i]
data_info['img_prefix'] = img_prefixes[i]
dset = obj_from_dict(data_info, datasets)
dsets.append(dset)
if len(dsets) > 1:
dset = ConcatDataset(dsets)
else:
dset = dsets[0]
return dset
================================================
FILE: mmdet/datasets/voc.py
================================================
from .xml_style import XMLDataset
class VOCDataset(XMLDataset):
CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
'tvmonitor')
def __init__(self, **kwargs):
super(VOCDataset, self).__init__(**kwargs)
if 'VOC2007' in self.img_prefix:
self.year = 2007
elif 'VOC2012' in self.img_prefix:
self.year = 2012
else:
raise ValueError('Cannot infer dataset year from img_prefix')
================================================
FILE: mmdet/datasets/xml_style.py
================================================
import os.path as osp
import xml.etree.ElementTree as ET
import mmcv
import numpy as np
from .custom import CustomDataset
class XMLDataset(CustomDataset):
def __init__(self, **kwargs):
super(XMLDataset, self).__init__(**kwargs)
self.cat2label = {cat: i + 1 for i, cat in enumerate(self.CLASSES)}
def load_annotations(self, ann_file):
img_infos = []
img_ids = mmcv.list_from_file(ann_file)
for img_id in img_ids:
filename = 'JPEGImages/{}.jpg'.format(img_id)
xml_path = osp.join(self.img_prefix, 'Annotations',
'{}.xml'.format(img_id))
tree = ET.parse(xml_path)
root = tree.getroot()
size = root.find('size')
width = int(size.find('width').text)
height = int(size.find('height').text)
img_infos.append(
dict(id=img_id, filename=filename, width=width, height=height))
return img_infos
def get_ann_info(self, idx):
img_id = self.img_infos[idx]['id']
xml_path = osp.join(self.img_prefix, 'Annotations',
'{}.xml'.format(img_id))
tree = ET.parse(xml_path)
root = tree.getroot()
bboxes = []
labels = []
bboxes_ignore = []
labels_ignore = []
for obj in root.findall('object'):
name = obj.find('name').text
label = self.cat2label[name]
difficult = int(obj.find('difficult').text)
bnd_box = obj.find('bndbox')
bbox = [
int(bnd_box.find('xmin').text),
int(bnd_box.find('ymin').text),
int(bnd_box.find('xmax').text),
int(bnd_box.find('ymax').text)
]
if difficult:
bboxes_ignore.append(bbox)
labels_ignore.append(label)
else:
bboxes.append(bbox)
labels.append(label)
if not bboxes:
bboxes = np.zeros((0, 4))
labels = np.zeros((0, ))
else:
bboxes = np.array(bboxes, ndmin=2) - 1
labels = np.array(labels)
if not bboxes_ignore:
bboxes_ignore = np.zeros((0, 4))
labels_ignore = np.zeros((0, ))
else:
bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1
labels_ignore = np.array(labels_ignore)
ann = dict(
bboxes=bboxes.astype(np.float32),
labels=labels.astype(np.int64),
bboxes_ignore=bboxes_ignore.astype(np.float32),
labels_ignore=labels_ignore.astype(np.int64))
return ann
================================================
FILE: mmdet/models/__init__.py
================================================
from .base_sampler import BaseSampler
from .pseudo_sampler import PseudoSampler
from .random_sampler import RandomSampler
from .instance_balanced_pos_sampler import InstanceBalancedPosSampler
from .iou_balanced_neg_sampler import IoUBalancedNegSampler
from .combined_sampler import CombinedSampler
from .ohem_sampler import OHEMSampler
from .sampling_result import SamplingResult
from .random_sampler_fixnum import RandomSamplerFixnum
__all__ = [
'BaseSampler', 'PseudoSampler', 'RandomSampler',
'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler',
'OHEMSampler', 'SamplingResult', 'RandomSamplerFixnum'
]
================================================
FILE: mmdet/models/anchor_heads/__init__.py
================================================
from .anchor_head import AnchorHead
from .rpn_head import RPNHead
from .retina_head import RetinaHead
from .ssd_head import SSDHead
__all__ = ['AnchorHead', 'RPNHead', 'RetinaHead', 'SSDHead']
================================================
FILE: mmdet/models/anchor_heads/anchor_head.py
================================================
from __future__ import division
import numpy as np
import torch
import torch.nn as nn
from mmcv.cnn import normal_init
from mmdet.core import (AnchorGenerator, anchor_target, delta2bbox,
multi_apply, weighted_cross_entropy, weighted_smoothl1,
weighted_binary_cross_entropy,
weighted_sigmoid_focal_loss, multiclass_nms)
from ..registry import HEADS
@HEADS.register_module
class AnchorHead(nn.Module):
"""Anchor-based head (RPN, RetinaNet, SSD, etc.).
Args:
in_channels (int): Number of channels in the input feature map.
feat_channels (int): Number of channels of the feature map.
anchor_scales (Iterable): Anchor scales.
anchor_ratios (Iterable): Anchor aspect ratios.
anchor_strides (Iterable): Anchor strides.
anchor_base_sizes (Iterable): Anchor base sizes.
target_means (Iterable): Mean values of regression targets.
target_stds (Iterable): Std values of regression targets.
use_sigmoid_cls (bool): Whether to use sigmoid loss for classification.
(softmax by default)
use_focal_loss (bool): Whether to use focal loss for classification.
""" # noqa: W605
def __init__(self,
num_classes,
in_channels,
feat_channels=256,
anchor_scales=[8, 16, 32],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
anchor_base_sizes=None,
target_means=(.0, .0, .0, .0),
target_stds=(1.0, 1.0, 1.0, 1.0),
use_sigmoid_cls=False,
use_focal_loss=False):
super(AnchorHead, self).__init__()
self.in_channels = in_channels
self.num_classes = num_classes
self.feat_channels = feat_channels
self.anchor_scales = anchor_scales
self.anchor_ratios = anchor_ratios
self.anchor_strides = anchor_strides
self.anchor_base_sizes = list(
anchor_strides) if anchor_base_sizes is None else anchor_base_sizes
self.target_means = target_means
self.target_stds = target_stds
self.use_sigmoid_cls = use_sigmoid_cls
self.use_focal_loss = use_focal_loss
self.anchor_generators = []
for anchor_base in self.anchor_base_sizes:
self.anchor_generators.append(
AnchorGenerator(anchor_base, anchor_scales, anchor_ratios))
self.num_anchors = len(self.anchor_ratios) * len(self.anchor_scales)
if self.use_sigmoid_cls:
self.cls_out_channels = self.num_classes - 1
else:
self.cls_out_channels = self.num_classes
self._init_layers()
def _init_layers(self):
self.conv_cls = nn.Conv2d(self.feat_channels,
self.num_anchors * self.cls_out_channels, 1)
self.conv_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1)
def init_weights(self):
normal_init(self.conv_cls, std=0.01)
normal_init(self.conv_reg, std=0.01)
def forward_single(self, x):
cls_score = self.conv_cls(x)
bbox_pred = self.conv_reg(x)
return cls_score, bbox_pred
def forward(self, feats):
return multi_apply(self.forward_single, feats)
def get_anchors(self, featmap_sizes, img_metas):
"""Get anchors according to feature map sizes.
Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes.
img_metas (list[dict]): Image meta info.
Returns:
tuple: anchors of each image, valid flags of each image
"""
num_imgs = len(img_metas)
num_levels = len(featmap_sizes)
# since feature map sizes of all images are the same, we only compute
# anchors for one time
multi_level_anchors = []
for i in range(num_levels):
anchors = self.anchor_generators[i].grid_anchors(
featmap_sizes[i], self.anchor_strides[i])
multi_level_anchors.append(anchors)
anchor_list = [multi_level_anchors for _ in range(num_imgs)]
# for each image, we compute valid flags of multi level anchors
valid_flag_list = []
for img_id, img_meta in enumerate(img_metas):
multi_level_flags = []
for i in range(num_levels):
anchor_stride = self.anchor_strides[i]
feat_h, feat_w = featmap_sizes[i]
h, w, _ = img_meta['pad_shape']
valid_feat_h = min(int(np.ceil(h / anchor_stride)), feat_h)
valid_feat_w = min(int(np.ceil(w / anchor_stride)), feat_w)
flags = self.anchor_generators[i].valid_flags(
(feat_h, feat_w), (valid_feat_h, valid_feat_w))
multi_level_flags.append(flags)
valid_flag_list.append(multi_level_flags)
return anchor_list, valid_flag_list
def loss_single(self, cls_score, bbox_pred, labels, label_weights,
bbox_targets, bbox_weights, num_total_samples, cfg):
# classification loss
if self.use_sigmoid_cls:
labels = labels.reshape(-1, self.cls_out_channels)
label_weights = label_weights.reshape(-1, self.cls_out_channels)
else:
labels = labels.reshape(-1)
label_weights = label_weights.reshape(-1)
cls_score = cls_score.permute(0, 2, 3, 1).reshape(
-1, self.cls_out_channels)
if self.use_sigmoid_cls:
if self.use_focal_loss:
cls_criterion = weighted_sigmoid_focal_loss
else:
cls_criterion = weighted_binary_cross_entropy
else:
if self.use_focal_loss:
raise NotImplementedError
else:
cls_criterion = weighted_cross_entropy
if self.use_focal_loss:
loss_cls = cls_criterion(
cls_score,
labels,
label_weights,
gamma=cfg.gamma,
alpha=cfg.alpha,
avg_factor=num_total_samples)
else:
loss_cls = cls_criterion(
cls_score, labels, label_weights, avg_factor=num_total_samples)
# regression loss
bbox_targets = bbox_targets.reshape(-1, 4)
bbox_weights = bbox_weights.reshape(-1, 4)
bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
loss_reg = weighted_smoothl1(
bbox_pred,
bbox_targets,
bbox_weights,
beta=cfg.smoothl1_beta,
avg_factor=num_total_samples)
return loss_cls, loss_reg
def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas,
cfg):
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == len(self.anchor_generators)
anchor_list, valid_flag_list = self.get_anchors(
featmap_sizes, img_metas)
sampling = False if self.use_focal_loss else True
label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
cls_reg_targets = anchor_target(
anchor_list,
valid_flag_list,
gt_bboxes,
img_metas,
self.target_means,
self.target_stds,
cfg,
gt_labels_list=gt_labels,
label_channels=label_channels,
sampling=sampling)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg) = cls_reg_targets
num_total_samples = (num_total_pos if self.use_focal_loss else
num_total_pos + num_total_neg)
losses_cls, losses_reg = multi_apply(
self.loss_single,
cls_scores,
bbox_preds,
labels_list,
label_weights_list,
bbox_targets_list,
bbox_weights_list,
num_total_samples=num_total_samples,
cfg=cfg)
return dict(loss_cls=losses_cls, loss_reg=losses_reg)
def get_bboxes(self, cls_scores, bbox_preds, img_metas, cfg,
rescale=False):
assert len(cls_scores) == len(bbox_preds)
num_levels = len(cls_scores)
mlvl_anchors = [
self.anchor_generators[i].grid_anchors(cls_scores[i].size()[-2:],
self.anchor_strides[i])
for i in range(num_levels)
]
result_list = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list,
mlvl_anchors, img_shape,
scale_factor, cfg, rescale)
result_list.append(proposals)
return result_list
def get_bboxes_single(self,
cls_scores,
bbox_preds,
mlvl_anchors,
img_shape,
scale_factor,
cfg,
rescale=False):
assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
mlvl_bboxes = []
mlvl_scores = []
for cls_score, bbox_pred, anchors in zip(cls_scores, bbox_preds,
mlvl_anchors):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
cls_score = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels)
if self.use_sigmoid_cls:
scores = cls_score.sigmoid()
else:
scores = cls_score.softmax(-1)
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
if self.use_sigmoid_cls:
max_scores, _ = scores.max(dim=1)
else:
max_scores, _ = scores[:, 1:].max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
anchors = anchors[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
bboxes = delta2bbox(anchors, bbox_pred, self.target_means,
self.target_stds, img_shape)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_bboxes = torch.cat(mlvl_bboxes)
if rescale:
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
mlvl_scores = torch.cat(mlvl_scores)
if self.use_sigmoid_cls:
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([padding, mlvl_scores], dim=1)
det_bboxes, det_labels = multiclass_nms(
mlvl_bboxes, mlvl_scores, cfg.score_thr, cfg.nms, cfg.max_per_img)
return det_bboxes, det_labels
================================================
FILE: mmdet/models/anchor_heads/retina_head.py
================================================
import numpy as np
import torch.nn as nn
from mmcv.cnn import normal_init
from .anchor_head import AnchorHead
from ..registry import HEADS
from ..utils import bias_init_with_prob
@HEADS.register_module
class RetinaHead(AnchorHead):
def __init__(self,
num_classes,
in_channels,
stacked_convs=4,
octave_base_scale=4,
scales_per_octave=3,
**kwargs):
self.stacked_convs = stacked_convs
self.octave_base_scale = octave_base_scale
self.scales_per_octave = scales_per_octave
octave_scales = np.array(
[2**(i / scales_per_octave) for i in range(scales_per_octave)])
anchor_scales = octave_scales * octave_base_scale
super(RetinaHead, self).__init__(
num_classes,
in_channels,
anchor_scales=anchor_scales,
use_sigmoid_cls=True,
use_focal_loss=True,
**kwargs)
def _init_layers(self):
self.relu = nn.ReLU(inplace=True)
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
self.cls_convs.append(
nn.Conv2d(chn, self.feat_channels, 3, stride=1, padding=1))
self.reg_convs.append(
nn.Conv2d(chn, self.feat_channels, 3, stride=1, padding=1))
self.retina_cls = nn.Conv2d(
self.feat_channels,
self.num_anchors * self.cls_out_channels,
3,
padding=1)
self.retina_reg = nn.Conv2d(
self.feat_channels, self.num_anchors * 4, 3, padding=1)
def init_weights(self):
for m in self.cls_convs:
normal_init(m, std=0.01)
for m in self.reg_convs:
normal_init(m, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.retina_cls, std=0.01, bias=bias_cls)
normal_init(self.retina_reg, std=0.01)
def forward_single(self, x):
cls_feat = x
reg_feat = x
for cls_conv in self.cls_convs:
cls_feat = self.relu(cls_conv(cls_feat))
for reg_conv in self.reg_convs:
reg_feat = self.relu(reg_conv(reg_feat))
cls_score = self.retina_cls(cls_feat)
bbox_pred = self.retina_reg(reg_feat)
return cls_score, bbox_pred
================================================
FILE: mmdet/models/anchor_heads/rpn_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import normal_init
from mmdet.core import delta2bbox
from mmdet.ops import nms
from .anchor_head import AnchorHead
from ..registry import HEADS
@HEADS.register_module
class RPNHead(AnchorHead):
def __init__(self, in_channels, **kwargs):
super(RPNHead, self).__init__(2, in_channels, **kwargs)
def _init_layers(self):
self.rpn_conv = nn.Conv2d(
self.in_channels, self.feat_channels, 3, padding=1)
self.rpn_cls = nn.Conv2d(self.feat_channels,
self.num_anchors * self.cls_out_channels, 1)
self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1)
def init_weights(self):
normal_init(self.rpn_conv, std=0.01)
normal_init(self.rpn_cls, std=0.01)
normal_init(self.rpn_reg, std=0.01)
def forward_single(self, x):
x = self.rpn_conv(x)
x = F.relu(x, inplace=True)
rpn_cls_score = self.rpn_cls(x)
rpn_bbox_pred = self.rpn_reg(x)
return rpn_cls_score, rpn_bbox_pred
def loss(self, cls_scores, bbox_preds, gt_bboxes, img_metas, cfg):
losses = super(RPNHead, self).loss(cls_scores, bbox_preds, gt_bboxes,
None, img_metas, cfg)
return dict(
loss_rpn_cls=losses['loss_cls'], loss_rpn_reg=losses['loss_reg'])
def get_bboxes_single(self,
cls_scores,
bbox_preds,
mlvl_anchors,
img_shape,
scale_factor,
cfg,
rescale=False):
mlvl_proposals = []
for idx in range(len(cls_scores)):
rpn_cls_score = cls_scores[idx]
rpn_bbox_pred = bbox_preds[idx]
assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
anchors = mlvl_anchors[idx]
rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
if self.use_sigmoid_cls:
rpn_cls_score = rpn_cls_score.reshape(-1)
scores = rpn_cls_score.sigmoid()
else:
rpn_cls_score = rpn_cls_score.reshape(-1, 2)
scores = rpn_cls_score.softmax(dim=1)[:, 1]
rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
_, topk_inds = scores.topk(cfg.nms_pre)
rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
anchors = anchors[topk_inds, :]
scores = scores[topk_inds]
proposals = delta2bbox(anchors, rpn_bbox_pred, self.target_means,
self.target_stds, img_shape)
if cfg.min_bbox_size > 0:
w = proposals[:, 2] - proposals[:, 0] + 1
h = proposals[:, 3] - proposals[:, 1] + 1
valid_inds = torch.nonzero((w >= cfg.min_bbox_size) &
(h >= cfg.min_bbox_size)).squeeze()
proposals = proposals[valid_inds, :]
scores = scores[valid_inds]
proposals = torch.cat([proposals, scores.unsqueeze(-1)], dim=-1)
proposals, _ = nms(proposals, cfg.nms_thr)
proposals = proposals[:cfg.nms_post, :]
mlvl_proposals.append(proposals)
proposals = torch.cat(mlvl_proposals, 0)
if cfg.nms_across_levels:
proposals, _ = nms(proposals, cfg.nms_thr)
proposals = proposals[:cfg.max_num, :]
else:
scores = proposals[:, 4]
num = min(cfg.max_num, proposals.shape[0])
_, topk_inds = scores.topk(num)
proposals = proposals[topk_inds, :]
return proposals
================================================
FILE: mmdet/models/anchor_heads/ssd_head.py
================================================
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init
from mmdet.core import (AnchorGenerator, anchor_target, weighted_smoothl1,
multi_apply)
from .anchor_head import AnchorHead
from ..registry import HEADS
@HEADS.register_module
class SSDHead(AnchorHead):
def __init__(self,
input_size=300,
num_classes=81,
in_channels=(512, 1024, 512, 256, 256, 256),
anchor_strides=(8, 16, 32, 64, 100, 300),
basesize_ratio_range=(0.1, 0.9),
anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
target_means=(.0, .0, .0, .0),
target_stds=(1.0, 1.0, 1.0, 1.0)):
super(AnchorHead, self).__init__()
self.input_size = input_size
self.num_classes = num_classes
self.in_channels = in_channels
self.cls_out_channels = num_classes
num_anchors = [len(ratios) * 2 + 2 for ratios in anchor_ratios]
reg_convs = []
cls_convs = []
for i in range(len(in_channels)):
reg_convs.append(
nn.Conv2d(
in_channels[i],
num_anchors[i] * 4,
kernel_size=3,
padding=1))
cls_convs.append(
nn.Conv2d(
in_channels[i],
num_anchors[i] * num_classes,
kernel_size=3,
padding=1))
self.reg_convs = nn.ModuleList(reg_convs)
self.cls_convs = nn.ModuleList(cls_convs)
min_ratio, max_ratio = basesize_ratio_range
min_ratio = int(min_ratio * 100)
max_ratio = int(max_ratio * 100)
step = int(np.floor(max_ratio - min_ratio) / (len(in_channels) - 2))
min_sizes = []
max_sizes = []
for r in range(int(min_ratio), int(max_ratio) + 1, step):
min_sizes.append(int(input_size * r / 100))
max_sizes.append(int(input_size * (r + step) / 100))
if input_size == 300:
if basesize_ratio_range[0] == 0.15: # SSD300 COCO
min_sizes.insert(0, int(input_size * 7 / 100))
max_sizes.insert(0, int(input_size * 15 / 100))
elif basesize_ratio_range[0] == 0.2: # SSD300 VOC
min_sizes.insert(0, int(input_size * 10 / 100))
max_sizes.insert(0, int(input_size * 20 / 100))
elif input_size == 512:
if basesize_ratio_range[0] == 0.1: # SSD512 COCO
min_sizes.insert(0, int(input_size * 4 / 100))
max_sizes.insert(0, int(input_size * 10 / 100))
elif basesize_ratio_range[0] == 0.15: # SSD512 VOC
min_sizes.insert(0, int(input_size * 7 / 100))
max_sizes.insert(0, int(input_size * 15 / 100))
self.anchor_generators = []
self.anchor_strides = anchor_strides
for k in range(len(anchor_strides)):
base_size = min_sizes[k]
stride = anchor_strides[k]
ctr = ((stride - 1) / 2., (stride - 1) / 2.)
scales = [1., np.sqrt(max_sizes[k] / min_sizes[k])]
ratios = [1.]
for r in anchor_ratios[k]:
ratios += [1 / r, r] # 4 or 6 ratio
anchor_generator = AnchorGenerator(
base_size, scales, ratios, scale_major=False, ctr=ctr)
indices = list(range(len(ratios)))
indices.insert(1, len(indices))
anchor_generator.base_anchors = torch.index_select(
anchor_generator.base_anchors, 0, torch.LongTensor(indices))
self.anchor_generators.append(anchor_generator)
self.target_means = target_means
self.target_stds = target_stds
self.use_sigmoid_cls = False
self.use_focal_loss = False
def init_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
xavier_init(m, distribution='uniform', bias=0)
def forward(self, feats):
cls_scores = []
bbox_preds = []
for feat, reg_conv, cls_conv in zip(feats, self.reg_convs,
self.cls_convs):
cls_scores.append(cls_conv(feat))
bbox_preds.append(reg_conv(feat))
return cls_scores, bbox_preds
def loss_single(self, cls_score, bbox_pred, labels, label_weights,
bbox_targets, bbox_weights, num_total_samples, cfg):
loss_cls_all = F.cross_entropy(
cls_score, labels, reduction='none') * label_weights
pos_inds = (labels > 0).nonzero().view(-1)
neg_inds = (labels == 0).nonzero().view(-1)
num_pos_samples = pos_inds.size(0)
num_neg_samples = cfg.neg_pos_ratio * num_pos_samples
if num_neg_samples > neg_inds.size(0):
num_neg_samples = neg_inds.size(0)
topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
loss_cls_pos = loss_cls_all[pos_inds].sum()
loss_cls_neg = topk_loss_cls_neg.sum()
loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples
loss_reg = weighted_smoothl1(
bbox_pred,
bbox_targets,
bbox_weights,
beta=cfg.smoothl1_beta,
avg_factor=num_total_samples)
return loss_cls[None], loss_reg
def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas,
cfg):
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == len(self.anchor_generators)
anchor_list, valid_flag_list = self.get_anchors(
featmap_sizes, img_metas)
cls_reg_targets = anchor_target(
anchor_list,
valid_flag_list,
gt_bboxes,
img_metas,
self.target_means,
self.target_stds,
cfg,
gt_labels_list=gt_labels,
label_channels=1,
sampling=False,
unmap_outputs=False)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg) = cls_reg_targets
num_images = len(img_metas)
all_cls_scores = torch.cat([
s.permute(0, 2, 3, 1).reshape(
num_images, -1, self.cls_out_channels) for s in cls_scores
], 1)
all_labels = torch.cat(labels_list, -1).view(num_images, -1)
all_label_weights = torch.cat(label_weights_list, -1).view(
num_images, -1)
all_bbox_preds = torch.cat([
b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
for b in bbox_preds
], -2)
all_bbox_targets = torch.cat(bbox_targets_list, -2).view(
num_images, -1, 4)
all_bbox_weights = torch.cat(bbox_weights_list, -2).view(
num_images, -1, 4)
losses_cls, losses_reg = multi_apply(
self.loss_single,
all_cls_scores,
all_bbox_preds,
all_labels,
all_label_weights,
all_bbox_targets,
all_bbox_weights,
num_total_samples=num_total_pos,
cfg=cfg)
return dict(loss_cls=losses_cls, loss_reg=losses_reg)
================================================
FILE: mmdet/models/backbones/__init__.py
================================================
from .resnet import ResNet
from .resnext import ResNeXt
from .ssd_vgg import SSDVGG
__all__ = ['ResNet', 'ResNeXt', 'SSDVGG']
================================================
FILE: mmdet/models/backbones/resnet.py
================================================
import logging
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn import constant_init, kaiming_init
from mmcv.runner import load_checkpoint
from mmdet.ops import DeformConv, ModulatedDeformConv
from ..registry import BACKBONES
from ..utils import build_norm_layer
def conv3x3(in_planes, out_planes, stride=1, dilation=1):
"3x3 convolution with padding"
return nn.Conv2d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=dilation,
dilation=dilation,
bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self,
inplanes,
planes,
stride=1,
dilation=1,
downsample=None,
style='pytorch',
with_cp=False,
normalize=dict(type='BN')):
super(BasicBlock, self).__init__()
self.norm1_name, norm1 = build_norm_layer(normalize, planes, postfix=1)
self.norm2_name, norm2 = build_norm_layer(normalize, planes, postfix=2)
self.conv1 = conv3x3(inplanes, planes, stride, dilation)
self.add_module(self.norm1_name, norm1)
self.conv2 = conv3x3(planes, planes)
self.add_module(self.norm2_name, norm2)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
self.dilation = dilation
assert not with_cp
@property
def norm1(self):
return getattr(self, self.norm1_name)
@property
def norm2(self):
return getattr(self, self.norm2_name)
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.norm1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.norm2(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self,
inplanes,
planes,
stride=1,
dilation=1,
downsample=None,
style='pytorch',
with_cp=False,
normalize=dict(type='BN'),
dcn=None):
"""Bottleneck block for ResNet.
If style is "pytorch", the stride-two layer is the 3x3 conv layer,
if it is "caffe", the stride-two layer is the first 1x1 conv layer.
"""
super(Bottleneck, self).__init__()
assert style in ['pytorch', 'caffe']
assert dcn is None or isinstance(dcn, dict)
self.inplanes = inplanes
self.planes = planes
self.normalize = normalize
self.dcn = dcn
self.with_dcn = dcn is not None
if style == 'pytorch':
self.conv1_stride = 1
self.conv2_stride = stride
else:
self.conv1_stride = stride
self.conv2_stride = 1
self.norm1_name, norm1 = build_norm_layer(normalize, planes, postfix=1)
self.norm2_name, norm2 = build_norm_layer(normalize, planes, postfix=2)
self.norm3_name, norm3 = build_norm_layer(
normalize, planes * self.expansion, postfix=3)
self.conv1 = nn.Conv2d(
inplanes,
planes,
kernel_size=1,
stride=self.conv1_stride,
bias=False)
self.add_module(self.norm1_name, norm1)
fallback_on_stride = False
self.with_modulated_dcn = False
if self.with_dcn:
fallback_on_stride = dcn.get('fallback_on_stride', False)
self.with_modulated_dcn = dcn.get('modulated', False)
if not self.with_dcn or fallback_on_stride:
self.conv2 = nn.Conv2d(
planes,
planes,
kernel_size=3,
stride=self.conv2_stride,
padding=dilation,
dilation=dilation,
bias=False)
else:
deformable_groups = dcn.get('deformable_groups', 1)
if not self.with_modulated_dcn:
conv_op = DeformConv
offset_channels = 18
else:
conv_op = ModulatedDeformConv
offset_channels = 27
self.conv2_offset = nn.Conv2d(
planes,
deformable_groups * offset_channels,
kernel_size=3,
stride=self.conv2_stride,
padding=dilation,
dilation=dilation)
self.conv2 = conv_op(
planes,
planes,
kernel_size=3,
stride=self.conv2_stride,
padding=dilation,
dilation=dilation,
deformable_groups=deformable_groups,
bias=False)
self.add_module(self.norm2_name, norm2)
self.conv3 = nn.Conv2d(
planes, planes * self.expansion, kernel_size=1, bias=False)
self.add_module(self.norm3_name, norm3)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
self.dilation = dilation
self.with_cp = with_cp
self.normalize = normalize
@property
def norm1(self):
return getattr(self, self.norm1_name)
@property
def norm2(self):
return getattr(self, self.norm2_name)
@property
def norm3(self):
return getattr(self, self.norm3_name)
def forward(self, x):
def _inner_forward(x):
identity = x
out = self.conv1(x)
out = self.norm1(out)
out = self.relu(out)
if not self.with_dcn:
out = self.conv2(out)
elif self.with_modulated_dcn:
offset_mask = self.conv2_offset(out)
offset = offset_mask[:, :18, :, :]
mask = offset_mask[:, -9:, :, :].sigmoid()
out = self.conv2(out, offset, mask)
else:
offset = self.conv2_offset(out)
out = self.conv2(out, offset)
out = self.norm2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.norm3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
return out
if self.with_cp and x.requires_grad:
out = cp.checkpoint(_inner_forward, x)
else:
out = _inner_forward(x)
out = self.relu(out)
return out
def make_res_layer(block,
inplanes,
planes,
blocks,
stride=1,
dilation=1,
style='pytorch',
with_cp=False,
normalize=dict(type='BN'),
dcn=None):
downsample = None
if stride != 1 or inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(
inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False),
build_norm_layer(normalize, planes * block.expansion)[1],
)
layers = []
layers.append(
block(
inplanes,
planes,
stride,
dilation,
downsample,
style=style,
with_cp=with_cp,
normalize=normalize,
dcn=dcn))
inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(
block(
inplanes,
planes,
1,
dilation,
style=style,
with_cp=with_cp,
normalize=normalize,
dcn=dcn))
return nn.Sequential(*layers)
@BACKBONES.register_module
class ResNet(nn.Module):
"""ResNet backbone.
Args:
depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
num_stages (int): Resnet stages, normally 4.
strides (Sequence[int]): Strides of the first block of each stage.
dilations (Sequence[int]): Dilation of each stage.
out_indices (Sequence[int]): Output from which stages.
style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
layer is the 3x3 conv layer, otherwise the stride-two layer is
the first 1x1 conv layer.
frozen_stages (int): Stages to be frozen (all param fixed). -1 means
not freezing any parameters.
normalize (dict): dictionary to construct and config norm layer.
norm_eval (bool): Whether to set norm layers to eval mode, namely,
freeze running stats (mean and var). Note: Effect on Batch Norm
and its variants only.
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
memory while slowing down the training speed.
zero_init_residual (bool): whether to use zero init for last norm layer
in resblocks to let them behave as identity.
"""
arch_settings = {
18: (BasicBlock, (2, 2, 2, 2)),
34: (BasicBlock, (3, 4, 6, 3)),
50: (Bottleneck, (3, 4, 6, 3)),
101: (Bottleneck, (3, 4, 23, 3)),
152: (Bottleneck, (3, 8, 36, 3))
}
def __init__(self,
depth,
num_stages=4,
strides=(1, 2, 2, 2),
dilations=(1, 1, 1, 1),
out_indices=(0, 1, 2, 3),
style='pytorch',
frozen_stages=-1,
normalize=dict(type='BN', frozen=False),
norm_eval=True,
dcn=None,
stage_with_dcn=(False, False, False, False),
with_cp=False,
zero_init_residual=True):
super(ResNet, self).__init__()
if depth not in self.arch_settings:
raise KeyError('invalid depth {} for resnet'.format(depth))
self.depth = depth
self.num_stages = num_stages
assert num_stages >= 1 and num_stages <= 4
self.strides = strides
self.dilations = dilations
assert len(strides) == len(dilations) == len(
stage_with_dcn) == num_stages
self.out_indices = out_indices
assert max(out_indices) < num_stages
self.style = style
self.frozen_stages = frozen_stages
self.normalize = normalize
self.with_cp = with_cp
self.norm_eval = norm_eval
self.dcn = dcn
self.stage_with_dcn = stage_with_dcn
self.zero_init_residual = zero_init_residual
self.block, stage_blocks = self.arch_settings[depth]
self.stage_blocks = stage_blocks[:num_stages]
self.inplanes = 64
self._make_stem_layer()
self.res_layers = []
for i, num_blocks in enumerate(self.stage_blocks):
stride = strides[i]
dilation = dilations[i]
dcn = self.dcn if self.stage_with_dcn[i] else None
planes = 64 * 2**i
res_layer = make_res_layer(
self.block,
self.inplanes,
planes,
num_blocks,
stride=stride,
dilation=dilation,
style=self.style,
with_cp=with_cp,
normalize=normalize,
dcn=dcn)
self.inplanes = planes * self.block.expansion
layer_name = 'layer{}'.format(i + 1)
self.add_module(layer_name, res_layer)
self.res_layers.append(layer_name)
self._freeze_stages()
self.feat_dim = self.block.expansion * 64 * 2**(
len(self.stage_blocks) - 1)
@property
def norm1(self):
return getattr(self, self.norm1_name)
def _make_stem_layer(self):
self.conv1 = nn.Conv2d(
3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.norm1_name, norm1 = build_norm_layer(
self.normalize, 64, postfix=1)
self.add_module(self.norm1_name, norm1)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
def _freeze_stages(self):
if self.frozen_stages >= 0:
for m in [self.conv1, self.norm1]:
for param in m.parameters():
param.requires_grad = False
for i in range(1, self.frozen_stages + 1):
m = getattr(self, 'layer{}'.format(i))
for param in m.parameters():
param.requires_grad = False
def init_weights(self, pretrained=None):
if isinstance(pretrained, str):
logger = logging.getLogger()
load_checkpoint(self, pretrained, strict=False, logger=logger)
elif pretrained is None:
for m in self.modules():
if isinstance(m, nn.Conv2d):
kaiming_init(m)
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
constant_init(m, 1)
if self.dcn is not None:
for m in self.modules():
if isinstance(m, Bottleneck) and hasattr(
m, 'conv2_offset'):
constant_init(m.conv2_offset, 0)
if self.zero_init_residual:
for m in self.modules():
if isinstance(m, Bottleneck):
constant_init(m.norm3, 0)
elif isinstance(m, BasicBlock):
constant_init(m.norm2, 0)
else:
raise TypeError('pretrained must be a str or None')
def forward(self, x):
x = self.conv1(x)
x = self.norm1(x)
x = self.relu(x)
x = self.maxpool(x)
outs = []
for i, layer_name in enumerate(self.res_layers):
res_layer = getattr(self, layer_name)
x = res_layer(x)
if i in self.out_indices:
outs.append(x)
if len(outs) == 1:
return outs[0]
else:
return tuple(outs)
def train(self, mode=True):
super(ResNet, self).train(mode)
if mode and self.norm_eval:
for m in self.modules():
# trick: eval have effect on BatchNorm only
if isinstance(m, nn.BatchNorm2d):
m.eval()
================================================
FILE: mmdet/models/backbones/resnext.py
================================================
import math
import torch.nn as nn
from mmdet.ops import DeformConv, ModulatedDeformConv
from .resnet import Bottleneck as _Bottleneck
from .resnet import ResNet
from ..registry import BACKBONES
from ..utils import build_norm_layer
class Bottleneck(_Bottleneck):
def __init__(self, *args, groups=1, base_width=4, **kwargs):
"""Bottleneck block for ResNeXt.
If style is "pytorch", the stride-two layer is the 3x3 conv layer,
if it is "caffe", the stride-two layer is the first 1x1 conv layer.
"""
super(Bottleneck, self).__init__(*args, **kwargs)
if groups == 1:
width = self.planes
else:
width = math.floor(self.planes * (base_width / 64)) * groups
self.norm1_name, norm1 = build_norm_layer(
self.normalize, width, postfix=1)
self.norm2_name, norm2 = build_norm_layer(
self.normalize, width, postfix=2)
self.norm3_name, norm3 = build_norm_layer(
self.normalize, self.planes * self.expansion, postfix=3)
self.conv1 = nn.Conv2d(
self.inplanes,
width,
kernel_size=1,
stride=self.conv1_stride,
bias=False)
self.add_module(self.norm1_name, norm1)
fallback_on_stride = False
self.with_modulated_dcn = False
if self.with_dcn:
fallback_on_stride = self.dcn.get('fallback_on_stride', False)
self.with_modulated_dcn = self.dcn.get('modulated', False)
if not self.with_dcn or fallback_on_stride:
self.conv2 = nn.Conv2d(
width,
width,
kernel_size=3,
stride=self.conv2_stride,
padding=self.dilation,
dilation=self.dilation,
groups=groups,
bias=False)
else:
groups = self.dcn.get('groups', 1)
deformable_groups = self.dcn.get('deformable_groups', 1)
if not self.with_modulated_dcn:
conv_op = DeformConv
offset_channels = 18
else:
conv_op = ModulatedDeformConv
offset_channels = 27
self.conv2_offset = nn.Conv2d(
width,
deformable_groups * offset_channels,
kernel_size=3,
stride=self.conv2_stride,
padding=self.dilation,
dilation=self.dilation)
self.conv2 = conv_op(
width,
width,
kernel_size=3,
stride=self.conv2_stride,
padding=self.dilation,
dilation=self.dilation,
groups=groups,
deformable_groups=deformable_groups,
bias=False)
self.add_module(self.norm2_name, norm2)
self.conv3 = nn.Conv2d(
width, self.planes * self.expansion, kernel_size=1, bias=False)
self.add_module(self.norm3_name, norm3)
def make_res_layer(block,
inplanes,
planes,
blocks,
stride=1,
dilation=1,
groups=1,
base_width=4,
style='pytorch',
with_cp=False,
normalize=dict(type='BN'),
dcn=None):
downsample = None
if stride != 1 or inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(
inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False),
build_norm_layer(normalize, planes * block.expansion)[1],
)
layers = []
layers.append(
block(
inplanes,
planes,
stride=stride,
dilation=dilation,
downsample=downsample,
groups=groups,
base_width=base_width,
style=style,
with_cp=with_cp,
normalize=normalize,
dcn=dcn))
inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(
block(
inplanes,
planes,
stride=1,
dilation=dilation,
groups=groups,
base_width=base_width,
style=style,
with_cp=with_cp,
normalize=normalize,
dcn=dcn))
return nn.Sequential(*layers)
@BACKBONES.register_module
class ResNeXt(ResNet):
"""ResNeXt backbone.
Args:
depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
num_stages (int): Resnet stages, normally 4.
groups (int): Group of resnext.
base_width (int): Base width of resnext.
strides (Sequence[int]): Strides of the first block of each stage.
dilations (Sequence[int]): Dilation of each stage.
out_indices (Sequence[int]): Output from which stages.
style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
layer is the 3x3 conv layer, otherwise the stride-two layer is
the first 1x1 conv layer.
frozen_stages (int): Stages to be frozen (all param fixed). -1 means
not freezing any parameters.
normalize (dict): dictionary to construct and config norm layer.
norm_eval (bool): Whether to set norm layers to eval mode, namely,
freeze running stats (mean and var). Note: Effect on Batch Norm
and its variants only.
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
memory while slowing down the training speed.
zero_init_residual (bool): whether to use zero init for last norm layer
in resblocks to let them behave as identity.
"""
arch_settings = {
50: (Bottleneck, (3, 4, 6, 3)),
101: (Bottleneck, (3, 4, 23, 3)),
152: (Bottleneck, (3, 8, 36, 3))
}
def __init__(self, groups=1, base_width=4, **kwargs):
super(ResNeXt, self).__init__(**kwargs)
self.groups = groups
self.base_width = base_width
self.inplanes = 64
self.res_layers = []
for i, num_blocks in enumerate(self.stage_blocks):
stride = self.strides[i]
dilation = self.dilations[i]
dcn = self.dcn if self.stage_with_dcn[i] else None
planes = 64 * 2**i
res_layer = make_res_layer(
self.block,
self.inplanes,
planes,
num_blocks,
stride=stride,
dilation=dilation,
groups=self.groups,
base_width=self.base_width,
style=self.style,
with_cp=self.with_cp,
normalize=self.normalize,
dcn=dcn)
self.inplanes = planes * self.block.expansion
layer_name = 'layer{}'.format(i + 1)
self.add_module(layer_name, res_layer)
self.res_layers.append(layer_name)
self._freeze_stages()
================================================
FILE: mmdet/models/backbones/ssd_vgg.py
================================================
import logging
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (VGG, xavier_init, constant_init, kaiming_init,
normal_init)
from mmcv.runner import load_checkpoint
from ..registry import BACKBONES
@BACKBONES.register_module
class SSDVGG(VGG):
extra_setting = {
300: (256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256),
512: (256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128),
}
def __init__(self,
input_size,
depth,
with_last_pool=False,
ceil_mode=True,
out_indices=(3, 4),
out_feature_indices=(22, 34),
l2_norm_scale=20.):
super(SSDVGG, self).__init__(
depth,
with_last_pool=with_last_pool,
ceil_mode=ceil_mode,
out_indices=out_indices)
assert input_size in (300, 512)
self.input_size = input_size
self.features.add_module(
str(len(self.features)),
nn.MaxPool2d(kernel_size=3, stride=1, padding=1))
self.features.add_module(
str(len(self.features)),
nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6))
self.features.add_module(
str(len(self.features)), nn.ReLU(inplace=True))
self.features.add_module(
str(len(self.features)), nn.Conv2d(1024, 1024, kernel_size=1))
self.features.add_module(
str(len(self.features)), nn.ReLU(inplace=True))
self.out_feature_indices = out_feature_indices
self.inplanes = 1024
self.extra = self._make_extra_layers(self.extra_setting[input_size])
self.l2_norm = L2Norm(
self.features[out_feature_indices[0] - 1].out_channels,
l2_norm_scale)
def init_weights(self, pretrained=None):
if isinstance(pretrained, str):
logger = logging.getLogger()
load_checkpoint(self, pretrained, strict=False, logger=logger)
elif pretrained is None:
for m in self.features.modules():
if isinstance(m, nn.Conv2d):
kaiming_init(m)
elif isinstance(m, nn.BatchNorm2d):
constant_init(m, 1)
elif isinstance(m, nn.Linear):
normal_init(m, std=0.01)
else:
raise TypeError('pretrained must be a str or None')
for m in self.extra.modules():
if isinstance(m, nn.Conv2d):
xavier_init(m, distribution='uniform')
constant_init(self.l2_norm, self.l2_norm.scale)
def forward(self, x):
outs = []
for i, layer in enumerate(self.features):
x = layer(x)
if i in self.out_feature_indices:
outs.append(x)
for i, layer in enumerate(self.extra):
x = F.relu(layer(x), inplace=True)
if i % 2 == 1:
outs.append(x)
outs[0] = self.l2_norm(outs[0])
if len(outs) == 1:
return outs[0]
else:
return tuple(outs)
def _make_extra_layers(self, outplanes):
layers = []
kernel_sizes = (1, 3)
num_layers = 0
outplane = None
for i in range(len(outplanes)):
if self.inplanes == 'S':
self.inplanes = outplane
continue
k = kernel_sizes[num_layers % 2]
if outplanes[i] == 'S':
outplane = outplanes[i + 1]
conv = nn.Conv2d(
self.inplanes, outplane, k, stride=2, padding=1)
else:
outplane = outplanes[i]
conv = nn.Conv2d(
self.inplanes, outplane, k, stride=1, padding=0)
layers.append(conv)
self.inplanes = outplanes[i]
num_layers += 1
if self.input_size == 512:
layers.append(nn.Conv2d(self.inplanes, 256, 4, padding=1))
return nn.Sequential(*layers)
class L2Norm(nn.Module):
def __init__(self, n_dims, scale=20., eps=1e-10):
super(L2Norm, self).__init__()
self.n_dims = n_dims
self.weight = nn.Parameter(torch.Tensor(self.n_dims))
self.eps = eps
self.scale = scale
def forward(self, x):
norm = x.pow(2).sum(1, keepdim=True).sqrt() + self.eps
return self.weight[None, :, None, None].expand_as(x) * x / norm
================================================
FILE: mmdet/models/bbox_heads/__init__.py
================================================
from .bbox_head import BBoxHead
from .convfc_bbox_head import ConvFCBBoxHead, SharedFCBBoxHead
from .graph_bbox_head import GraphBBoxHead
__all__ = ['BBoxHead', 'ConvFCBBoxHead', 'SharedFCBBoxHead', 'GraphBBoxHead']
================================================
FILE: mmdet/models/bbox_heads/bbox_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.core import (delta2bbox, multiclass_nms, bbox_target,
weighted_cross_entropy, weighted_smoothl1, accuracy)
from ..registry import HEADS
@HEADS.register_module
class BBoxHead(nn.Module):
"""Simplest RoI head, with only two fc layers for classification and
regression respectively"""
def __init__(self,
with_avg_pool=False,
with_cls=True,
with_reg=True,
roi_feat_size=7,
in_channels=256,
num_classes=81,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False):
super(BBoxHead, self).__init__()
assert with_cls or with_reg
self.with_avg_pool = with_avg_pool
self.with_cls = with_cls
self.with_reg = with_reg
self.roi_feat_size = roi_feat_size
self.in_channels = in_channels
self.num_classes = num_classes
self.target_means = target_means
self.target_stds = target_stds
self.reg_class_agnostic = reg_class_agnostic
in_channels = self.in_channels
if self.with_avg_pool:
self.avg_pool = nn.AvgPool2d(roi_feat_size)
else:
in_channels *= (self.roi_feat_size * self.roi_feat_size)
if self.with_cls:
self.fc_cls = nn.Linear(in_channels, num_classes)
if self.with_reg:
out_dim_reg = 4 if reg_class_agnostic else 4 * num_classes
self.fc_reg = nn.Linear(in_channels, out_dim_reg)
self.debug_imgs = None
def init_weights(self):
if self.with_cls:
nn.init.normal_(self.fc_cls.weight, 0, 0.01)
nn.init.constant_(self.fc_cls.bias, 0)
if self.with_reg:
nn.init.normal_(self.fc_reg.weight, 0, 0.001)
nn.init.constant_(self.fc_reg.bias, 0)
def forward(self, x):
if self.with_avg_pool:
x = self.avg_pool(x)
x = x.view(x.size(0), -1)
cls_score = self.fc_cls(x) if self.with_cls else None
bbox_pred = self.fc_reg(x) if self.with_reg else None
return cls_score, bbox_pred
def get_target(self, sampling_results, gt_bboxes, gt_labels,
rcnn_train_cfg):
pos_proposals = [res.pos_bboxes for res in sampling_results]
neg_proposals = [res.neg_bboxes for res in sampling_results]
pos_gt_bboxes = [res.pos_gt_bboxes for res in sampling_results]
pos_gt_labels = [res.pos_gt_labels for res in sampling_results]
reg_classes = 1 if self.reg_class_agnostic else self.num_classes
cls_reg_targets = bbox_target(
pos_proposals,
neg_proposals,
pos_gt_bboxes,
pos_gt_labels,
rcnn_train_cfg,
reg_classes,
target_means=self.target_means,
target_stds=self.target_stds)
return cls_reg_targets
def loss(self,
cls_score,
bbox_pred,
labels,
label_weights,
bbox_targets,
bbox_weights,
reduce=True):
losses = dict()
if cls_score is not None:
losses['loss_cls'] = weighted_cross_entropy(
cls_score, labels, label_weights, reduce=reduce)
losses['acc'] = accuracy(cls_score, labels)
if bbox_pred is not None:
losses['loss_reg'] = weighted_smoothl1(
bbox_pred,
bbox_targets,
bbox_weights,
avg_factor=bbox_targets.size(0))
return losses
def get_det_bboxes(self,
rois,
cls_score,
bbox_pred,
img_shape,
scale_factor,
rescale=False,
cfg=None):
if isinstance(cls_score, list):
cls_score = sum(cls_score) / float(len(cls_score))
scores = F.softmax(cls_score, dim=1) if cls_score is not None else None
if bbox_pred is not None:
bboxes = delta2bbox(rois[:, 1:], bbox_pred, self.target_means,
self.target_stds, img_shape)
else:
bboxes = rois[:, 1:]
# TODO: add clip here
if rescale:
bboxes /= scale_factor
if cfg is None:
return bboxes, scores
else:
det_bboxes, det_labels = multiclass_nms(
bboxes, scores, cfg.score_thr, cfg.nms, cfg.max_per_img)
return det_bboxes, det_labels
def refine_bboxes(self, rois, labels, bbox_preds, pos_is_gts, img_metas):
"""Refine bboxes during training.
Args:
rois (Tensor): Shape (n*bs, 5), where n is image number per GPU,
and bs is the sampled RoIs per image.
labels (Tensor): Shape (n*bs, ).
bbox_preds (Tensor): Shape (n*bs, 4) or (n*bs, 4*#class).
pos_is_gts (list[Tensor]): Flags indicating if each positive bbox
is a gt bbox.
img_metas (list[dict]): Meta info of each image.
Returns:
list[Tensor]: Refined bboxes of each image in a mini-batch.
"""
img_ids = rois[:, 0].long().unique(sorted=True)
assert img_ids.numel() == len(img_metas)
bboxes_list = []
for i in range(len(img_metas)):
inds = torch.nonzero(rois[:, 0] == i).squeeze()
num_rois = inds.numel()
bboxes_ = rois[inds, 1:]
label_ = labels[inds]
bbox_pred_ = bbox_preds[inds]
img_meta_ = img_metas[i]
pos_is_gts_ = pos_is_gts[i]
bboxes = self.regress_by_class(bboxes_, label_, bbox_pred_,
img_meta_)
# filter gt bboxes
pos_keep = 1 - pos_is_gts_
keep_inds = pos_is_gts_.new_ones(num_rois)
keep_inds[:len(pos_is_gts_)] = pos_keep
bboxes_list.append(bboxes[keep_inds])
return bboxes_list
def regress_by_class(self, rois, label, bbox_pred, img_meta):
"""Regress the bbox for the predicted class. Used in Cascade R-CNN.
Args:
rois (Tensor): shape (n, 4) or (n, 5)
label (Tensor): shape (n, )
bbox_pred (Tensor): shape (n, 4*(#class+1)) or (n, 4)
img_meta (dict): Image meta info.
Returns:
Tensor: Regressed bboxes, the same shape as input rois.
"""
assert rois.size(1) == 4 or rois.size(1) == 5
if not self.reg_class_agnostic:
label = label * 4
inds = torch.stack((label, label + 1, label + 2, label + 3), 1)
bbox_pred = torch.gather(bbox_pred, 1, inds)
assert bbox_pred.size(1) == 4
if rois.size(1) == 4:
new_rois = delta2bbox(rois, bbox_pred, self.target_means,
self.target_stds, img_meta['img_shape'])
else:
bboxes = delta2bbox(rois[:, 1:], bbox_pred, self.target_means,
self.target_stds, img_meta['img_shape'])
new_rois = torch.cat((rois[:, [0]], bboxes), dim=1)
return new_rois
================================================
FILE: mmdet/models/bbox_heads/convfc_bbox_head.py
================================================
import torch.nn as nn
from .bbox_head import BBoxHead
from ..registry import HEADS
from ..utils import ConvModule
@HEADS.register_module
class ConvFCBBoxHead(BBoxHead):
"""More general bbox head, with shared conv and fc layers and two optional
separated branches.
/-> cls convs -> cls fcs -> cls
shared convs -> shared fcs
\-> reg convs -> reg fcs -> reg
""" # noqa: W605
def __init__(self,
num_shared_convs=0,
num_shared_fcs=0,
num_cls_convs=0,
num_cls_fcs=0,
num_reg_convs=0,
num_reg_fcs=0,
conv_out_channels=256,
fc_out_channels=1024,
normalize=None,
*args,
**kwargs):
super(ConvFCBBoxHead, self).__init__(*args, **kwargs)
assert (num_shared_convs + num_shared_fcs + num_cls_convs + num_cls_fcs
+ num_reg_convs + num_reg_fcs > 0)
if num_cls_convs > 0 or num_reg_convs > 0:
assert num_shared_fcs == 0
if not self.with_cls:
assert num_cls_convs == 0 and num_cls_fcs == 0
if not self.with_reg:
assert num_reg_convs == 0 and num_reg_fcs == 0
self.num_shared_convs = num_shared_convs
self.num_shared_fcs = num_shared_fcs
self.num_cls_convs = num_cls_convs
self.num_cls_fcs = num_cls_fcs
self.num_reg_convs = num_reg_convs
self.num_reg_fcs = num_reg_fcs
self.conv_out_channels = conv_out_channels
self.fc_out_channels = fc_out_channels
self.normalize = normalize
self.with_bias = normalize is None
# add shared convs and fcs
self.shared_convs, self.shared_fcs, last_layer_dim = \
self._add_conv_fc_branch(
self.num_shared_convs, self.num_shared_fcs, self.in_channels,
True)
self.shared_out_channels = last_layer_dim
# add cls specific branch
self.cls_convs, self.cls_fcs, self.cls_last_dim = \
self._add_conv_fc_branch(
self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels)
# add reg specific branch
self.reg_convs, self.reg_fcs, self.reg_last_dim = \
self._add_conv_fc_branch(
self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels)
if self.num_shared_fcs == 0 and not self.with_avg_pool:
if self.num_cls_fcs == 0:
self.cls_last_dim *= (self.roi_feat_size * self.roi_feat_size)
if self.num_reg_fcs == 0:
self.reg_last_dim *= (self.roi_feat_size * self.roi_feat_size)
self.relu = nn.ReLU(inplace=True)
# reconstruct fc_cls and fc_reg since input channels are changed
if self.with_cls:
self.fc_cls = nn.Linear(self.cls_last_dim, self.num_classes)
if self.with_reg:
out_dim_reg = (4 if self.reg_class_agnostic else
4 * self.num_classes)
self.fc_reg = nn.Linear(self.reg_last_dim, out_dim_reg)
def _add_conv_fc_branch(self,
num_branch_convs,
num_branch_fcs,
in_channels,
is_shared=False):
"""Add shared or separable branch
convs -> avg pool (optional) -> fcs
"""
last_layer_dim = in_channels
# add branch specific conv layers
branch_convs = nn.ModuleList()
if num_branch_convs > 0:
for i in range(num_branch_convs):
conv_in_channels = (last_layer_dim
if i == 0 else self.conv_out_channels)
branch_convs.append(
ConvModule(
conv_in_channels,
self.conv_out_channels,
3,
padding=1,
normalize=self.normalize,
bias=self.with_bias))
last_layer_dim = self.conv_out_channels
# add branch specific fc layers
branch_fcs = nn.ModuleList()
if num_branch_fcs > 0:
# for shared branch, only consider self.with_avg_pool
# for separated branches, also consider self.num_shared_fcs
if (is_shared
or self.num_shared_fcs == 0) and not self.with_avg_pool:
last_layer_dim *= (self.roi_feat_size * self.roi_feat_size)
for i in range(num_branch_fcs):
fc_in_channels = (last_layer_dim
if i == 0 else self.fc_out_channels)
branch_fcs.append(
nn.Linear(fc_in_channels, self.fc_out_channels))
last_layer_dim = self.fc_out_channels
return branch_convs, branch_fcs, last_layer_dim
def init_weights(self):
super(ConvFCBBoxHead, self).init_weights()
for module_list in [self.shared_fcs, self.cls_fcs, self.reg_fcs]:
for m in module_list.modules():
if isinstance(m, nn.Linear):
nn.init.xavier_uniform_(m.weight)
nn.init.constant_(m.bias, 0)
def forward(self, x):
# shared part
if self.num_shared_convs > 0:
for conv in self.shared_convs:
x = conv(x)
if self.num_shared_fcs > 0:
if self.with_avg_pool:
x = self.avg_pool(x)
x = x.view(x.size(0), -1)
for fc in self.shared_fcs:
x = self.relu(fc(x))
# separate branches
x_cls = x
x_reg = x
for conv in self.cls_convs:
x_cls = conv(x_cls)
if x_cls.dim() > 2:
if self.with_avg_pool:
x_cls = self.avg_pool(x_cls)
x_cls = x_cls.view(x_cls.size(0), -1)
for fc in self.cls_fcs:
x_cls = self.relu(fc(x_cls))
for conv in self.reg_convs:
x_reg = conv(x_reg)
if x_reg.dim() > 2:
if self.with_avg_pool:
x_reg = self.avg_pool(x_reg)
x_reg = x_reg.view(x_reg.size(0), -1)
for fc in self.reg_fcs:
x_reg = self.relu(fc(x_reg))
cls_score = self.fc_cls(x_cls) if self.with_cls else None
bbox_pred = self.fc_reg(x_reg) if self.with_reg else None
return cls_score, bbox_pred
@HEADS.register_module
class SharedFCBBoxHead(ConvFCBBoxHead):
def __init__(self, num_fcs=2, fc_out_channels=1024, *args, **kwargs):
assert num_fcs >= 1
super(SharedFCBBoxHead, self).__init__(
num_shared_convs=0,
num_shared_fcs=num_fcs,
num_cls_convs=0,
num_cls_fcs=0,
num_reg_convs=0,
num_reg_fcs=0,
fc_out_channels=fc_out_channels,
*args,
**kwargs)
================================================
FILE: mmdet/models/bbox_heads/convfc_bbox_head_enhanced.py
================================================
import torch
import torch.nn as nn
from .bbox_head import BBoxHead
from ..utils import ConvModule
class ConvFCRoIHeadEnhance(BBoxHead):
"""More general bbox head, with shared conv and fc layers and two optional
separated branches.
/-> cls convs -> cls fcs -> cls
shared convs -> shared fcs
\-> reg convs -> reg fcs -> reg
"""
def __init__(self,
num_shared_convs=0,
num_shared_fcs=2,
num_cls_convs=0,
num_cls_fcs=0,
num_reg_convs=0,
num_reg_fcs=0,
conv_out_channels=256,
fc_out_channels=1024,
enhance_channels=256,
*args,
**kwargs):
super(ConvFCRoIHeadEnhance, self).__init__(*args, **kwargs)
#assert (num_shared_convs + num_shared_fcs + num_cls_convs + num_cls_fcs
# + num_reg_convs + num_reg_fcs > 0)
if num_cls_convs > 0 or num_reg_convs > 0:
assert num_shared_fcs == 0
if not self.with_cls:
assert num_cls_convs == 0 and num_cls_fcs == 0
if not self.with_reg:
assert num_reg_convs == 0 and num_reg_fcs == 0
self.num_shared_convs = num_shared_convs
self.num_shared_fcs = num_shared_fcs
self.num_cls_convs = num_cls_convs
self.num_cls_fcs = num_cls_fcs
self.num_reg_convs = num_reg_convs
self.num_reg_fcs = num_reg_fcs
self.conv_out_channels = conv_out_channels
self.fc_out_channels = fc_out_channels
self.enhance_channels = enhance_channels
# add shared convs and fcs
self.shared_convs, self.shared_fcs, last_layer_dim = \
self._add_conv_fc_branch(
self.num_shared_convs, self.num_shared_fcs, self.in_channels,
True)
self.shared_out_channels = last_layer_dim
# add cls specific branch
self.cls_convs, self.cls_fcs, self.cls_last_dim = \
self._add_conv_fc_branch(
self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels)
# add reg specific branch
self.reg_convs, self.reg_fcs, self.reg_last_dim = \
self._add_conv_fc_branch(
self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels)
if (num_shared_convs + num_shared_fcs + num_cls_convs + num_cls_fcs
+ num_reg_convs + num_reg_fcs > 0):
if self.num_shared_fcs == 0 and not self.with_avg_pool:
if self.num_cls_fcs == 0:
self.cls_last_dim *= (self.roi_feat_size * self.roi_feat_size)
if self.num_reg_fcs == 0:
self.reg_last_dim *= (self.roi_feat_size * self.roi_feat_size)
else:
self.cls_last_dim = self.in_channels
self.reg_last_dim = self.in_channels
self.relu = nn.ReLU(inplace=True)
# reconstruct fc_cls and fc_reg since input channels are changed
if self.with_cls:
self.fc_cls = nn.Linear(self.cls_last_dim+self.enhance_channels, self.num_classes)
if self.with_reg:
out_dim_reg = (4 if self.reg_class_agnostic else
4 * self.num_classes)
self.fc_reg = nn.Linear(self.reg_last_dim+self.enhance_channels, out_dim_reg)
def _add_conv_fc_branch(self,
num_branch_convs,
num_branch_fcs,
in_channels,
is_shared=False):
"""Add shared or separable branch
convs -> avg pool (optional) -> fcs
"""
last_layer_dim = in_channels
# add branch specific conv layers
branch_convs = nn.ModuleList()
if num_branch_convs > 0:
for i in range(num_branch_convs):
conv_in_channels = (last_layer_dim
if i == 0 else self.conv_out_channels)
branch_convs.append(
ConvModule(
conv_in_channels,
self.conv_out_channels,
3,
padding=1,
normalize=self.normalize,
bias=self.with_bias))
last_layer_dim = self.conv_out_channels
# add branch specific fc layers
branch_fcs = nn.ModuleList()
if num_branch_fcs > 0:
# for shared branch, only consider self.with_avg_pool
# for separated branches, also consider self.num_shared_fcs
if (is_shared
or self.num_shared_fcs == 0) and not self.with_avg_pool:
last_layer_dim *= (self.roi_feat_size * self.roi_feat_size)
for i in range(num_branch_fcs):
fc_in_channels = (last_layer_dim
if i == 0 else self.fc_out_channels)
branch_fcs.append(
nn.Linear(fc_in_channels, self.fc_out_channels))
last_layer_dim = self.fc_out_channels
return branch_convs, branch_fcs, last_layer_dim
def init_weights(self):
super(ConvFCRoIHeadEnhance, self).init_weights()
for module_list in [self.shared_fcs, self.cls_fcs, self.reg_fcs]:
for m in module_list.modules():
if isinstance(m, nn.Linear):
nn.init.xavier_uniform_(m.weight)
nn.init.constant_(m.bias, 0)
def forward(self, x, enhanced_feature=None):
# shared part
if self.num_shared_convs > 0:
for conv in self.shared_convs:
x = conv(x)
if self.num_shared_fcs > 0:
if self.with_avg_pool:
x = self.avg_pool(x)
x = x.view(x.size(0), -1)
for fc in self.shared_fcs:
x = self.relu(fc(x))
# separate branches
x = torch.cat([x, enhanced_feature], dim=1)
x_cls = x
x_reg = x
for conv in self.cls_convs:
x_cls = conv(x_cls)
if x_cls.dim() > 2:
if self.with_avg_pool:
x_cls = self.avg_pool(x_cls)
x_cls = x_cls.view(x_cls.size(0), -1)
for fc in self.cls_fcs:
x_cls = self.relu(fc(x_cls))
for conv in self.reg_convs:
x_reg = conv(x_reg)
if x_reg.dim() > 2:
if self.with_avg_pool:
x_reg = self.avg_pool(x_reg)
x_reg = x_reg.view(x_reg.size(0), -1)
for fc in self.reg_fcs:
x_reg = self.relu(fc(x_reg))
cls_score = self.fc_cls(x_cls) if self.with_cls else None
bbox_pred = self.fc_reg(x_reg) if self.with_reg else None
return cls_score, bbox_pred
================================================
FILE: mmdet/models/bbox_heads/graph_bbox_head.py
================================================
import torch.nn as nn
import torch
from .bbox_head import BBoxHead
from ..registry import HEADS
from ..utils import ConvModule
import torch.nn.functional as F
from mmdet.core import (weighted_cross_entropy, weighted_smoothl1, accuracy)
@HEADS.register_module
class GraphBBoxHead(BBoxHead):
"""More general bbox head, with shared conv and fc layers and two optional
separated branches.
/-> cls convs -> cls fcs -> cls
shared convs -> shared fcs
\-> reg convs -> reg fcs -> reg
""" # noqa: W605
def __init__(self,
num_attr_conv=0,
num_rela_conv=0,
num_spat_conv=0,
with_attr=False,
with_rela=False,
with_spat=False,
num_spat_graph=10,
graph_out_channels=256,
nf=64,
ratio=[4, 2, 1],
normalize=None,
num_shared_fcs=0,
fc_out_channels=1024,
*args,
**kwargs):
super(GraphBBoxHead, self).__init__(*args, **kwargs)
# original FPN head
self.num_shared_fcs = num_shared_fcs
self.normalize = normalize
self.with_bias = normalize is None
self.fc_out_channels = fc_out_channels
# add shared convs and fcs
_, self.shared_fcs, last_layer_dim = \
self._add_conv_fc_branch(0, self.in_channels, num_branch_fcs=self.num_shared_fcs)
if num_shared_fcs > 0:
self.cls_last_dim = last_layer_dim
self.reg_last_dim = last_layer_dim
self.in_channels = last_layer_dim
else:
self.cls_last_dim = self.in_channels
self.reg_last_dim = self.in_channels
# corresponding to graph compute
self.attr_transferW = nn.ModuleList()
self.rela_transferW = nn.ModuleList()
self.spat_transferW = nn.ModuleList()
if with_attr:
self.attr_convs, _, _ = self._add_conv_fc_branch(num_attr_conv, self.in_channels, nf, ratio)
self.attr_transferW = nn.Linear(self.in_channels, graph_out_channels)
self.cls_last_dim = self.cls_last_dim + graph_out_channels
self.reg_last_dim = self.reg_last_dim + graph_out_channels
if with_rela:
self.rela_convs, _, _ = self._add_conv_fc_branch(num_rela_conv, self.in_channels, nf, ratio)
self.rela_transferW = nn.Linear(self.in_channels, graph_out_channels)
self.cls_last_dim = self.cls_last_dim + graph_out_channels
self.reg_last_dim = self.reg_last_dim + graph_out_channels
if with_spat:
self.spat_convs, _, _ = self._add_conv_fc_branch(num_spat_conv, 5, nf=5, ratio=[1])
self.spat_transferW = nn.Linear(self.in_channels, graph_out_channels)
self.cls_last_dim = self.cls_last_dim + graph_out_channels
self.reg_last_dim = self.reg_last_dim + graph_out_channels
self.with_attr = with_attr
self.with_rela = with_rela
self.with_spat = with_spat
self.num_spat_graph = num_spat_graph
# classifer and bbox regression
self.relu = nn.ReLU(inplace=True)
# reconstruct fc_cls and fc_reg since input channels are changed
if self.with_cls:
self.fc_cls = nn.Linear(self.cls_last_dim, self.num_classes)
if self.with_reg:
out_dim_reg = (4 if self.reg_class_agnostic else
4 * self.num_classes)
self.fc_reg = nn.Linear(self.reg_last_dim, out_dim_reg)
def _add_conv_fc_branch(self,
num_branch_convs,
in_channels,
nf=0,
ratio=[0],
num_branch_fcs=0):
"""Add shared or separable branch
convs -> avg pool (optional) -> fcs
"""
last_layer_dim = in_channels
# add branch specific conv layers
branch_convs = nn.ModuleList()
if num_branch_convs > 0:
assert num_branch_convs == len(ratio) + 1
for i in range(num_branch_convs):
conv_in_channels = (last_layer_dim
if i == 0 else conv_out_channels)
conv_out_channels = (int(nf * ratio[i])
if i < num_branch_convs - 1 else 1)
branch_convs.append(
ConvModule(
conv_in_channels,
conv_out_channels,
1,
normalize=self.normalize,
bias=self.with_bias))
branch_fcs = nn.ModuleList()
if num_branch_fcs > 0:
# for shared branch, only consider self.with_avg_pool
# for separated branches, also consider self.num_shared_fcs
if not self.with_avg_pool:
last_layer_dim *= (self.roi_feat_size * self.roi_feat_size)
for i in range(num_branch_fcs):
fc_in_channels = (last_layer_dim
if i == 0 else self.fc_out_channels)
branch_fcs.append(
nn.Linear(fc_in_channels, self.fc_out_channels))
last_layer_dim = self.fc_out_channels
return branch_convs, branch_fcs, last_layer_dim
def init_weights(self):
super(GraphBBoxHead, self).init_weights()
for module_list in [self.shared_fcs, self.attr_transferW, self.rela_transferW, self.spat_transferW]:
for m in module_list.modules():
if isinstance(m, nn.Linear):
nn.init.xavier_uniform_(m.weight)
nn.init.constant_(m.bias, 0)
def forward(self, x, geom_f, bs):
# shared part
if self.num_shared_fcs > 0:
if self.with_avg_pool:
x = self.avg_pool(x)
x = x.view(x.size(0), -1)
for fc in self.shared_fcs:
x = self.relu(fc(x))
if x.dim() > 2:
if self.with_avg_pool:
x = self.avg_pool(x)
x = x.view(x.size(0), -1)
feat_dim = x.size(1)
x = x.view(bs, -1, feat_dim)
# compute A adj matrix
a_super = []
enhanced_feat = []
if self.with_attr or self.with_rela:
W1 = x.detach().unsqueeze(2)
W2 = torch.transpose(W1, 1, 2)
diff_W = torch.abs(W1 - W2)
diff_W = torch.transpose(diff_W, 1, 3)
if self.with_attr:
A_a = diff_W
for conv in self.attr_convs:
A_a = conv(A_a)
A_a = A_a.contiguous()
A_a = A_a.squeeze(1)
a_super.append(A_a)
# propogation
enhanced_feat.append(self.propagate_em(x, A_a, self.attr_transferW))
if self.with_rela:
A_r = diff_W
for conv in self.rela_convs:
A_r = conv(A_r)
A_r = A_r.contiguous()
A_r = A_r.squeeze(1)
a_super.append(A_r)
# propogation
enhanced_feat.append(self.propagate_em(x, A_r, self.rela_transferW))
if self.with_spat:
W1 = geom_f.unsqueeze(2)
W2 = torch.transpose(W1, 1, 2)
diff_W = W1 - W2
diff_W = torch.transpose(diff_W, 1, 3)
Iden = torch.eye(diff_W.size(-1)).cuda()
A_s = W2.new_zeros((diff_W.size(-1), diff_W.size(-1)))
for i in range(self.num_spat_graph):
tmp_A = diff_W
for conv in self.spat_convs:
tmp_A = conv(tmp_A)
A_s = tmp_A + A_s + Iden
A_s = A_s.contiguous()
A_s = A_s.squeeze(1)
enhanced_feat.append(self.propagate_em(x, A_s, self.spat_transferW))
enhanced_feat = torch.cat(enhanced_feat, -1)
# separate branches
assert len(x.size()) == len(enhanced_feat.size())
x = torch.cat((x, enhanced_feat), -1)
x_cls = x.view(-1, x.size(-1))
x_reg = x.view(-1, x.size(-1))
cls_score = self.fc_cls(x_cls) if self.with_cls else None
bbox_pred = self.fc_reg(x_reg) if self.with_reg else None
return cls_score, bbox_pred, a_super
def loss(self, cls_score, bbox_pred, A_pred, A_gt, labels, label_weights, bbox_targets,
bbox_weights, reduce=True):
losses = dict()
if cls_score is not None:
losses['loss_cls'] = weighted_cross_entropy(
cls_score, labels, label_weights, reduce=reduce)
losses['acc'] = accuracy(cls_score, labels)
if bbox_pred is not None:
losses['loss_reg'] = weighted_smoothl1(
bbox_pred,
bbox_targets,
bbox_weights,
avg_factor=bbox_targets.size(0))
if A_pred:
assert len(A_pred) == len(A_gt)
assert A_pred[0].size() == A_gt[0].size()
num_a_pred = len(A_pred)
for i in range(num_a_pred):
losses['loss_adj' + str(i)] = F.mse_loss(A_pred[i], A_gt[i].detach())
return losses
def propagate_em(self, x, A, W):
A = F.softmax(A, 2)
x = torch.bmm(A, x)
x = W(x)
return x
================================================
FILE: mmdet/models/builder.py
================================================
import mmcv
from torch import nn
from .registry import BACKBONES, NECKS, ROI_EXTRACTORS, HEADS, DETECTORS
def _build_module(cfg, registry, default_args):
assert isinstance(cfg, dict) and 'type' in cfg
assert isinstance(default_args, dict) or default_args is None
args = cfg.copy()
obj_type = args.pop('type')
if mmcv.is_str(obj_type):
if obj_type not in registry.module_dict:
raise KeyError('{} is not in the {} registry'.format(
obj_type, registry.name))
obj_type = registry.module_dict[obj_type]
elif not isinstance(obj_type, type):
raise TypeError('type must be a str or valid type, but got {}'.format(
type(obj_type)))
if default_args is not None:
for name, value in default_args.items():
args.setdefault(name, value)
return obj_type(**args)
def build(cfg, registry, default_args=None):
if isinstance(cfg, list):
modules = [_build_module(cfg_, registry, default_args) for cfg_ in cfg]
return nn.Sequential(*modules)
else:
return _build_module(cfg, registry, default_args)
def build_backbone(cfg):
return build(cfg, BACKBONES)
def build_neck(cfg):
return build(cfg, NECKS)
def build_roi_extractor(cfg):
return build(cfg, ROI_EXTRACTORS)
def build_head(cfg):
return build(cfg, HEADS)
def build_detector(cfg, train_cfg=None, test_cfg=None):
return build(cfg, DETECTORS, dict(train_cfg=train_cfg, test_cfg=test_cfg))
================================================
FILE: mmdet/models/detectors/__init__.py
================================================
from .base import BaseDetector
from .single_stage import SingleStageDetector
from .two_stage import TwoStageDetector
from .rpn import RPN
from .fast_rcnn import FastRCNN
from .faster_rcnn import FasterRCNN
from .mask_rcnn import MaskRCNN
from .cascade_rcnn import CascadeRCNN
from .retinanet import RetinaNet
__all__ = [
'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN',
'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'RetinaNet'
]
================================================
FILE: mmdet/models/detectors/base.py
================================================
import logging
from abc import ABCMeta, abstractmethod
import mmcv
import numpy as np
import torch.nn as nn
import pycocotools.mask as maskUtils
from mmdet.core import tensor2imgs, get_classes
class BaseDetector(nn.Module):
"""Base class for detectors"""
__metaclass__ = ABCMeta
def __init__(self):
super(BaseDetector, self).__init__()
@property
def with_neck(self):
return hasattr(self, 'neck') and self.neck is not None
@property
def with_bbox(self):
return hasattr(self, 'bbox_head') and self.bbox_head is not None
@property
def with_mask(self):
return hasattr(self, 'mask_head') and self.mask_head is not None
@abstractmethod
def extract_feat(self, imgs):
pass
def extract_feats(self, imgs):
assert isinstance(imgs, list)
for img in imgs:
yield self.extract_feat(img)
@abstractmethod
def forward_train(self, imgs, img_metas, **kwargs):
pass
@abstractmethod
def simple_test(self, img, img_meta, **kwargs):
pass
@abstractmethod
def aug_test(self, imgs, img_metas, **kwargs):
pass
def init_weights(self, pretrained=None):
if pretrained is not None:
logger = logging.getLogger()
logger.info('load model from: {}'.format(pretrained))
def forward_test(self, imgs, img_metas, **kwargs):
for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError('{} must be a list, but got {}'.format(
name, type(var)))
num_augs = len(imgs)
if num_augs != len(img_metas):
raise ValueError(
'num of augmentations ({}) != num of image meta ({})'.format(
len(imgs), len(img_metas)))
# TODO: remove the restriction of imgs_per_gpu == 1 when prepared
imgs_per_gpu = imgs[0].size(0)
assert imgs_per_gpu == 1
if num_augs == 1:
return self.simple_test(imgs[0], img_metas[0], **kwargs)
else:
return self.aug_test(imgs, img_metas, **kwargs)
def forward(self, img, img_meta, return_loss=True, **kwargs):
if return_loss:
return self.forward_train(img, img_meta, **kwargs)
else:
return self.forward_test(img, img_meta, **kwargs)
def show_result(self,
data,
result,
img_norm_cfg,
dataset='coco',
score_thr=0.3):
if isinstance(result, tuple):
bbox_result, segm_result = result
else:
bbox_result, segm_result = result, None
img_tensor = data['img'][0]
img_metas = data['img_meta'][0].data[0]
imgs = tensor2imgs(img_tensor, **img_norm_cfg)
assert len(imgs) == len(img_metas)
if isinstance(dataset, str):
class_names = get_classes(dataset)
elif isinstance(dataset, (list, tuple)) or dataset is None:
class_names = dataset
else:
raise TypeError(
'dataset must be a valid dataset name or a sequence'
' of class names, not {}'.format(type(dataset)))
for img, img_meta in zip(imgs, img_metas):
h, w, _ = img_meta['img_shape']
img_show = img[:h, :w, :]
bboxes = np.vstack(bbox_result)
# draw segmentation masks
if segm_result is not None:
segms = mmcv.concat_list(segm_result)
inds = np.where(bboxes[:, -1] > score_thr)[0]
for i in inds:
color_mask = np.random.randint(
0, 256, (1, 3), dtype=np.uint8)
mask = maskUtils.decode(segms[i]).astype(np.bool)
img_show[mask] = img_show[mask] * 0.5 + color_mask * 0.5
# draw bounding boxes
labels = [
np.full(bbox.shape[0], i, dtype=np.int32)
for i, bbox in enumerate(bbox_result)
]
labels = np.concatenate(labels)
mmcv.imshow_det_bboxes(
img_show,
bboxes,
labels,
class_names=class_names,
score_thr=score_thr)
================================================
FILE: mmdet/models/detectors/cascade_rcnn.py
================================================
from __future__ import division
import torch
import torch.nn as nn
from .base import BaseDetector
from .test_mixins import RPNTestMixin
from .. import builder
from ..registry import DETECTORS
from mmdet.core import (assign_and_sample, bbox2roi, bbox2result, multi_apply,
merge_aug_masks)
@DETECTORS.register_module
class CascadeRCNN(BaseDetector, RPNTestMixin):
def __init__(self,
num_stages,
backbone,
neck=None,
rpn_head=None,
bbox_roi_extractor=None,
bbox_head=None,
mask_roi_extractor=None,
mask_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
assert bbox_roi_extractor is not None
assert bbox_head is not None
super(CascadeRCNN, self).__init__()
self.num_stages = num_stages
self.backbone = builder.build_backbone(backbone)
if neck is not None:
self.neck = builder.build_neck(neck)
else:
raise NotImplementedError
if rpn_head is not None:
self.rpn_head = builder.build_head(rpn_head)
if bbox_head is not None:
self.bbox_roi_extractor = nn.ModuleList()
self.bbox_head = nn.ModuleList()
if not isinstance(bbox_roi_extractor, list):
bbox_roi_extractor = [
bbox_roi_extractor for _ in range(num_stages)
]
if not isinstance(bbox_head, list):
bbox_head = [bbox_head for _ in range(num_stages)]
assert len(bbox_roi_extractor) == len(bbox_head) == self.num_stages
for roi_extractor, head in zip(bbox_roi_extractor, bbox_head):
self.bbox_roi_extractor.append(
builder.build_roi_extractor(roi_extractor))
self.bbox_head.append(builder.build_head(head))
if mask_head is not None:
self.mask_roi_extractor = nn.ModuleList()
self.mask_head = nn.ModuleList()
if not isinstance(mask_roi_extractor, list):
mask_roi_extractor = [
mask_roi_extractor for _ in range(num_stages)
]
if not isinstance(mask_head, list):
mask_head = [mask_head for _ in range(num_stages)]
assert len(mask_roi_extractor) == len(mask_head) == self.num_stages
for roi_extractor, head in zip(mask_roi_extractor, mask_head):
self.mask_roi_extractor.append(
builder.build_roi_extractor(roi_extractor))
self.mask_head.append(builder.build_head(head))
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.init_weights(pretrained=pretrained)
@property
def with_rpn(self):
return hasattr(self, 'rpn_head') and self.rpn_head is not None
def init_weights(self, pretrained=None):
super(CascadeRCNN, self).init_weights(pretrained)
self.backbone.init_weights(pretrained=pretrained)
if self.with_neck:
if isinstance(self.neck, nn.Sequential):
for m in self.neck:
m.init_weights()
else:
self.neck.init_weights()
if self.with_rpn:
self.rpn_head.init_weights()
for i in range(self.num_stages):
if self.with_bbox:
self.bbox_roi_extractor[i].init_weights()
self.bbox_head[i].init_weights()
if self.with_mask:
self.mask_roi_extractor[i].init_weights()
self.mask_head[i].init_weights()
def extract_feat(self, img):
x = self.backbone(img)
if self.with_neck:
x = self.neck(x)
return x
def forward_train(self,
img,
img_meta,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
gt_masks=None,
proposals=None):
x = self.extract_feat(img)
losses = dict()
if self.with_rpn:
rpn_outs = self.rpn_head(x)
rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
self.train_cfg.rpn)
rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
losses.update(rpn_losses)
proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
else:
proposal_list = proposals
for i in range(self.num_stages):
rcnn_train_cfg = self.train_cfg.rcnn[i]
lw = self.train_cfg.stage_loss_weights[i]
# assign gts and sample proposals
assign_results, sampling_results = multi_apply(
assign_and_sample,
proposal_list,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
cfg=rcnn_train_cfg)
# bbox head forward and loss
bbox_roi_extractor = self.bbox_roi_extractor[i]
bbox_head = self.bbox_head[i]
rois = bbox2roi([res.bboxes for res in sampling_results])
bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
rois)
cls_score, bbox_pred = bbox_head(bbox_feats)
bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes,
gt_labels, rcnn_train_cfg)
loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
for name, value in loss_bbox.items():
losses['s{}.{}'.format(i, name)] = (value * lw if
'loss' in name else value)
# mask head forward and loss
if self.with_mask:
mask_roi_extractor = self.mask_roi_extractor[i]
mask_head = self.mask_head[i]
pos_rois = bbox2roi(
[res.pos_bboxes for res in sampling_results])
mask_feats = mask_roi_extractor(
x[:mask_roi_extractor.num_inputs], pos_rois)
mask_pred = mask_head(mask_feats)
mask_targets = mask_head.get_target(sampling_results, gt_masks,
rcnn_train_cfg)
pos_labels = torch.cat(
[res.pos_gt_labels for res in sampling_results])
loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels)
for name, value in loss_mask.items():
losses['s{}.{}'.format(i, name)] = (value * lw
if 'loss' in name else
value)
# refine bboxes
if i < self.num_stages - 1:
pos_is_gts = [res.pos_is_gt for res in sampling_results]
roi_labels = bbox_targets[0] # bbox_targets is a tuple
with torch.no_grad():
proposal_list = bbox_head.refine_bboxes(
rois, roi_labels, bbox_pred, pos_is_gts, img_meta)
return losses
def simple_test(self, img, img_meta, proposals=None, rescale=False):
x = self.extract_feat(img)
proposal_list = self.simple_test_rpn(
x, img_meta, self.test_cfg.rpn) if proposals is None else proposals
img_shape = img_meta[0]['img_shape']
ori_shape = img_meta[0]['ori_shape']
scale_factor = img_meta[0]['scale_factor']
# "ms" in variable names means multi-stage
ms_bbox_result = {}
ms_segm_result = {}
ms_scores = []
rcnn_test_cfg = self.test_cfg.rcnn
rois = bbox2roi(proposal_list)
for i in range(self.num_stages):
bbox_roi_extractor = self.bbox_roi_extractor[i]
bbox_head = self.bbox_head[i]
bbox_feats = bbox_roi_extractor(
x[:len(bbox_roi_extractor.featmap_strides)], rois)
cls_score, bbox_pred = bbox_head(bbox_feats)
ms_scores.append(cls_score)
if self.test_cfg.keep_all_stages:
det_bboxes, det_labels = bbox_head.get_det_bboxes(
rois,
cls_score,
bbox_pred,
img_shape,
scale_factor,
rescale=rescale,
cfg=rcnn_test_cfg)
bbox_result = bbox2result(det_bboxes, det_labels,
bbox_head.num_classes)
ms_bbox_result['stage{}'.format(i)] = bbox_result
if self.with_mask:
mask_roi_extractor = self.mask_roi_extractor[i]
mask_head = self.mask_head[i]
if det_bboxes.shape[0] == 0:
segm_result = [
[] for _ in range(mask_head.num_classes - 1)
]
else:
_bboxes = (det_bboxes[:, :4] * scale_factor
if rescale else det_bboxes)
mask_rois = bbox2roi([_bboxes])
mask_feats = mask_roi_extractor(
x[:len(mask_roi_extractor.featmap_strides)],
mask_rois)
mask_pred = mask_head(mask_feats)
segm_result = mask_head.get_seg_masks(
mask_pred, _bboxes, det_labels, rcnn_test_cfg,
ori_shape, scale_factor, rescale)
ms_segm_result['stage{}'.format(i)] = segm_result
if i < self.num_stages - 1:
bbox_label = cls_score.argmax(dim=1)
rois = bbox_head.regress_by_class(rois, bbox_label, bbox_pred,
img_meta[0])
cls_score = sum(ms_scores) / self.num_stages
det_bboxes, det_labels = self.bbox_head[-1].get_det_bboxes(
rois,
cls_score,
bbox_pred,
img_shape,
scale_factor,
rescale=rescale,
cfg=rcnn_test_cfg)
bbox_result = bbox2result(det_bboxes, det_labels,
self.bbox_head[-1].num_classes)
ms_bbox_result['ensemble'] = bbox_result
if self.with_mask:
if det_bboxes.shape[0] == 0:
segm_result = [
[] for _ in range(self.mask_head[-1].num_classes - 1)
]
else:
_bboxes = (det_bboxes[:, :4] * scale_factor
if rescale else det_bboxes)
mask_rois = bbox2roi([_bboxes])
aug_masks = []
for i in range(self.num_stages):
mask_roi_extractor = self.mask_roi_extractor[i]
mask_feats = mask_roi_extractor(
x[:len(mask_roi_extractor.featmap_strides)], mask_rois)
mask_pred = self.mask_head[i](mask_feats)
aug_masks.append(mask_pred.sigmoid().cpu().numpy())
merged_masks = merge_aug_masks(aug_masks,
[img_meta] * self.num_stages,
self.test_cfg.rcnn)
segm_result = self.mask_head[-1].get_seg_masks(
merged_masks, _bboxes, det_labels, rcnn_test_cfg,
ori_shape, scale_factor, rescale)
ms_segm_result['ensemble'] = segm_result
if not self.test_cfg.keep_all_stages:
if self.with_mask:
results = (ms_bbox_result['ensemble'],
ms_segm_result['ensemble'])
else:
results = ms_bbox_result['ensemble']
else:
if self.with_mask:
results = {
stage: (ms_bbox_result[stage], ms_segm_result[stage])
for stage in ms_bbox_result
}
else:
results = ms_bbox_result
return results
def aug_test(self, img, img_meta, proposals=None, rescale=False):
raise NotImplementedError
def show_result(self, data, result, img_norm_cfg, **kwargs):
if self.with_mask:
ms_bbox_result, ms_segm_result = result
if isinstance(ms_bbox_result, dict):
result = (ms_bbox_result['ensemble'],
ms_segm_result['ensemble'])
else:
if isinstance(result, dict):
result = result['ensemble']
super(CascadeRCNN, self).show_result(data, result, img_norm_cfg,
**kwargs)
================================================
FILE: mmdet/models/detectors/fast_rcnn.py
================================================
from .two_stage import TwoStageDetector
from ..registry import DETECTORS
@DETECTORS.register_module
class FastRCNN(TwoStageDetector):
def __init__(self,
backbone,
neck,
bbox_roi_extractor,
bbox_head,
train_cfg,
test_cfg,
mask_roi_extractor=None,
mask_head=None,
pretrained=None):
super(FastRCNN, self).__init__(
backbone=backbone,
neck=neck,
bbox_roi_extractor=bbox_roi_extractor,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
mask_roi_extractor=mask_roi_extractor,
mask_head=mask_head,
pretrained=pretrained)
def forward_test(self, imgs, img_metas, proposals, **kwargs):
for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError('{} must be a list, but got {}'.format(
name, type(var)))
num_augs = len(imgs)
if num_augs != len(img_metas):
raise ValueError(
'num of augmentations ({}) != num of image meta ({})'.format(
len(imgs), len(img_metas)))
# TODO: remove the restriction of imgs_per_gpu == 1 when prepared
imgs_per_gpu = imgs[0].size(0)
assert imgs_per_gpu == 1
if num_augs == 1:
return self.simple_test(imgs[0], img_metas[0], proposals[0],
**kwargs)
else:
return self.aug_test(imgs, img_metas, proposals, **kwargs)
================================================
FILE: mmdet/models/detectors/faster_rcnn.py
================================================
from .two_stage import TwoStageDetector
from ..registry import DETECTORS
@DETECTORS.register_module
class FasterRCNN(TwoStageDetector):
def __init__(self,
backbone,
neck,
rpn_head,
bbox_roi_extractor,
bbox_head,
train_cfg,
test_cfg,
pretrained=None):
super(FasterRCNN, self).__init__(
backbone=backbone,
neck=neck,
rpn_head=rpn_head,
bbox_roi_extractor=bbox_roi_extractor,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
================================================
FILE: mmdet/models/detectors/hkrm_rcnn.py
================================================
import torch
import torch.nn as nn
from .base import BaseDetector
from .test_mixins import RPNTestMixin, BBoxTestMixin, MaskTestMixin
from .. import builder
from ..registry import DETECTORS
from mmdet.core import bbox2roi, bbox2result, build_assigner, build_sampler
import numpy as np
import pickle
@DETECTORS.register_module
class HKRMRCNN(BaseDetector, RPNTestMixin, BBoxTestMixin,
MaskTestMixin):
def __init__(self,
backbone,
neck=None,
rpn_head=None,
bbox_roi_extractor=None,
bbox_head=None,
mask_roi_extractor=None,
mask_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
adja_gt=None,
adjr_gt=None):
super(HKRMRCNN, self).__init__()
self.backbone = builder.build_backbone(backbone)
if neck is not None:
self.neck = builder.build_neck(neck)
else:
raise NotImplementedError
if rpn_head is not None:
self.rpn_head = builder.build_rpn_head(rpn_head)
if bbox_head is not None:
self.bbox_roi_extractor = builder.build_roi_extractor(
bbox_roi_extractor)
self.bbox_head_hkrm = builder.build_bbox_head(bbox_head)
if mask_head is not None:
if mask_roi_extractor is not None:
self.mask_roi_extractor = builder.build_roi_extractor(
mask_roi_extractor)
self.mask_head = builder.build_mask_head(mask_head)
# read adj gts from .pkl
self.adja_gt = None
self.adjr_gt = None
if adja_gt is not None:
self.adja_gt = pickle.load(open(adja_gt, 'rb'))
self.adja_gt = np.float32(self.adja_gt)
self.adja_gt = nn.Parameter(torch.from_numpy(self.adja_gt), requires_grad=False)
if adjr_gt is not None:
self.adjr_gt = pickle.load(open(adjr_gt, 'rb'))
self.adjr_gt = np.float32(self.adjr_gt)
self.adjr_gt = nn.Parameter(torch.from_numpy(self.adjr_gt), requires_grad=False)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.init_weights(pretrained=pretrained)
@property
def with_rpn(self):
return hasattr(self, 'rpn_head') and self.rpn_head is not None
def init_weights(self, pretrained=None):
super(HKRMRCNN, self).init_weights(pretrained)
self.backbone.init_weights(pretrained=pretrained)
if self.with_neck:
if isinstance(self.neck, nn.Sequential):
for m in self.neck:
m.init_weights()
else:
self.neck.init_weights()
if self.with_rpn:
self.rpn_head.init_weights()
if self.with_bbox:
self.bbox_roi_extractor.init_weights()
self.bbox_head_hkrm.init_weights()
if self.with_mask:
self.mask_head.init_weights()
def extract_feat(self, img):
x = self.backbone(img)
if self.with_neck:
x = self.neck(x)
return x
def forward_train(self,
img,
img_meta,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
gt_masks=None,
proposals=None):
x = self.extract_feat(img)
losses = dict()
# RPN forward and loss
if self.with_rpn:
rpn_outs = self.rpn_head(x)
rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
self.train_cfg.rpn)
rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
losses.update(rpn_losses)
proposal_inputs = rpn_outs + (img_meta, self.train_cfg.rpn)
proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
else:
proposal_list = proposals
# assign gts and sample proposals
if self.with_bbox or self.with_mask:
bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
bbox_sampler = build_sampler(
self.train_cfg.rcnn.sampler, context=self)
num_imgs = img.size(0)
sampling_results = []
# gt adj list
gt_adja_list = []
gt_adjr_list = []
for i in range(num_imgs):
assign_result = bbox_assigner.assign(
proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i],
gt_labels[i])
sampling_result = bbox_sampler.sample(
assign_result,
proposal_list[i],
gt_bboxes[i],
gt_labels[i],
has_roi_score=True,
feats=[lvl_feat[i][None] for lvl_feat in x])
sampling_results.append(sampling_result)
# get adj matrix gt
index_ = torch.cat((sampling_result.pos_gt_labels,
sampling_result.pos_gt_labels.new_zeros((len(sampling_result.neg_bboxes)))))
assert len(index_) == len(sampling_result.bboxes)
if self.adja_gt is not None:
pos_gt = self.adja_gt[index_, :]
pos_gt = pos_gt.transpose(0, 1)[index_, :]
pos_gt = pos_gt.transpose(0, 1)
gt_adja_list.append(pos_gt)
if self.adjr_gt is not None:
pos_gt = self.adjr_gt[index_, :]
pos_gt = pos_gt.transpose(0, 1)[index_, :]
pos_gt = pos_gt.transpose(0, 1)
gt_adjr_list.append(pos_gt)
A_gt = []
if self.adja_gt is not None:
A_gt.append(torch.stack(gt_adja_list, 0))
if self.adjr_gt is not None:
A_gt.append(torch.stack(gt_adjr_list, 0))
# bbox head forward and loss
if self.with_bbox:
rois, rois_index = bbox2roi(
[(res.pos_bboxes, res.neg_bboxes) for res in sampling_results],
return_index=True)
# TODO: a more flexible way to decide which feature maps to use
bbox_feats = self.bbox_roi_extractor(
x[:self.bbox_roi_extractor.num_inputs], rois)
# Get grometric feature of rois
geom_f = []
for img_i, img_shape in enumerate(img_meta):
h, w, _ = img_meta[img_i]['pad_shape']
tmp_geo = rois[rois[:, 0] == img_i, 1:] / torch.Tensor([h, w, h, w]).cuda()
tmp_geo = torch.cat((tmp_geo, sampling_results[img_i].bboxes[:, 4].unsqueeze(1)), dim=-1)
geom_f.append(tmp_geo)
geom_f = torch.stack(geom_f, 0)
# bbox_feats = bbox_feats.view(len(img_meta), -1, bbox_feats.size(-1))
assert len(geom_f.size()) == 3
cls_score, bbox_pred, A_pred = self.bbox_head_hkrm(bbox_feats, geom_f, len(img_meta))
bbox_targets = self.bbox_head_hkrm.get_target(
sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn)
loss_bbox = self.bbox_head_hkrm.loss(cls_score, bbox_pred,
A_pred, A_gt,
*bbox_targets)
losses.update(loss_bbox)
# mask head forward and loss
if self.with_mask:
if self.with_mask_roi_extractor:
pos_rois = bbox2roi(
[res.pos_bboxes for res in sampling_results])
mask_feats = self.mask_roi_extractor(
x[:self.mask_roi_extractor.num_inputs], pos_rois)
else:
pos_inds = (rois_index == 0)
mask_feats = bbox_feats[pos_inds]
mask_pred = self.mask_head(mask_feats)
mask_targets = self.mask_head.get_target(
sampling_results, gt_masks, self.train_cfg.rcnn)
pos_labels = torch.cat(
[res.pos_gt_labels for res in sampling_results])
loss_mask = self.mask_head.loss(mask_pred, mask_targets,
pos_labels)
losses.update(loss_mask)
return losses
def simple_test(self, img, img_meta, proposals=None, rescale=False):
"""Test without augmentation."""
assert self.with_bbox, "Bbox head must be implemented."
x = self.extract_feat(img)
proposal_list = self.simple_test_rpn(
x, img_meta, self.test_cfg.rpn) if proposals is None else proposals
det_bboxes, det_labels = self.simple_test_bboxes_hkrm(
x, img_meta, proposal_list, self.test_cfg.rcnn, rescale=rescale, use_hkrm=True)
bbox_results = bbox2result(det_bboxes, det_labels,
self.bbox_head_hkrm.num_classes)
if not self.with_mask:
return bbox_results
else:
segm_results = self.simple_test_mask(
x, img_meta, det_bboxes, det_labels, rescale=rescale)
return bbox_results, segm_results
def aug_test(self, imgs, img_metas, rescale=False):
"""Test with augmentations.
If rescale is False, then returned bboxes and masks will fit the scale
of imgs[0].
"""
# recompute feats to save memory
proposal_list = self.aug_test_rpn(
self.extract_feats(imgs), img_metas, self.test_cfg.rpn)
det_bboxes, det_labels = self.aug_test_bboxes_hkrm(
self.extract_feats(imgs), img_metas, proposal_list,
self.test_cfg.rcnn, use_hkrm=True)
if rescale:
_det_bboxes = det_bboxes
else:
_det_bboxes = det_bboxes.clone()
_det_bboxes[:, :4] *= img_metas[0][0]['scale_factor']
bbox_results = bbox2result(_det_bboxes, det_labels,
self.bbox_head_hkrm.num_classes)
# det_bboxes always keep the original scale
if self.with_mask:
segm_results = self.aug_test_mask(
self.extract_feats(imgs), img_metas, det_bboxes, det_labels)
return bbox_results, segm_results
else:
return bbox_results
def simple_test_bboxes_hkrm(self,
x,
img_meta,
proposals,
rcnn_test_cfg,
rescale=False,
use_hkrm=False):
"""Test only det bboxes without augmentation."""
rois = bbox2roi(proposals)
roi_feats = self.bbox_roi_extractor(
x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
if self.with_upper_neck:
roi_feats = self.upper_neck(roi_feats)
if use_hkrm:
# Get grometric feature of rois
geom_f = []
for img_i, img_shape in enumerate(img_meta):
h, w, _ = img_meta[img_i]['pad_shape']
tmp_geo = rois[rois[:, 0] == img_i, 1:] / torch.Tensor([h, w, h, w]).cuda()
tmp_geo = torch.cat((tmp_geo, proposals[img_i][:, 4].unsqueeze(1)), dim=-1)
geom_f.append(tmp_geo)
geom_f = torch.stack(geom_f, 0)
assert len(geom_f.size()) == 3
cls_score, bbox_pred, A_pred = self.bbox_head_hkrm(roi_feats, geom_f, len(img_meta))
else:
cls_score, bbox_pred = self.bbox_head_hkrm(roi_feats)
img_shape = img_meta[0]['img_shape']
scale_factor = img_meta[0]['scale_factor']
det_bboxes, det_labels = self.bbox_head_hkrm.get_det_bboxes(
rois,
cls_score,
bbox_pred,
img_shape,
scale_factor,
rescale=rescale,
cfg=rcnn_test_cfg)
return det_bboxes, det_labels
def aug_test_bboxes_hkrm(self, feats, img_metas, proposal_list, rcnn_test_cfg, use_hkrm=False):
aug_bboxes = []
aug_scores = []
for x, img_meta in zip(feats, img_metas):
# only one image in the batch
img_shape = img_meta[0]['img_shape']
scale_factor = img_meta[0]['scale_factor']
flip = img_meta[0]['flip']
# TODO more flexible
proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
scale_factor, flip)
rois = bbox2roi([proposals])
# recompute feature maps to save GPU memory
roi_feats = self.bbox_roi_extractor(
x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
if self.with_upper_neck:
roi_feats = self.upper_neck(roi_feats)
if use_hkrm:
# Get grometric feature of rois
geom_f = []
h, w, _ = img_meta['pad_shape']
tmp_geo = rois[:, 1:] / torch.Tensor([h, w, h, w]).cuda()
tmp_geo = torch.cat((tmp_geo, proposals[:, 4].unsqueeze(1)), dim=-1)
geom_f.append(tmp_geo)
geom_f = torch.stack(geom_f, 0)
assert len(geom_f.size()) == 3
cls_score, bbox_pred, A_pred = self.bbox_head_hkrm(roi_feats, geom_f, len(img_meta))
else:
cls_score, bbox_pred = self.bbox_head_hkrm(roi_feats)
bboxes, scores = self.bbox_head_hkrm.get_det_bboxes(
rois,
cls_score,
bbox_pred,
img_shape,
scale_factor,
rescale=False,
cfg=None)
aug_bboxes.append(bboxes)
aug_scores.append(scores)
# after merging, bboxes will be rescaled to the original image size
merged_bboxes, merged_scores = merge_aug_bboxes(
aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
det_bboxes, det_labels = multiclass_nms(
merged_bboxes, merged_scores, rcnn_test_cfg.score_thr,
rcnn_test_cfg.nms, rcnn_test_cfg.max_per_img)
return det_bboxes, det_labels
================================================
FILE: mmdet/models/detectors/mask_rcnn.py
================================================
from .two_stage import TwoStageDetector
from ..registry import DETECTORS
@DETECTORS.register_module
class MaskRCNN(TwoStageDetector):
def __init__(self,
backbone,
neck,
rpn_head,
bbox_roi_extractor,
bbox_head,
mask_roi_extractor,
mask_head,
train_cfg,
test_cfg,
pretrained=None):
super(MaskRCNN, self).__init__(
backbone=backbone,
neck=neck,
rpn_head=rpn_head,
bbox_roi_extractor=bbox_roi_extractor,
bbox_head=bbox_head,
mask_roi_extractor=mask_roi_extractor,
mask_head=mask_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
================================================
FILE: mmdet/models/detectors/reasoning_rcnn.py
================================================
from __future__ import division
import torch
import torch.nn as nn
from .base import BaseDetector
from .test_mixins import RPNTestMixin
from .. import builder
from ..registry import DETECTORS
from mmdet.core import (assign_and_sample, bbox2roi, bbox2result, multi_apply,
merge_aug_masks)
import numpy as np
import pickle
from ..utils import ConvModule
import torch.nn.functional as F
@DETECTORS.register_module
class ReasoningRCNN(BaseDetector, RPNTestMixin):
def __init__(self,
num_stages,
backbone,
neck=None,
upper_neck=None,
rpn_head=None,
bbox_roi_extractor=None,
bbox_head=None,
mask_roi_extractor=None,
mask_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
adj_gt=None,
graph_out_channels=256,
normalize=None,
roi_feat_size=7,
shared_num_fc=2):
assert bbox_roi_extractor is not None
assert bbox_head is not None
super(ReasoningRCNN, self).__init__()
self.num_stages = num_stages
self.backbone = builder.build_backbone(backbone)
if neck is not None:
self.neck = builder.build_neck(neck)
else:
assert upper_neck is not None
if rpn_head is not None:
self.rpn_head = builder.build_rpn_head(rpn_head)
if upper_neck is not None:
if isinstance(upper_neck, list):
self.upper_neck = nn.ModuleList()
assert len(upper_neck) == self.num_stages
for neck in upper_neck:
self.upper_neck.append(builder.build_upper_neck(neck))
else:
self.upper_neck = builder.build_upper_neck(upper_neck)
if bbox_head is not None:
self.bbox_roi_extractor = nn.ModuleList()
self.bbox_head = nn.ModuleList()
if not isinstance(bbox_roi_extractor, list):
bbox_roi_extractor = [
bbox_roi_extractor for _ in range(num_stages)
]
if not isinstance(bbox_head, list):
bbox_head = [bbox_head for _ in range(num_stages)]
assert len(bbox_roi_extractor) == len(bbox_head) == self.num_stages
for roi_extractor, head in zip(bbox_roi_extractor, bbox_head):
self.bbox_roi_extractor.append(
builder.build_roi_extractor(roi_extractor))
self.bbox_head.append(builder.build_bbox_head(head))
if mask_head is not None:
self.mask_head = nn.ModuleList()
if not isinstance(mask_head, list):
mask_head = [mask_head for _ in range(num_stages)]
assert len(mask_head) == self.num_stages
for head in mask_head:
self.mask_head.append(builder.build_mask_head(head))
if mask_roi_extractor is not None:
self.mask_roi_extractor = nn.ModuleList()
if not isinstance(mask_roi_extractor, list):
mask_roi_extractor = [
mask_roi_extractor for _ in range(num_stages)
]
assert len(mask_roi_extractor) == self.num_stages
for roi_extractor in mask_roi_extractor:
self.mask_roi_extractor.append(
builder.build_roi_extractor(roi_extractor))
self.normalize = normalize
self.with_bias = normalize is None
if adj_gt is not None:
self.adj_gt = pickle.load(open(adj_gt, 'rb'))
self.adj_gt = np.float32(self.adj_gt)
self.adj_gt = nn.Parameter(torch.from_numpy(self.adj_gt), requires_grad=False)
# init cmp attention
self.cmp_attention = nn.ModuleList()
self.cmp_attention.append(
ConvModule(1024, 1024 // 16,
3, stride=2, padding=1, normalize=self.normalize, bias=self.with_bias))
self.cmp_attention.append(
nn.Linear(1024 // 16, bbox_head[0]['in_channels'] + 1))
# init graph w
self.graph_out_channels = graph_out_channels
self.graph_weight_fc = nn.Linear(bbox_head[0]['in_channels'] + 1, self.graph_out_channels)
self.relu = nn.ReLU(inplace=True)
# shared upper neck
in_channels = rpn_head['in_channels']
if shared_num_fc > 0:
in_channels *= (roi_feat_size * roi_feat_size)
self.branch_fcs = nn.ModuleList()
for i in range(shared_num_fc):
fc_in_channels = (in_channels
if i == 0 else bbox_head[0]['in_channels'])
self.branch_fcs.append(
nn.Linear(fc_in_channels, bbox_head[0]['in_channels']))
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.init_weights(pretrained=pretrained)
@property
def with_rpn(self):
return hasattr(self, 'rpn_head') and self.rpn_head is not None
def init_weights(self, pretrained=None):
super(ReasoningRCNN, self).init_weights(pretrained)
self.backbone.init_weights(pretrained=pretrained)
if self.with_neck:
if isinstance(self.neck, nn.Sequential):
for m in self.neck:
m.init_weights()
else:
self.neck.init_weights()
if self.with_rpn:
self.rpn_head.init_weights()
for i in range(self.num_stages):
if self.with_bbox:
self.bbox_roi_extractor[i].init_weights()
self.bbox_head[i].init_weights()
if self.with_mask_roi_extractor:
self.mask_roi_extractor[i].init_weights()
if self.with_mask:
self.mask_head[i].init_weights()
def extract_feat(self, img):
x = self.backbone(img)
if self.with_neck:
x = self.neck(x)
return x
def forward_upper_neck(self, x, stage):
if self.with_share_upper_neck:
x = self.upper_neck(x)
elif self.with_unshare_upper_neck:
x = self.upper_neck[stage](x)
return x
def forward_train(self,
img,
img_meta,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
gt_masks=None,
proposals=None):
x = self.extract_feat(img)
# precmp attention
if len(x) > 1:
base_feat = []
for b_f in x[1:]:
base_feat.append(
F.interpolate(b_f, scale_factor=(x[2].size(2) / b_f.size(2), x[2].size(3) / b_f.size(3))))
base_feat = torch.cat(base_feat, 1)
else:
base_feat = torch.cat(x, 1)
for ops in self.cmp_attention:
base_feat = ops(base_feat)
if len(base_feat.size()) > 2:
base_feat = base_feat.mean(3).mean(2)
else:
base_feat = self.relu(base_feat)
losses = dict()
if self.with_rpn:
rpn_outs = self.rpn_head(x)
rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
self.train_cfg.rpn)
rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
losses.update(rpn_losses)
proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
else:
proposal_list = proposals
for i in range(self.num_stages):
rcnn_train_cfg = self.train_cfg.rcnn[i]
lw = self.train_cfg.stage_loss_weights[i]
# add reasoning process
if i > 0:
# 1.build global semantic pool
global_semantic_pool = torch.cat((bbox_head.fc_cls.weight,
bbox_head.fc_cls.bias.unsqueeze(1)), 1).detach()
# 2.compute graph attention
attention_map = nn.Softmax(1)(torch.mm(base_feat, torch.transpose(global_semantic_pool, 0, 1)))
# 3.adaptive global reasoning
alpha_em = attention_map.unsqueeze(-1) * torch.mm(self.adj_gt, global_semantic_pool).unsqueeze(0)
alpha_em = alpha_em.view(-1, global_semantic_pool.size(-1))
alpha_em = self.graph_weight_fc(alpha_em)
alpha_em = self.relu(alpha_em)
# enhanced_feat = torch.mm(nn.Softmax(1)(cls_score), alpha_em)
n_classes = bbox_head.fc_cls.weight.size(0)
cls_prob = nn.Softmax(1)(cls_score).view(len(img_meta), -1, n_classes)
enhanced_feat = torch.bmm(cls_prob, alpha_em.view(len(img_meta), -1, self.graph_out_channels))
enhanced_feat = enhanced_feat.view(-1, self.graph_out_channels)
# assign gts and sample proposals
assign_results, sampling_results = multi_apply(
assign_and_sample,
proposal_list,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
cfg=rcnn_train_cfg)
# bbox head forward and loss
bbox_roi_extractor = self.bbox_roi_extractor[i]
bbox_head = self.bbox_head[i]
rois, rois_index = bbox2roi(
[(res.pos_bboxes, res.neg_bboxes) for res in sampling_results],
return_index=True)
bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
rois)
# without upperneck
bbox_feats = bbox_feats.view(bbox_feats.size(0), -1)
for fc in self.branch_fcs:
bbox_feats = self.relu(fc(bbox_feats))
# cat with enhanced feature
if i > 0:
bbox_feats = torch.cat([bbox_feats, enhanced_feat], 1)
cls_score, bbox_pred = bbox_head(bbox_feats)
bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes,
gt_labels, rcnn_train_cfg)
loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
for name, value in loss_bbox.items():
losses['s{}.{}'.format(
i, name)] = (value * lw if 'loss' in name else value)
# mask head forward and loss
if self.with_mask:
if self.with_mask_roi_extractor:
mask_roi_extractor = self.mask_roi_extractor[i]
pos_rois = bbox2roi(
[res.pos_bboxes for res in sampling_results])
mask_feats = mask_roi_extractor(
x[:mask_roi_extractor.num_inputs], pos_rois)
mask_feats = self.forward_upper_neck(mask_feats, i)
else:
pos_inds = (rois_index == 0)
mask_feats = bbox_feats[pos_inds]
mask_head = self.mask_head[i]
mask_pred = mask_head(mask_feats)
mask_targets = mask_head.get_target(sampling_results, gt_masks,
rcnn_train_cfg)
pos_labels = torch.cat(
[res.pos_gt_labels for res in sampling_results])
loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels)
for name, value in loss_mask.items():
losses['s{}.{}'.format(
i, name)] = (value * lw if 'loss' in name else value)
# refine bboxes
if i < self.num_stages - 1:
pos_is_gts = [res.pos_is_gt for res in sampling_results]
roi_labels = bbox_targets[0] # bbox_targets is a tuple
with torch.no_grad():
proposal_list = bbox_head.refine_bboxes(
rois, roi_labels, bbox_pred, pos_is_gts, img_meta)
return losses
def simple_test(self, img, img_meta, proposals=None, rescale=False):
x = self.extract_feat(img)
# precmp attention
if len(x) > 1:
base_feat = []
for b_f in x[1:]:
base_feat.append(
F.interpolate(b_f, scale_factor=(x[2].size(2) / b_f.size(2), x[2].size(3) / b_f.size(3))))
base_feat = torch.cat(base_feat, 1)
else:
base_feat = torch.cat(x, 1)
for ops in self.cmp_attention:
base_feat = ops(base_feat)
if len(base_feat.size()) > 2:
base_feat = base_feat.mean(3).mean(2)
else:
base_feat = self.relu(base_feat)
proposal_list = self.simple_test_rpn(
x, img_meta, self.test_cfg.rpn) if proposals is None else proposals
img_shape = img_meta[0]['img_shape']
ori_shape = img_meta[0]['ori_shape']
scale_factor = img_meta[0]['scale_factor']
# "ms" in variable names means multi-stage
ms_bbox_result = {}
ms_segm_result = {}
ms_scores = []
rcnn_test_cfg = self.test_cfg.rcnn
rois = bbox2roi(proposal_list)
for i in range(self.num_stages):
# add reasoning process
if i > 0:
# transform CxC classes graph to region
# 1.build global semantic pool
global_semantic_pool = torch.cat((bbox_head.fc_cls.weight,
bbox_head.fc_cls.bias.unsqueeze(1)), 1).detach()
# 2.compute graph attention
attention_map = nn.Softmax(1)(torch.mm(base_feat, torch.transpose(global_semantic_pool, 0, 1)))
# 3.adaptive global reasoning
alpha_em = attention_map.unsqueeze(-1) * torch.mm(self.adj_gt, global_semantic_pool).unsqueeze(0)
alpha_em = alpha_em.view(-1, global_semantic_pool.size(-1))
alpha_em = self.graph_weight_fc(alpha_em)
alpha_em = self.relu(alpha_em)
n_classes = bbox_head.fc_cls.weight.size(0)
cls_prob = nn.Softmax(1)(cls_score).view(len(img_meta), -1, n_classes)
enhanced_feat = torch.bmm(cls_prob, alpha_em.view(len(img_meta), -1, self.graph_out_channels))
enhanced_feat = enhanced_feat.view(-1, self.graph_out_channels)
bbox_roi_extractor = self.bbox_roi_extractor[i]
bbox_head = self.bbox_head[i]
bbox_feats = bbox_roi_extractor(
x[:len(bbox_roi_extractor.featmap_strides)], rois)
# bbox_feats = self.forward_upper_neck(bbox_feats, i)
# without upperneck
bbox_feats = bbox_feats.view(bbox_feats.size(0), -1)
for fc in self.branch_fcs:
bbox_feats = self.relu(fc(bbox_feats))
# cat with enhanced feature
if i > 0:
bbox_feats = torch.cat([bbox_feats, enhanced_feat], 1)
cls_score, bbox_pred = bbox_head(bbox_feats)
ms_scores.append(cls_score)
if self.test_cfg.keep_all_stages:
det_bboxes, det_labels = bbox_head.get_det_bboxes(
rois,
cls_score,
bbox_pred,
img_shape,
scale_factor,
rescale=rescale,
cfg=rcnn_test_cfg)
bbox_result = bbox2result(det_bboxes, det_labels,
bbox_head.num_classes)
ms_bbox_result['stage{}'.format(i)] = bbox_result
if self.with_mask:
if self.with_mask_roi_extractor:
mask_roi_extractor = self.mask_roi_extractor[i]
else:
mask_roi_extractor = self.bbox_roi_extractor[i]
mask_head = self.mask_head[i]
if det_bboxes.shape[0] == 0:
segm_result = [
[] for _ in range(mask_head.num_classes - 1)
]
else:
_bboxes = (det_bboxes[:, :4] * scale_factor
if rescale else det_bboxes)
mask_rois = bbox2roi([_bboxes])
mask_feats = mask_roi_extractor(
x[:len(mask_roi_extractor.featmap_strides)],
mask_rois)
mask_feats = self.forward_upper_neck(mask_feats, i)
mask_pred = mask_head(mask_feats)
segm_result = mask_head.get_seg_masks(
mask_pred, _bboxes, det_labels, rcnn_test_cfg,
ori_shape, scale_factor, rescale)
ms_segm_result['stage{}'.format(i)] = segm_result
if i < self.num_stages - 1:
bbox_label = cls_score.argmax(dim=1)
rois = bbox_head.regress_by_class(rois, bbox_label, bbox_pred,
img_meta[0])
cls_score = sum(ms_scores) / self.num_stages
det_bboxes, det_labels = self.bbox_head[-1].get_det_bboxes(
rois,
cls_score,
bbox_pred,
img_shape,
scale_factor,
rescale=rescale,
cfg=rcnn_test_cfg)
bbox_result = bbox2result(det_bboxes, det_labels,
self.bbox_head[-1].num_classes)
ms_bbox_result['ensemble'] = bbox_result
if self.with_mask:
if det_bboxes.shape[0] == 0:
segm_result = [
[] for _ in range(self.mask_head[-1].num_classes - 1)
]
else:
_bboxes = (det_bboxes[:, :4] * scale_factor
if rescale else det_bboxes)
mask_rois = bbox2roi([_bboxes])
aug_masks = []
for i in range(self.num_stages):
if self.with_mask_roi_extractor:
mask_roi_extractor = self.mask_roi_extractor[i]
else:
mask_roi_extractor = self.bbox_roi_extractor[i]
mask_feats = mask_roi_extractor(
x[:len(mask_roi_extractor.featmap_strides)], mask_rois)
mask_feats = self.forward_upper_neck(mask_feats, i)
mask_pred = self.mask_head[i](mask_feats)
aug_masks.append(mask_pred.sigmoid().cpu().numpy())
merged_masks = merge_aug_masks(aug_masks,
[img_meta] * self.num_stages,
self.test_cfg.rcnn)
segm_result = self.mask_head[-1].get_seg_masks(
merged_masks, _bboxes, det_labels, rcnn_test_cfg,
ori_shape, scale_factor, rescale)
ms_segm_result['ensemble'] = segm_result
if not self.test_cfg.keep_all_stages:
if self.with_mask:
results = (ms_bbox_result['ensemble'],
ms_segm_result['ensemble'])
else:
results = ms_bbox_result['ensemble']
else:
if self.with_mask:
results = {
stage: (ms_bbox_result[stage], ms_segm_result[stage])
for stage in ms_bbox_result
}
else:
results = ms_bbox_result
return results
def aug_test(self, img, img_meta, proposals=None, rescale=False):
raise NotImplementedError
def show_result(self, data, result, img_norm_cfg, **kwargs):
if self.with_mask:
ms_bbox_result, ms_segm_result = result
if isinstance(ms_bbox_result, dict):
result = (ms_bbox_result['ensemble'],
ms_segm_result['ensemble'])
else:
if isinstance(result, dict):
result = result['ensemble']
super(ReasoningRCNN, self).show_result(data, result, img_norm_cfg,
**kwargs)
================================================
FILE: mmdet/models/detectors/retinanet.py
================================================
from .single_stage import SingleStageDetector
from ..registry import DETECTORS
@DETECTORS.register_module
class RetinaNet(SingleStageDetector):
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(RetinaNet, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
================================================
FILE: mmdet/models/detectors/rpn.py
================================================
import mmcv
from mmdet.core import tensor2imgs, bbox_mapping
from .base import BaseDetector
from .test_mixins import RPNTestMixin
from .. import builder
from ..registry import DETECTORS
@DETECTORS.register_module
class RPN(BaseDetector, RPNTestMixin):
def __init__(self,
backbone,
neck,
rpn_head,
train_cfg,
test_cfg,
pretrained=None):
super(RPN, self).__init__()
self.backbone = builder.build_backbone(backbone)
self.neck = builder.build_neck(neck) if neck is not None else None
self.rpn_head = builder.build_head(rpn_head)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.init_weights(pretrained=pretrained)
def init_weights(self, pretrained=None):
super(RPN, self).init_weights(pretrained)
self.backbone.init_weights(pretrained=pretrained)
if self.with_neck:
self.neck.init_weights()
self.rpn_head.init_weights()
def extract_feat(self, img):
x = self.backbone(img)
if self.with_neck:
x = self.neck(x)
return x
def forward_train(self, img, img_meta, gt_bboxes=None):
if self.train_cfg.rpn.get('debug', False):
self.rpn_head.debug_imgs = tensor2imgs(img)
x = self.extract_feat(img)
rpn_outs = self.rpn_head(x)
rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn)
losses = self.rpn_head.loss(*rpn_loss_inputs)
return losses
def simple_test(self, img, img_meta, rescale=False):
x = self.extract_feat(img)
proposal_list = self.simple_test_rpn(x, img_meta, self.test_cfg.rpn)
if rescale:
for proposals, meta in zip(proposal_list, img_meta):
proposals[:, :4] /= meta['scale_factor']
# TODO: remove this restriction
return proposal_list[0].cpu().numpy()
def aug_test(self, imgs, img_metas, rescale=False):
proposal_list = self.aug_test_rpn(
self.extract_feats(imgs), img_metas, self.test_cfg.rpn)
if not rescale:
for proposals, img_meta in zip(proposal_list, img_metas[0]):
img_shape = img_meta['img_shape']
scale_factor = img_meta['scale_factor']
flip = img_meta['flip']
proposals[:, :4] = bbox_mapping(proposals[:, :4], img_shape,
scale_factor, flip)
# TODO: remove this restriction
return proposal_list[0].cpu().numpy()
def show_result(self, data, result, img_norm_cfg):
"""Show RPN proposals on the image.
Although we assume batch size is 1, this method supports arbitrary
batch size.
"""
img_tensor = data['img'][0]
img_metas = data['img_meta'][0].data[0]
imgs = tensor2imgs(img_tensor, **img_norm_cfg)
assert len(imgs) == len(img_metas)
for img, img_meta in zip(imgs, img_metas):
h, w, _ = img_meta['img_shape']
img_show = img[:h, :w, :]
mmcv.imshow_bboxes(img_show, result, top_k=20)
================================================
FILE: mmdet/models/detectors/sgrn.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
import numpy as np
from .base import BaseDetector
from .test_mixins import RPNTestMixin, BBoxTestMixin, MaskTestMixin
from .. import builder
from mmdet.core import sample_bboxes_return_index, bbox2roi, bbox2result, multi_apply
class ThreeStageGraphDetector(BaseDetector, RPNTestMixin, BBoxTestMixin,
MaskTestMixin):
def __init__(self,
backbone,
neck=None,
rpn_head=None,
bbox_roi_extractor=None,
bbox_head=None,
mask_roi_extractor=None,
mask_head=None,
graph_convolution=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(ThreeStageGraphDetector, self).__init__()
self.backbone = builder.build_backbone(backbone)
if neck is not None:
self.neck = builder.build_neck(neck)
else:
raise NotImplementedError
if rpn_head is not None:
self.rpn_head = builder.build_rpn_head(rpn_head)
if bbox_head is not None:
self.bbox_roi_extractor = builder.build_roi_extractor(
bbox_roi_extractor)
self.bbox_head = builder.build_bbox_head(bbox_head[0])
self.bbox_roi_extractor_2 = builder.build_roi_extractor(
bbox_roi_extractor)
self.bbox_head_en = builder.build_bbox_head(bbox_head[1])
if mask_head is not None:
self.mask_roi_extractor = builder.build_roi_extractor(
mask_roi_extractor)
self.mask_head = builder.build_mask_head(mask_head)
if mask_head is not None:
self.mask_roi_extractor = builder.build_roi_extractor(
mask_roi_extractor)
self.mask_head = builder.build_mask_head(mask_head)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.n_graph_node = graph_convolution.n_graph_node
# Graph Module
self.latent_graph_channel = graph_convolution.latent_graph_channel
self.n_kernels = graph_convolution.n_kernels_gc
self.neigh_size = graph_convolution.neigh_size
# graph learner
self.adjacency_learner = GraphLearner(in_feature_dim=1024, combined_feature_dim=256)
# graph convolution layers
self.graph_convolution_1 = NeighbourhoodGraphConvolution(bbox_head[0].fc_out_channels+1,
self.latent_graph_channel*2, self.n_kernels, 2)
self.graph_convolution_2 = NeighbourhoodGraphConvolution(self.latent_graph_channel*2,
self.latent_graph_channel, self.n_kernels, 2)
self.dropout = nn.Dropout(p=0.5)
#self.bn_1 = nn.BatchNorm1d(self.latent_graph_channel*2)
#self.bn_2 = nn.BatchNorm1d(self.latent_graph_channel)
self.relu = nn.ReLU(inplace=True)
self.init_weights(pretrained=pretrained)
@property
def with_rpn(self):
return hasattr(self, 'rpn_head') and self.rpn_head is not None
def init_weights(self, pretrained=None):
super(ThreeStageGraphDetector, self).init_weights(pretrained)
self.backbone.init_weights(pretrained=pretrained)
if self.with_neck:
if isinstance(self.neck, nn.Sequential):
for m in self.neck:
m.init_weights()
else:
self.neck.init_weights()
if self.with_rpn:
self.rpn_head.init_weights()
if self.with_bbox:
self.bbox_roi_extractor.init_weights()
self.bbox_head.init_weights()
self.bbox_roi_extractor_2.init_weights()
self.bbox_head_en.init_weights()
def extract_feat(self, img):
x = self.backbone(img)
if self.with_neck:
x = self.neck(x)
return x
def forward_train(self,
img,
img_meta,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
gt_masks=None,
proposals=None):
batch_size = len(img_meta)
losses = dict()
x = self.extract_feat(img)
if self.with_rpn:
rpn_outs = self.rpn_head(x)
rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
self.train_cfg.rpn)
rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
losses.update(rpn_losses)
proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
else:
proposal_list = proposals
if self.with_bbox:
(pos_inds, neg_inds, pos_proposals, neg_proposals, pos_assigned_gt_inds, pos_gt_bboxes,
pos_gt_labels) = multi_apply(
sample_bboxes_return_index,
proposal_list,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
cfg=self.train_cfg.rcnn)
(labels, label_weights, bbox_targets,
bbox_weights) = self.bbox_head.get_bbox_target(
pos_proposals, neg_proposals, pos_gt_bboxes, pos_gt_labels,
self.train_cfg.rcnn)
rois = bbox2roi([
torch.cat([pos, neg], dim=0)
for pos, neg in zip(pos_proposals, neg_proposals)
])
# TODO: a more flexible way to configurate feat maps
roi_feats = self.bbox_roi_extractor(
x[:self.bbox_roi_extractor.num_inputs], rois)
cls_score, bbox_pred = self.bbox_head(roi_feats)
loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, labels,
label_weights, bbox_targets,
bbox_weights)
losses.update(loss_bbox)
#next stage
# Get weight from fc layer to become the pool
feature_pool_weight = torch.cat([self.bbox_head.fc_cls.weight, self.bbox_head.fc_cls.bias.unsqueeze(1)], 1).detach()
# Get the ideal soft weight for each bbox(roi)
#cls_prob = nn.functional.softmax(cls_score, 1)
#fatched_mixed_weight = torch.mm(cls_prob, feature_pool_weight)
max_cls = torch.max(cls_score, 1)
fatched_mixed_weight = feature_pool_weight[max_cls[1], :]
# detach here
img_shapes = [zx['img_shape'] for zx in img_meta]
refined_rois = self.bbox_head.refine_bboxes(
rois,
labels,
bbox_pred,
img_shapes,
gt_labels,
pos_inds,
has_gt_in_roi=False)
refined_rois = [r.detach() for r in refined_rois]
(pos_inds, neg_inds, pos_proposals, neg_proposals, pos_assigned_gt_inds, pos_gt_bboxes,
pos_gt_labels) = multi_apply(
sample_bboxes_return_index,
refined_rois,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
cfg=self.train_cfg.rcnn2)
(labels, label_weights, bbox_targets,
bbox_weights) = self.bbox_head.get_bbox_target(
pos_proposals, neg_proposals, pos_gt_bboxes, pos_gt_labels,
self.train_cfg.rcnn2)
bbox_each = [torch.cat([pos, neg], dim=0) for pos, neg in zip(pos_proposals, neg_proposals)]
rois_2 = bbox2roi(bbox_each)
bb=[]
for one_img_idx, bbox_each_each_img in enumerate(bbox_each):
_bbox_each = bbox_each_each_img.clone()
_bbox_each[:, 0] = _bbox_each[:, 0]/img_shapes[one_img_idx][0]
_bbox_each[:, 1] = _bbox_each[:, 1]/img_shapes[one_img_idx][1]
_bbox_each[:, 2] = _bbox_each[:, 2]/img_shapes[one_img_idx][0]
_bbox_each[:, 3] = _bbox_each[:, 3]/img_shapes[one_img_idx][1]
bb.append(_bbox_each)
bb = torch.cat(bb, 0).view(batch_size, -1, 4)
# Compute pseudo coordinates
# extract bounding boxes and compute centres
bb_size = (bb[:, :, 2:] - bb[:, :, :2])
bb_centre = bb[:, :, :2] + 0.5 * bb_size
# Compute pseudo coordinates
pseudo_coord = self._compute_pseudo(bb_centre)
roi_feats_2 = self.bbox_roi_extractor_2(
x[:self.bbox_roi_extractor_2.num_inputs], rois_2)
# shared with last fc
roi_feats_2 = roi_feats_2.view(roi_feats_2.size(0), -1)
for fc in self.bbox_head.shared_fcs:
roi_feats_2 = self.relu(fc(roi_feats_2))
input_graph_learner = roi_feats_2.detach()
#input_graph_learner = roi_feats_2.mean(3).mean(2)
input_graph_learner = input_graph_learner.view(batch_size, -1, input_graph_learner.size(-1))
# Learn adjacency matrix
adjacency_matrix = self.adjacency_learner(input_graph_learner)
# Create the right order for fatched_mixed_weight:
fatched_mixed_weight = fatched_mixed_weight.view(batch_size, -1, self.bbox_head.fc_out_channels + 1)
input_graph_conv = []
for one_img_idx in range(fatched_mixed_weight.size(0)):
one_image_mw = fatched_mixed_weight[one_img_idx]
pos = one_image_mw[pos_inds[one_img_idx]]
neg = one_image_mw[neg_inds[one_img_idx]]
new_mw = torch.cat([pos, neg], dim=0)
input_graph_conv.append(new_mw)
input_graph_conv = torch.cat(input_graph_conv, dim=0).view(batch_size, -1,
self.bbox_head.fc_out_channels + 1)
# Graph convolution 1
neighbourhood_image, neighbourhood_pseudo = self._create_neighbourhood(input_graph_conv,
pseudo_coord,
adjacency_matrix,
neighbourhood_size=self.neigh_size,
weight=True)
hidden_graph_1 = self.graph_convolution_1(
neighbourhood_image, neighbourhood_pseudo)
# hidden_graph_1 = self.bn_1(hidden_graph_1)
hidden_graph_1 = F.relu(hidden_graph_1)
hidden_graph_1 = self.dropout(hidden_graph_1)
# graph convolution 2
hidden_graph_1, neighbourhood_pseudo = self._create_neighbourhood(hidden_graph_1,
pseudo_coord,
adjacency_matrix,
neighbourhood_size=self.neigh_size,
weight=False)
hidden_graph_2 = self.graph_convolution_2(
hidden_graph_1, neighbourhood_pseudo)
hidden_graph_2 = hidden_graph_2.view(-1, self.latent_graph_channel)
# hidden_graph_2 = self.bn_2(hidden_graph_2)
hidden_graph_2 = F.relu(hidden_graph_2)
# hidden_graph_2 = self.bn_1(hidden_graph_2)
cls_score, bbox_pred = self.bbox_head_en(roi_feats_2, hidden_graph_2)
loss_bbox_2 = self.bbox_head_en.loss(cls_score, bbox_pred, labels,
label_weights, bbox_targets,
bbox_weights)
loss_bbox_2 = {'loss_cls_2': loss_bbox_2['loss_cls'], 'loss_reg_2': loss_bbox_2['loss_reg'], 'acc_2': loss_bbox_2['acc']}
losses.update(loss_bbox_2)
if self.with_mask:
mask_targets = self.mask_head.get_mask_target(
pos_proposals, pos_assigned_gt_inds, gt_masks,
self.train_cfg.rcnn)
pos_rois = bbox2roi(pos_proposals)
mask_feats = self.mask_roi_extractor(
x[:self.mask_roi_extractor.num_inputs], pos_rois)
mask_pred = self.mask_head(mask_feats)
loss_mask = self.mask_head.loss(mask_pred, mask_targets,
torch.cat(pos_gt_labels))
losses.update(loss_mask)
return losses
def simple_test(self, img, img_meta, proposals=None, rescale=False):
"""Test without augmentation."""
assert self.with_bbox, "Bbox head must be implemented."
# -- get backbone feature, sample proposals
x = self.extract_feat(img)
proposal_list = self.simple_test_rpn(
x, img_meta,
self.test_cfg.rpn) if proposals is None else proposals
#test_combs = [([i], i) for i in range(2)]
det_bboxes_mul, det_labels_mul = self.simple_test_bboxes_ms(
x, img_meta, proposal_list, rescale=rescale)
#
bbox_result_mul = []
for i in range(3):
det_bboxes = det_bboxes_mul[i]
det_labels = det_labels_mul[i]
bbox_result = bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)
bbox_result_mul.append(bbox_result)
if not self.with_mask:
return bbox_result_mul[2]
else:
segm_results = self.simple_test_mask(
x, img_meta, det_bboxes, det_labels, rescale=rescale)
return bbox_result_mul, segm_results
#multistage test
def simple_test_bboxes_ms(self,
x,
img_meta,
proposal_list,
rescale=False):
batch_size = len(img_meta)
# rpn_outs = self.rpn_head(x)
# proposal_inputs = rpn_outs + (img_shapes, self.rpn_test_cfg)
# proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
# -- get rois by sampling from proposals
rois = bbox2roi(proposal_list)
# img_shapes = [zx['img_shape'] for zx in img_meta]
# img_shape = img_shapes[0]
img_shape = img_meta[0]['img_shape']
scale_factor = img_meta[0]['scale_factor']
# -- forward each stage
(rois_mul, bbox_pred_mul, cls_score_mul) = [], [], []
roi_feats = self.bbox_roi_extractor(
x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
cls_score, bbox_pred = self.bbox_head(roi_feats)
rois_mul.append(rois)
bbox_pred_mul.append(bbox_pred)
cls_score_mul.append(cls_score)
refined_rois = self.bbox_head.regress_by_class(
rois, cls_score, bbox_pred, img_shape)
# Get weight from fc layer to become the pool
feature_pool_weight = torch.cat([self.bbox_head.fc_cls.weight, self.bbox_head.fc_cls.bias.unsqueeze(1)],
1).detach()
# Get the ideal soft weight for each bbox(roi)
# cls_prob = nn.functional.softmax(cls_score, 1)
# fatched_mixed_weight = torch.mm(cls_prob, feature_pool_weight)
max_cls = torch.max(cls_score, 1)
fatched_mixed_weight = feature_pool_weight[max_cls[1], :]
_bbox_each = refined_rois[:, 1:].clone()
_bbox_each[:, 0] = _bbox_each[:, 0] / img_shape[0]
_bbox_each[:, 1] = _bbox_each[:, 1] / img_shape[1]
_bbox_each[:, 2] = _bbox_each[:, 2] / img_shape[0]
_bbox_each[:, 3] = _bbox_each[:, 3] / img_shape[1]
bb = _bbox_each.view(batch_size, -1, 4)
# Compute pseudo coordinates
# extract bounding boxes and compute centres
bb_size = (bb[:, :, 2:] - bb[:, :, :2])
bb_centre = bb[:, :, :2] + 0.5 * bb_size
# Compute pseudo coordinates
pseudo_coord = self._compute_pseudo(bb_centre)
#Stage 3
roi_feats_2 = self.bbox_roi_extractor(
x[:len(self.bbox_roi_extractor.featmap_strides)], refined_rois)
# shared with last fc
roi_feats_2 = roi_feats_2.view(roi_feats_2.size(0), -1)
for fc in self.bbox_head.shared_fcs:
roi_feats_2 = self.relu(fc(roi_feats_2))
input_graph_learner = roi_feats_2.detach()
#input_graph_learner = roi_feats_2.mean(3).mean(2)
input_graph_learner = input_graph_learner.view(batch_size, -1, input_graph_learner.size(-1))
# Learn adjacency matrix
adjacency_matrix = self.adjacency_learner(input_graph_learner)
# Create the right order for fatched_mixed_weight:
input_graph_conv = fatched_mixed_weight.view(batch_size, -1, self.bbox_head.fc_out_channels + 1)
# Graph convolution 1
neighbourhood_image, neighbourhood_pseudo = self._create_neighbourhood(input_graph_conv,
pseudo_coord,
adjacency_matrix,
neighbourhood_size=self.neigh_size,
weight=True)
hidden_graph_1 = self.graph_convolution_1(
neighbourhood_image, neighbourhood_pseudo)
#hidden_graph_1 = self.bn_1(hidden_graph_1)
hidden_graph_1 = F.relu(hidden_graph_1)
hidden_graph_1 = self.dropout(hidden_graph_1)
# graph convolution 2
hidden_graph_1, neighbourhood_pseudo = self._create_neighbourhood(hidden_graph_1,
pseudo_coord,
adjacency_matrix,
neighbourhood_size=self.neigh_size,
weight=False)
hidden_graph_2 = self.graph_convolution_2(
hidden_graph_1, neighbourhood_pseudo)
hidden_graph_2 = hidden_graph_2.view(-1, self.latent_graph_channel)
#hidden_graph_2 = self.bn_2(hidden_graph_2)
hidden_graph_2 = F.relu(hidden_graph_2)
# hidden_graph_2 = self.bn_1(hidden_graph_2)
cls_score, bbox_pred = self.bbox_head_en(roi_feats_2, hidden_graph_2)
rois_mul.append(refined_rois)
bbox_pred_mul.append(bbox_pred)
cls_score_mul.append(cls_score)
#get det bboxes
det_bboxes_mul, det_labels_mul = [], []
det_bboxes, det_labels = self.bbox_head.get_det_bboxes(
rois_mul[0], cls_score_mul[0],
bbox_pred_mul[0],
img_shape,
scale_factor,
rescale=rescale,
nms_cfg=self.test_cfg.rcnn)
det_bboxes_mul.append(det_bboxes)
det_labels_mul.append(det_labels)
# Stage 3
det_bboxes, det_labels = self.bbox_head_en.get_det_bboxes(
rois_mul[1], cls_score_mul[1],
bbox_pred_mul[1],
img_shape,
scale_factor,
rescale=rescale,
nms_cfg=self.test_cfg.rcnn)
det_bboxes_mul.append(det_bboxes)
det_labels_mul.append(det_labels)
# Stage all
rois_all = torch.cat(rois_mul)
bbox_pred_all = torch.cat(bbox_pred_mul)
cls_score_all = torch.cat(cls_score_mul)
det_bboxes, det_labels = self.bbox_head_en.get_det_bboxes(
rois_all, cls_score_all,
bbox_pred_all,
img_shape,
scale_factor,
rescale=rescale,
nms_cfg=self.test_cfg.rcnn)
det_bboxes_mul.append(det_bboxes)
det_labels_mul.append(det_labels)
return det_bboxes_mul, det_labels_mul
def aug_test(self, imgs, img_metas, rescale=False):
"""Test with augmentations.
If rescale is False, then returned bboxes and masks will fit the scale
of imgs[0].
"""
# recompute feats to save memory
proposal_list = self.aug_test_rpn(
self.extract_feats(imgs), img_metas, self.test_cfg.rpn)
det_bboxes, det_labels = self.aug_test_bboxes(
self.extract_feats(imgs), img_metas, proposal_list,
self.test_cfg.rcnn)
if rescale:
_det_bboxes = det_bboxes
else:
_det_bboxes = det_bboxes.clone()
_det_bboxes[:, :4] *= img_metas[0][0]['scale_factor']
bbox_results = bbox2result(_det_bboxes, det_labels,
self.bbox_head.num_classes)
# det_bboxes always keep the original scale
if self.with_mask:
segm_results = self.aug_test_mask(
self.extract_feats(imgs), img_metas, det_bboxes, det_labels)
return bbox_results, segm_results
else:
return bbox_results
def _compute_pseudo(self, bb_centre):
'''
Computes pseudo-coordinates from bounding box centre coordinates
## Inputs:
- bb_centre (batch_size, K, coord_dim)
- polar (bool: polar or euclidean coordinates)
## Returns:
- pseudo_coord (batch_size, K, K, coord_dim)
'''
K = bb_centre.size(1)
# Compute cartesian coordinates (batch_size, K, K, 2)
pseudo_coord = bb_centre.view(-1, K, 1, 2) - \
bb_centre.view(-1, 1, K, 2)
# Conver to polar coordinates
rho = torch.sqrt(
pseudo_coord[:, :, :, 0]**2 + pseudo_coord[:, :, :, 1]**2)
theta = torch.atan2(
pseudo_coord[:, :, :, 0], pseudo_coord[:, :, :, 1])
pseudo_coord = torch.cat(
(torch.unsqueeze(rho, -1), torch.unsqueeze(theta, -1)), dim=-1)
return pseudo_coord
def _create_neighbourhood(self,
features,
pseudo_coord,
adjacency_matrix,
neighbourhood_size=16,
weight=True):
'''
Creates a neighbourhood system for each graph node/image object
## Inputs:
- features (batch_size, K, feat_dim): input image features
- pseudo_coord (batch_size, K, K, coord_dim): pseudo coordinates for graph convolutions
- adjacency_matrix (batch_size, K, K): learned adjacency matrix
- neighbourhood_size (int)
- weight (bool): specify if the features should be weighted by the adjacency matrix values
## Returns:
- neighbourhood_image (batch_size, K, neighbourhood_size, feat_dim)
- neighbourhood_pseudo (batch_size, K, neighbourhood_size, coord_dim)
'''
# Number of graph nodes
K = features.size(1)
# extract top k neighbours for each node and normalise
top_k, top_ind = torch.topk(
adjacency_matrix, k=neighbourhood_size, dim=-1, sorted=False)
top_k = torch.stack([F.softmax(top_k[:, k], dim=1) for k in range(K)]).transpose(0, 1) # (batch_size, K, neighbourhood_size)
# extract top k features and pseudo coordinates
neighbourhood_image = \
self._create_neighbourhood_feat(features, top_ind)
neighbourhood_pseudo = \
self._create_neighbourhood_pseudo(pseudo_coord, top_ind)
# weight neighbourhood features with graph edge weights
if weight:
neighbourhood_image = top_k.unsqueeze(-1)*neighbourhood_image
return neighbourhood_image, neighbourhood_pseudo
def _create_neighbourhood_feat(self, image, top_ind):
'''
## Inputs:
- image (batch_size, K, feat_dim)
- top_ind (batch_size, K, neighbourhood_size)
## Returns:
- neighbourhood_image (batch_size, K, neighbourhood_size, feat_dim)
'''
batch_size = image.size(0)
K = image.size(1)
feat_dim = image.size(2)
neighbourhood_size = top_ind.size(-1)
image = image.unsqueeze(1).expand(batch_size, K, K, feat_dim)
idx = top_ind.unsqueeze(-1).expand(batch_size,
K, neighbourhood_size, feat_dim)
return torch.gather(image, dim=2, index=idx)
def _create_neighbourhood_pseudo(self, pseudo, top_ind):
'''
## Inputs:
- pseudo_coord (batch_size, K, K, coord_dim)
- top_ind (batch_size, K, neighbourhood_size)
## Returns:
- neighbourhood_pseudo (batch_size, K, neighbourhood_size, coord_dim)
'''
batch_size = pseudo.size(0)
K = pseudo.size(1)
coord_dim = pseudo.size(3)
neighbourhood_size = top_ind.size(-1)
idx = top_ind.unsqueeze(-1).expand(batch_size,
K, neighbourhood_size, coord_dim)
return torch.gather(pseudo, dim=2, index=idx)
class GraphLearner(nn.Module):
def __init__(self, in_feature_dim, combined_feature_dim, dropout=0.5):
super(GraphLearner, self).__init__()
'''
## Variables:
- in_feature_dim: dimensionality of input features
- combined_feature_dim: dimensionality of the joint hidden embedding
- K: number of graph nodes/objects on the image
'''
# Parameters
self.in_dim = in_feature_dim
self.combined_dim = combined_feature_dim
# Embedding layers
self.edge_layer_1 = nn.Linear(in_feature_dim,
combined_feature_dim)
self.edge_layer_2 = nn.Linear(combined_feature_dim,
combined_feature_dim)
# Regularisation
self.edge_layer_1 = nn.utils.weight_norm(self.edge_layer_1)
self.edge_layer_2 = nn.utils.weight_norm(self.edge_layer_2)
def forward(self, graph_nodes):
'''
## Inputs:
- graph_nodes (batch_size, K, in_feat_dim): input features
## Returns:
- adjacency matrix (batch_size, K, K)
'''
bs = len(graph_nodes)
graph_nodes = graph_nodes.view(-1, self.in_dim)
# layer 1
h = self.edge_layer_1(graph_nodes)
h = nn.functional.relu(h)
# layer 2
h = self.edge_layer_2(h)
h = nn.functional.relu(h)
# outer product
h = h.view(bs, -1, self.combined_dim)
adjacency_matrix = torch.matmul(h, h.transpose(1, 2))
return adjacency_matrix
class NeighbourhoodGraphConvolution(Module):
'''
Implementation of: https://arxiv.org/pdf/1611.08402.pdf where we consider
a fixed sized neighbourhood of nodes for each feature
'''
def __init__(self,
in_feat_dim,
out_feat_dim,
n_kernels,
coordinate_dim,
bias=False):
super(NeighbourhoodGraphConvolution, self).__init__()
'''
## Variables:
- in_feat_dim: dimensionality of input features
- out_feat_dim: dimensionality of output features
- n_kernels: number of Gaussian kernels to use
- coordinate_dim : dimensionality of the pseudo coordinates
- bias: whether to add a bias to convolutional kernels
'''
# Set parameters
self.n_kernels = n_kernels
self.coordinate_dim = coordinate_dim
self.in_feat_dim = in_feat_dim
self.out_feat_dim = out_feat_dim
self.bias = bias
# Convolution filters weights
self.conv_weights = nn.ModuleList([nn.Linear(
in_feat_dim, out_feat_dim//n_kernels, bias=bias) for i in range(n_kernels)])
# Parameters of the Gaussian kernels
self.mean_rho = Parameter(torch.Tensor(n_kernels, 1))
self.mean_theta = Parameter(torch.Tensor(n_kernels, 1))
self.precision_rho = Parameter(torch.Tensor(n_kernels, 1))
self.precision_theta = Parameter(torch.Tensor(n_kernels, 1))
self.init_parameters()
def init_parameters(self):
# Initialise Gaussian parameters
self.mean_theta.data.uniform_(-np.pi, np.pi)
self.mean_rho.data.uniform_(0, 1.0)
self.precision_theta.data.uniform_(0.0, 1.0)
self.precision_rho.data.uniform_(0.0, 1.0)
def forward(self, neighbourhood_features, neighbourhood_pseudo_coord):
'''
## Inputs:
- neighbourhood_features (batch_size, K, neighbourhood_size, in_feat_dim)
- neighbourhood_pseudo_coord (batch_size, K, neighbourhood_size, coordinate_dim)
## Returns:
- convolved_features (batch_size, K, neighbourhood_size, out_feat_dim)
'''
# set parameters
batch_size = neighbourhood_features.size(0)
K = neighbourhood_features.size(1)
neighbourhood_size = neighbourhood_features.size(2)
# compute pseudo coordinate kernel weights
weights = self.get_gaussian_weights(neighbourhood_pseudo_coord)
weights = weights.view(
batch_size*K, neighbourhood_size, self.n_kernels)
# compute convolved features
neighbourhood_features = neighbourhood_features.view(
batch_size*K, neighbourhood_size, -1)
convolved_features = self.convolution(neighbourhood_features, weights)
convolved_features = convolved_features.view(-1, K, self.out_feat_dim)
return convolved_features
def get_gaussian_weights(self, pseudo_coord):
'''
## Inputs:
- pseudo_coord (batch_size, K, K, pseudo_coord_dim)
## Returns:
- weights (batch_size*K, neighbourhood_size, n_kernels)
'''
# compute rho weights
diff = (pseudo_coord[:, :, :, 0].contiguous().view(-1, 1) - self.mean_rho.view(1, -1))**2
weights_rho = torch.exp(-0.5 * diff /
(1e-14 + self.precision_rho.view(1, -1)**2))
# compute theta weights
first_angle = torch.abs(pseudo_coord[:, :, :, 1].contiguous().view(-1, 1) - self.mean_theta.view(1, -1))
second_angle = torch.abs(2 * np.pi - first_angle)
weights_theta = torch.exp(-0.5 * (torch.min(first_angle, second_angle)**2)
/ (1e-14 + self.precision_theta.view(1, -1)**2))
weights = weights_rho * weights_theta
weights[(weights != weights).detach()] = 0
# normalise weights
weights = weights / (torch.sum(weights, dim=1, keepdim=True)+1e-10)
return weights
def convolution(self, neighbourhood, weights):
'''
## Inputs:
- neighbourhood (batch_size*K, neighbourhood_size, in_feat_dim)
- weights (batch_size*K, neighbourhood_size, n_kernels)
## Returns:
- convolved_features (batch_size*K, out_feat_dim)
'''
# patch operator
weighted_neighbourhood = torch.bmm(
weights.transpose(1, 2), neighbourhood)
# convolutions
weighted_neighbourhood = [self.conv_weights[i](weighted_neighbourhood[:, i]) for i in range(self.n_kernels)]
convolved_features = torch.cat([i.unsqueeze(1) for i in weighted_neighbourhood], dim=1)
convolved_features = convolved_features.view(-1, self.out_feat_dim)
return convolved_features
================================================
FILE: mmdet/models/detectors/single_stage.py
================================================
import torch.nn as nn
from .base import BaseDetector
from .. import builder
from ..registry import DETECTORS
from mmdet.core import bbox2result
@DETECTORS.register_module
class SingleStageDetector(BaseDetector):
def __init__(self,
backbone,
neck=None,
bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(SingleStageDetector, self).__init__()
self.backbone = builder.build_backbone(backbone)
if neck is not None:
self.neck = builder.build_neck(neck)
self.bbox_head = builder.build_head(bbox_head)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.init_weights(pretrained=pretrained)
def init_weights(self, pretrained=None):
super(SingleStageDetector, self).init_weights(pretrained)
self.backbone.init_weights(pretrained=pretrained)
if self.with_neck:
if isinstance(self.neck, nn.Sequential):
for m in self.neck:
m.init_weights()
else:
self.neck.init_weights()
self.bbox_head.init_weights()
def extract_feat(self, img):
x = self.backbone(img)
if self.with_neck:
x = self.neck(x)
return x
def forward_train(self, img, img_metas, gt_bboxes, gt_labels):
x = self.extract_feat(img)
outs = self.bbox_head(x)
loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, self.train_cfg)
losses = self.bbox_head.loss(*loss_inputs)
return losses
def simple_test(self, img, img_meta, rescale=False):
x = self.extract_feat(img)
outs = self.bbox_head(x)
bbox_inputs = outs + (img_meta, self.test_cfg, rescale)
bbox_list = self.bbox_head.get_bboxes(*bbox_inputs)
bbox_results = [
bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)
for det_bboxes, det_labels in bbox_list
]
return bbox_results[0]
def aug_test(self, imgs, img_metas, rescale=False):
raise NotImplementedError
================================================
FILE: mmdet/models/detectors/test_mixins.py
================================================
from mmdet.core import (bbox2roi, bbox_mapping, merge_aug_proposals,
merge_aug_bboxes, merge_aug_masks, multiclass_nms)
class RPNTestMixin(object):
def simple_test_rpn(self, x, img_meta, rpn_test_cfg):
rpn_outs = self.rpn_head(x)
proposal_inputs = rpn_outs + (img_meta, rpn_test_cfg)
proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
return proposal_list
def aug_test_rpn(self, feats, img_metas, rpn_test_cfg):
imgs_per_gpu = len(img_metas[0])
aug_proposals = [[] for _ in range(imgs_per_gpu)]
for x, img_meta in zip(feats, img_metas):
proposal_list = self.simple_test_rpn(x, img_meta, rpn_test_cfg)
for i, proposals in enumerate(proposal_list):
aug_proposals[i].append(proposals)
# after merging, proposals will be rescaled to the original image size
merged_proposals = [
merge_aug_proposals(proposals, img_meta, rpn_test_cfg)
for proposals, img_meta in zip(aug_proposals, img_metas)
]
return merged_proposals
class BBoxTestMixin(object):
def simple_test_bboxes(self,
x,
img_meta,
proposals,
rcnn_test_cfg,
rescale=False):
"""Test only det bboxes without augmentation."""
rois = bbox2roi(proposals)
roi_feats = self.bbox_roi_extractor(
x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
cls_score, bbox_pred = self.bbox_head(roi_feats)
img_shape = img_meta[0]['img_shape']
scale_factor = img_meta[0]['scale_factor']
det_bboxes, det_labels = self.bbox_head.get_det_bboxes(
rois,
cls_score,
bbox_pred,
img_shape,
scale_factor,
rescale=rescale,
cfg=rcnn_test_cfg)
return det_bboxes, det_labels
def aug_test_bboxes(self, feats, img_metas, proposal_list, rcnn_test_cfg):
aug_bboxes = []
aug_scores = []
for x, img_meta in zip(feats, img_metas):
# only one image in the batch
img_shape = img_meta[0]['img_shape']
scale_factor = img_meta[0]['scale_factor']
flip = img_meta[0]['flip']
# TODO more flexible
proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
scale_factor, flip)
rois = bbox2roi([proposals])
# recompute feature maps to save GPU memory
roi_feats = self.bbox_roi_extractor(
x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
cls_score, bbox_pred = self.bbox_head(roi_feats)
bboxes, scores = self.bbox_head.get_det_bboxes(
rois,
cls_score,
bbox_pred,
img_shape,
scale_factor,
rescale=False,
cfg=None)
aug_bboxes.append(bboxes)
aug_scores.append(scores)
# after merging, bboxes will be rescaled to the original image size
merged_bboxes, merged_scores = merge_aug_bboxes(
aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
det_bboxes, det_labels = multiclass_nms(
merged_bboxes, merged_scores, rcnn_test_cfg.score_thr,
rcnn_test_cfg.nms, rcnn_test_cfg.max_per_img)
return det_bboxes, det_labels
class MaskTestMixin(object):
def simple_test_mask(self,
x,
img_meta,
det_bboxes,
det_labels,
rescale=False):
# image shape of the first image in the batch (only one)
ori_shape = img_meta[0]['ori_shape']
scale_factor = img_meta[0]['scale_factor']
if det_bboxes.shape[0] == 0:
segm_result = [[] for _ in range(self.mask_head.num_classes - 1)]
else:
# if det_bboxes is rescaled to the original image size, we need to
# rescale it back to the testing scale to obtain RoIs.
_bboxes = (det_bboxes[:, :4] * scale_factor
if rescale else det_bboxes)
mask_rois = bbox2roi([_bboxes])
mask_feats = self.mask_roi_extractor(
x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois)
mask_pred = self.mask_head(mask_feats)
segm_result = self.mask_head.get_seg_masks(
mask_pred, _bboxes, det_labels, self.test_cfg.rcnn, ori_shape,
scale_factor, rescale)
return segm_result
def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels):
if det_bboxes.shape[0] == 0:
segm_result = [[] for _ in range(self.mask_head.num_classes - 1)]
else:
aug_masks = []
for x, img_meta in zip(feats, img_metas):
img_shape = img_meta[0]['img_shape']
scale_factor = img_meta[0]['scale_factor']
flip = img_meta[0]['flip']
_bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
scale_factor, flip)
mask_rois = bbox2roi([_bboxes])
mask_feats = self.mask_roi_extractor(
x[:len(self.mask_roi_extractor.featmap_strides)],
mask_rois)
mask_pred = self.mask_head(mask_feats)
# convert to numpy array to save memory
aug_masks.append(mask_pred.sigmoid().cpu().numpy())
merged_masks = merge_aug_masks(aug_masks, img_metas,
self.test_cfg.rcnn)
ori_shape = img_metas[0][0]['ori_shape']
segm_result = self.mask_head.get_seg_masks(
merged_masks,
det_bboxes,
det_labels,
self.test_cfg.rcnn,
ori_shape,
scale_factor=1.0,
rescale=False)
return segm_result
================================================
FILE: mmdet/models/detectors/two_stage.py
================================================
import torch
import torch.nn as nn
from .base import BaseDetector
from .test_mixins import RPNTestMixin, BBoxTestMixin, MaskTestMixin
from .. import builder
from ..registry import DETECTORS
from mmdet.core import bbox2roi, bbox2result, build_assigner, build_sampler
@DETECTORS.register_module
class TwoStageDetector(BaseDetector, RPNTestMixin, BBoxTestMixin,
MaskTestMixin):
def __init__(self,
backbone,
neck=None,
rpn_head=None,
bbox_roi_extractor=None,
bbox_head=None,
mask_roi_extractor=None,
mask_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(TwoStageDetector, self).__init__()
self.backbone = builder.build_backbone(backbone)
if neck is not None:
self.neck = builder.build_neck(neck)
else:
raise NotImplementedError
if rpn_head is not None:
self.rpn_head = builder.build_head(rpn_head)
if bbox_head is not None:
self.bbox_roi_extractor = builder.build_roi_extractor(
bbox_roi_extractor)
self.bbox_head = builder.build_head(bbox_head)
if mask_head is not None:
self.mask_roi_extractor = builder.build_roi_extractor(
mask_roi_extractor)
self.mask_head = builder.build_head(mask_head)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.init_weights(pretrained=pretrained)
@property
def with_rpn(self):
return hasattr(self, 'rpn_head') and self.rpn_head is not None
def init_weights(self, pretrained=None):
super(TwoStageDetector, self).init_weights(pretrained)
self.backbone.init_weights(pretrained=pretrained)
if self.with_neck:
if isinstance(self.neck, nn.Sequential):
for m in self.neck:
m.init_weights()
else:
self.neck.init_weights()
if self.with_rpn:
self.rpn_head.init_weights()
if self.with_bbox:
self.bbox_roi_extractor.init_weights()
self.bbox_head.init_weights()
if self.with_mask:
self.mask_roi_extractor.init_weights()
self.mask_head.init_weights()
def extract_feat(self, img):
x = self.backbone(img)
if self.with_neck:
x = self.neck(x)
return x
def forward_train(self,
img,
img_meta,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
gt_masks=None,
proposals=None):
x = self.extract_feat(img)
losses = dict()
# RPN forward and loss
if self.with_rpn:
rpn_outs = self.rpn_head(x)
rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
self.train_cfg.rpn)
rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
losses.update(rpn_losses)
proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
else:
proposal_list = proposals
# assign gts and sample proposals
if self.with_bbox or self.with_mask:
bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
bbox_sampler = build_sampler(
self.train_cfg.rcnn.sampler, context=self)
num_imgs = img.size(0)
sampling_results = []
for i in range(num_imgs):
assign_result = bbox_assigner.assign(
proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i],
gt_labels[i])
sampling_result = bbox_sampler.sample(
assign_result,
proposal_list[i],
gt_bboxes[i],
gt_labels[i],
feats=[lvl_feat[i][None] for lvl_feat in x])
sampling_results.append(sampling_result)
# bbox head forward and loss
if self.with_bbox:
rois = bbox2roi([res.bboxes for res in sampling_results])
# TODO: a more flexible way to decide which feature maps to use
bbox_feats = self.bbox_roi_extractor(
x[:self.bbox_roi_extractor.num_inputs], rois)
cls_score, bbox_pred = self.bbox_head(bbox_feats)
bbox_targets = self.bbox_head.get_target(
sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn)
loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
*bbox_targets)
losses.update(loss_bbox)
# mask head forward and loss
if self.with_mask:
pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
mask_feats = self.mask_roi_extractor(
x[:self.mask_roi_extractor.num_inputs], pos_rois)
mask_pred = self.mask_head(mask_feats)
mask_targets = self.mask_head.get_target(
sampling_results, gt_masks, self.train_cfg.rcnn)
pos_labels = torch.cat(
[res.pos_gt_labels for res in sampling_results])
loss_mask = self.mask_head.loss(mask_pred, mask_targets,
pos_labels)
losses.update(loss_mask)
return losses
def simple_test(self, img, img_meta, proposals=None, rescale=False):
"""Test without augmentation."""
assert self.with_bbox, "Bbox head must be implemented."
x = self.extract_feat(img)
proposal_list = self.simple_test_rpn(
x, img_meta, self.test_cfg.rpn) if proposals is None else proposals
det_bboxes, det_labels = self.simple_test_bboxes(
x, img_meta, proposal_list, self.test_cfg.rcnn, rescale=rescale)
bbox_results = bbox2result(det_bboxes, det_labels,
self.bbox_head.num_classes)
if not self.with_mask:
return bbox_results
else:
segm_results = self.simple_test_mask(
x, img_meta, det_bboxes, det_labels, rescale=rescale)
return bbox_results, segm_results
def aug_test(self, imgs, img_metas, rescale=False):
"""Test with augmentations.
If rescale is False, then returned bboxes and masks will fit the scale
of imgs[0].
"""
# recompute feats to save memory
proposal_list = self.aug_test_rpn(
self.extract_feats(imgs), img_metas, self.test_cfg.rpn)
det_bboxes, det_labels = self.aug_test_bboxes(
self.extract_feats(imgs), img_metas, proposal_list,
self.test_cfg.rcnn)
if rescale:
_det_bboxes = det_bboxes
else:
_det_bboxes = det_bboxes.clone()
_det_bboxes[:, :4] *= img_metas[0][0]['scale_factor']
bbox_results = bbox2result(_det_bboxes, det_labels,
self.bbox_head.num_classes)
# det_bboxes always keep the original scale
if self.with_mask:
segm_results = self.aug_test_mask(
self.extract_feats(imgs), img_metas, det_bboxes, det_labels)
return bbox_results, segm_results
else:
return bbox_results
================================================
FILE: mmdet/models/mask_heads/__init__.py
================================================
from .fcn_mask_head import FCNMaskHead
__all__ = ['FCNMaskHead']
================================================
FILE: mmdet/models/mask_heads/fcn_mask_head.py
================================================
import mmcv
import numpy as np
import pycocotools.mask as mask_util
import torch
import torch.nn as nn
from ..registry import HEADS
from ..utils import ConvModule
from mmdet.core import mask_cross_entropy, mask_target
@HEADS.register_module
class FCNMaskHead(nn.Module):
def __init__(self,
num_convs=4,
roi_feat_size=14,
in_channels=256,
conv_kernel_size=3,
conv_out_channels=256,
upsample_method='deconv',
upsample_ratio=2,
num_classes=81,
class_agnostic=False,
normalize=None):
super(FCNMaskHead, self).__init__()
if upsample_method not in [None, 'deconv', 'nearest', 'bilinear']:
raise ValueError(
'Invalid upsample method {}, accepted methods '
'are "deconv", "nearest", "bilinear"'.format(upsample_method))
self.num_convs = num_convs
self.roi_feat_size = roi_feat_size # WARN: not used and reserved
self.in_channels = in_channels
self.conv_kernel_size = conv_kernel_size
self.conv_out_channels = conv_out_channels
self.upsample_method = upsample_method
self.upsample_ratio = upsample_ratio
self.num_classes = num_classes
self.class_agnostic = class_agnostic
self.normalize = normalize
self.with_bias = normalize is None
self.convs = nn.ModuleList()
for i in range(self.num_convs):
in_channels = (self.in_channels
if i == 0 else self.conv_out_channels)
padding = (self.conv_kernel_size - 1) // 2
self.convs.append(
ConvModule(
in_channels,
self.conv_out_channels,
3,
padding=padding,
normalize=normalize,
bias=self.with_bias))
if self.upsample_method is None:
self.upsample = None
elif self.upsample_method == 'deconv':
self.upsample = nn.ConvTranspose2d(
self.conv_out_channels,
self.conv_out_channels,
self.upsample_ratio,
stride=self.upsample_ratio)
else:
self.upsample = nn.Upsample(
scale_factor=self.upsample_ratio, mode=self.upsample_method)
out_channels = 1 if self.class_agnostic else self.num_classes
self.conv_logits = nn.Conv2d(self.conv_out_channels, out_channels, 1)
self.relu = nn.ReLU(inplace=True)
self.debug_imgs = None
def init_weights(self):
for m in [self.upsample, self.conv_logits]:
if m is None:
continue
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
nn.init.constant_(m.bias, 0)
def forward(self, x):
for conv in self.convs:
x = conv(x)
if self.upsample is not None:
x = self.upsample(x)
if self.upsample_method == 'deconv':
x = self.relu(x)
mask_pred = self.conv_logits(x)
return mask_pred
def get_target(self, sampling_results, gt_masks, rcnn_train_cfg):
pos_proposals = [res.pos_bboxes for res in sampling_results]
pos_assigned_gt_inds = [
res.pos_assigned_gt_inds for res in sampling_results
]
mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds,
gt_masks, rcnn_train_cfg)
return mask_targets
def loss(self, mask_pred, mask_targets, labels):
loss = dict()
if self.class_agnostic:
loss_mask = mask_cross_entropy(mask_pred, mask_targets,
torch.zeros_like(labels))
else:
loss_mask = mask_cross_entropy(mask_pred, mask_targets, labels)
loss['loss_mask'] = loss_mask
return loss
def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg,
ori_shape, scale_factor, rescale):
"""Get segmentation masks from mask_pred and bboxes.
Args:
mask_pred (Tensor or ndarray): shape (n, #class+1, h, w).
For single-scale testing, mask_pred is the direct output of
model, whose type is Tensor, while for multi-scale testing,
it will be converted to numpy array outside of this method.
det_bboxes (Tensor): shape (n, 4/5)
det_labels (Tensor): shape (n, )
img_shape (Tensor): shape (3, )
rcnn_test_cfg (dict): rcnn testing config
ori_shape: original image size
Returns:
list[list]: encoded masks
"""
if isinstance(mask_pred, torch.Tensor):
mask_pred = mask_pred.sigmoid().cpu().numpy()
assert isinstance(mask_pred, np.ndarray)
cls_segms = [[] for _ in range(self.num_classes - 1)]
bboxes = det_bboxes.cpu().numpy()[:, :4]
labels = det_labels.cpu().numpy() + 1
if rescale:
img_h, img_w = ori_shape[:2]
else:
img_h = np.round(ori_shape[0] * scale_factor).astype(np.int32)
img_w = np.round(ori_shape[1] * scale_factor).astype(np.int32)
scale_factor = 1.0
for i in range(bboxes.shape[0]):
bbox = (bboxes[i, :] / scale_factor).astype(np.int32)
label = labels[i]
w = max(bbox[2] - bbox[0] + 1, 1)
h = max(bbox[3] - bbox[1] + 1, 1)
if not self.class_agnostic:
mask_pred_ = mask_pred[i, label, :, :]
else:
mask_pred_ = mask_pred[i, 0, :, :]
im_mask = np.zeros((img_h, img_w), dtype=np.uint8)
bbox_mask = mmcv.imresize(mask_pred_, (w, h))
bbox_mask = (bbox_mask > rcnn_test_cfg.mask_thr_binary).astype(
np.uint8)
im_mask[bbox[1]:bbox[1] + h, bbox[0]:bbox[0] + w] = bbox_mask
rle = mask_util.encode(
np.array(im_mask[:, :, np.newaxis], order='F'))[0]
cls_segms[label - 1].append(rle)
return cls_segms
================================================
FILE: mmdet/models/necks/__init__.py
================================================
from .fpn import FPN
__all__ = ['FPN']
================================================
FILE: mmdet/models/necks/fpn.py
================================================
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init
from ..utils import ConvModule
from ..registry import NECKS
@NECKS.register_module
class FPN(nn.Module):
def __init__(self,
in_channels,
out_channels,
num_outs,
start_level=0,
end_level=-1,
add_extra_convs=False,
normalize=None,
activation=None):
super(FPN, self).__init__()
assert isinstance(in_channels, list)
self.in_channels = in_channels
self.out_channels = out_channels
self.num_ins = len(in_channels)
self.num_outs = num_outs
self.activation = activation
self.with_bias = normalize is None
if end_level == -1:
self.backbone_end_level = self.num_ins
assert num_outs >= self.num_ins - start_level
else:
# if end_level < inputs, no extra level is allowed
self.backbone_end_level = end_level
assert end_level <= len(in_channels)
assert num_outs == end_level - start_level
self.start_level = start_level
self.end_level = end_level
self.add_extra_convs = add_extra_convs
self.lateral_convs = nn.ModuleList()
self.fpn_convs = nn.ModuleList()
for i in range(self.start_level, self.backbone_end_level):
l_conv = ConvModule(
in_channels[i],
out_channels,
1,
normalize=normalize,
bias=self.with_bias,
activation=self.activation,
inplace=False)
fpn_conv = ConvModule(
out_channels,
out_channels,
3,
padding=1,
normalize=normalize,
bias=self.with_bias,
activation=self.activation,
inplace=False)
self.lateral_convs.append(l_conv)
self.fpn_convs.append(fpn_conv)
# lvl_id = i - self.start_level
# setattr(self, 'lateral_conv{}'.format(lvl_id), l_conv)
# setattr(self, 'fpn_conv{}'.format(lvl_id), fpn_conv)
# add extra conv layers (e.g., RetinaNet)
extra_levels = num_outs - self.backbone_end_level + self.start_level
if add_extra_convs and extra_levels >= 1:
for i in range(extra_levels):
in_channels = (self.in_channels[self.backbone_end_level - 1]
if i == 0 else out_channels)
extra_fpn_conv = ConvModule(
in_channels,
out_channels,
3,
stride=2,
padding=1,
normalize=normalize,
bias=self.with_bias,
activation=self.activation,
inplace=False)
self.fpn_convs.append(extra_fpn_conv)
# default init_weights for conv(msra) and norm in ConvModule
def init_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
xavier_init(m, distribution='uniform')
def forward(self, inputs):
assert len(inputs) == len(self.in_channels)
# build laterals
laterals = [
lateral_conv(inputs[i + self.start_level])
for i, lateral_conv in enumerate(self.lateral_convs)
]
# build top-down path
used_backbone_levels = len(laterals)
for i in range(used_backbone_levels - 1, 0, -1):
laterals[i - 1] += F.interpolate(
laterals[i], scale_factor=2, mode='nearest')
# build outputs
# part 1: from original levels
outs = [
self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
]
# part 2: add extra levels
if self.num_outs > len(outs):
# use max pool to get more levels on top of outputs
# (e.g., Faster R-CNN, Mask R-CNN)
if not self.add_extra_convs:
for i in range(self.num_outs - used_backbone_levels):
outs.append(F.max_pool2d(outs[-1], 1, stride=2))
# add conv layers on top of original feature maps (RetinaNet)
else:
orig = inputs[self.backbone_end_level - 1]
outs.append(self.fpn_convs[used_backbone_levels](orig))
for i in range(used_backbone_levels + 1, self.num_outs):
# BUG: we should add relu before each extra conv
outs.append(self.fpn_convs[i](outs[-1]))
return tuple(outs)
================================================
FILE: mmdet/models/registry.py
================================================
import torch.nn as nn
class Registry(object):
def __init__(self, name):
self._name = name
self._module_dict = dict()
@property
def name(self):
return self._name
@property
def module_dict(self):
return self._module_dict
def _register_module(self, module_class):
"""Register a module.
Args:
module (:obj:`nn.Module`): Module to be registered.
"""
if not issubclass(module_class, nn.Module):
raise TypeError(
'module must be a child of nn.Module, but got {}'.format(
type(module_class)))
module_name = module_class.__name__
if module_name in self._module_dict:
raise KeyError('{} is already registered in {}'.format(
module_name, self.name))
self._module_dict[module_name] = module_class
def register_module(self, cls):
self._register_module(cls)
return cls
BACKBONES = Registry('backbone')
NECKS = Registry('neck')
ROI_EXTRACTORS = Registry('roi_extractor')
HEADS = Registry('head')
DETECTORS = Registry('detector')
================================================
FILE: mmdet/models/roi_extractors/__init__.py
================================================
from .single_level import SingleRoIExtractor
__all__ = ['SingleRoIExtractor']
================================================
FILE: mmdet/models/roi_extractors/single_level.py
================================================
from __future__ import division
import torch
import torch.nn as nn
from mmdet import ops
from ..registry import ROI_EXTRACTORS
@ROI_EXTRACTORS.register_module
class SingleRoIExtractor(nn.Module):
"""Extract RoI features from a single level feature map.
If there are mulitple input feature levels, each RoI is mapped to a level
according to its scale.
Args:
roi_layer (dict): Specify RoI layer type and arguments.
out_channels (int): Output channels of RoI layers.
featmap_strides (int): Strides of input feature maps.
finest_scale (int): Scale threshold of mapping to level 0.
"""
def __init__(self,
roi_layer,
out_channels,
featmap_strides,
finest_scale=56):
super(SingleRoIExtractor, self).__init__()
self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides)
self.out_channels = out_channels
self.featmap_strides = featmap_strides
self.finest_scale = finest_scale
@property
def num_inputs(self):
"""int: Input feature map levels."""
return len(self.featmap_strides)
def init_weights(self):
pass
def build_roi_layers(self, layer_cfg, featmap_strides):
cfg = layer_cfg.copy()
layer_type = cfg.pop('type')
assert hasattr(ops, layer_type)
layer_cls = getattr(ops, layer_type)
roi_layers = nn.ModuleList(
[layer_cls(spatial_scale=1 / s, **cfg) for s in featmap_strides])
return roi_layers
def map_roi_levels(self, rois, num_levels):
"""Map rois to corresponding feature levels by scales.
- scale < finest_scale: level 0
- finest_scale <= scale < finest_scale * 2: level 1
- finest_scale * 2 <= scale < finest_scale * 4: level 2
- scale >= finest_scale * 4: level 3
Args:
rois (Tensor): Input RoIs, shape (k, 5).
num_levels (int): Total level number.
Returns:
Tensor: Level index (0-based) of each RoI, shape (k, )
"""
scale = torch.sqrt(
(rois[:, 3] - rois[:, 1] + 1) * (rois[:, 4] - rois[:, 2] + 1))
target_lvls = torch.floor(torch.log2(scale / self.finest_scale + 1e-6))
target_lvls = target_lvls.clamp(min=0, max=num_levels - 1).long()
return target_lvls
def forward(self, feats, rois):
if len(feats) == 1:
return self.roi_layers[0](feats[0], rois)
out_size = self.roi_layers[0].out_size
num_levels = len(feats)
target_lvls = self.map_roi_levels(rois, num_levels)
roi_feats = torch.cuda.FloatTensor(rois.size()[0], self.out_channels,
out_size, out_size).fill_(0)
for i in range(num_levels):
inds = target_lvls == i
if inds.any():
rois_ = rois[inds, :]
roi_feats_t = self.roi_layers[i](feats[i], rois_)
roi_feats[inds] += roi_feats_t
return roi_feats
================================================
FILE: mmdet/models/utils/__init__.py
================================================
from .conv_module import ConvModule
from .norm import build_norm_layer
from .weight_init import (xavier_init, normal_init, uniform_init, kaiming_init,
bias_init_with_prob)
__all__ = [
'ConvModule', 'build_norm_layer', 'xavier_init', 'normal_init',
'uniform_init', 'kaiming_init', 'bias_init_with_prob'
]
================================================
FILE: mmdet/models/utils/conv_module.py
================================================
import warnings
import torch.nn as nn
from mmcv.cnn import kaiming_init, constant_init
from .norm import build_norm_layer
class ConvModule(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
normalize=None,
activation='relu',
inplace=True,
activate_last=True):
super(ConvModule, self).__init__()
self.with_norm = normalize is not None
self.with_activatation = activation is not None
self.with_bias = bias
self.activation = activation
self.activate_last = activate_last
if self.with_norm and self.with_bias:
warnings.warn('ConvModule has norm and bias at the same time')
self.conv = nn.Conv2d(
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias=bias)
self.in_channels = self.conv.in_channels
self.out_channels = self.conv.out_channels
self.kernel_size = self.conv.kernel_size
self.stride = self.conv.stride
self.padding = self.conv.padding
self.dilation = self.conv.dilation
self.transposed = self.conv.transposed
self.output_padding = self.conv.output_padding
self.groups = self.conv.groups
if self.with_norm:
norm_channels = out_channels if self.activate_last else in_channels
self.norm_name, norm = build_norm_layer(normalize, norm_channels)
self.add_module(self.norm_name, norm)
if self.with_activatation:
assert activation in ['relu'], 'Only ReLU supported.'
if self.activation == 'relu':
self.activate = nn.ReLU(inplace=inplace)
# Default using msra init
self.init_weights()
@property
def norm(self):
return getattr(self, self.norm_name)
def init_weights(self):
nonlinearity = 'relu' if self.activation is None else self.activation
kaiming_init(self.conv, nonlinearity=nonlinearity)
if self.with_norm:
constant_init(self.norm, 1, bias=0)
def forward(self, x, activate=True, norm=True):
if self.activate_last:
x = self.conv(x)
if norm and self.with_norm:
x = self.norm(x)
if activate and self.with_activatation:
x = self.activate(x)
else:
if norm and self.with_norm:
x = self.norm(x)
if activate and self.with_activatation:
x = self.activate(x)
x = self.conv(x)
return x
================================================
FILE: mmdet/models/utils/norm.py
================================================
import torch.nn as nn
norm_cfg = {
# format: layer_type: (abbreviation, module)
'BN': ('bn', nn.BatchNorm2d),
'SyncBN': ('bn', None),
'GN': ('gn', nn.GroupNorm),
# and potentially 'SN'
}
def build_norm_layer(cfg, num_features, postfix=''):
""" Build normalization layer
Args:
cfg (dict): cfg should contain:
type (str): identify norm layer type.
layer args: args needed to instantiate a norm layer.
frozen (bool): [optional] whether stop gradient updates
of norm layer, it is helpful to set frozen mode
in backbone's norms.
num_features (int): number of channels from input
postfix (int, str): appended into norm abbreation to
create named layer.
Returns:
name (str): abbreation + postfix
layer (nn.Module): created norm layer
"""
assert isinstance(cfg, dict) and 'type' in cfg
cfg_ = cfg.copy()
layer_type = cfg_.pop('type')
if layer_type not in norm_cfg:
raise KeyError('Unrecognized norm type {}'.format(layer_type))
else:
abbr, norm_layer = norm_cfg[layer_type]
if norm_layer is None:
raise NotImplementedError
assert isinstance(postfix, (int, str))
name = abbr + str(postfix)
frozen = cfg_.pop('frozen', False)
cfg_.setdefault('eps', 1e-5)
if layer_type != 'GN':
layer = norm_layer(num_features, **cfg_)
else:
assert 'num_groups' in cfg_
layer = norm_layer(num_channels=num_features, **cfg_)
if frozen:
for param in layer.parameters():
param.requires_grad = False
return name, layer
================================================
FILE: mmdet/models/utils/weight_init.py
================================================
import numpy as np
import torch.nn as nn
def xavier_init(module, gain=1, bias=0, distribution='normal'):
assert distribution in ['uniform', 'normal']
if distribution == 'uniform':
nn.init.xavier_uniform_(module.weight, gain=gain)
else:
nn.init.xavier_normal_(module.weight, gain=gain)
if hasattr(module, 'bias'):
nn.init.constant_(module.bias, bias)
def normal_init(module, mean=0, std=1, bias=0):
nn.init.normal_(module.weight, mean, std)
if hasattr(module, 'bias'):
nn.init.constant_(module.bias, bias)
def uniform_init(module, a=0, b=1, bias=0):
nn.init.uniform_(module.weight, a, b)
if hasattr(module, 'bias'):
nn.init.constant_(module.bias, bias)
def kaiming_init(module,
mode='fan_out',
nonlinearity='relu',
bias=0,
distribution='normal'):
assert distribution in ['uniform', 'normal']
if distribution == 'uniform':
nn.init.kaiming_uniform_(
module.weight, mode=mode, nonlinearity=nonlinearity)
else:
nn.init.kaiming_normal_(
module.weight, mode=mode, nonlinearity=nonlinearity)
if hasattr(module, 'bias'):
nn.init.constant_(module.bias, bias)
def bias_init_with_prob(prior_prob):
""" initialize conv/fc bias value according to giving probablity"""
bias_init = float(-np.log((1 - prior_prob) / prior_prob))
return bias_init
================================================
FILE: mmdet/ops/__init__.py
================================================
from .dcn import (DeformConv, DeformRoIPooling, DeformRoIPoolingPack,
ModulatedDeformRoIPoolingPack, ModulatedDeformConv,
ModulatedDeformConvPack, deform_conv, modulated_deform_conv,
deform_roi_pooling)
from .nms import nms, soft_nms
from .roi_align import RoIAlign, roi_align
from .roi_pool import RoIPool, roi_pool
__all__ = [
'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool',
'DeformConv', 'DeformRoIPooling', 'DeformRoIPoolingPack',
'ModulatedDeformRoIPoolingPack', 'ModulatedDeformConv',
'ModulatedDeformConvPack', 'deform_conv', 'modulated_deform_conv',
'deform_roi_pooling'
]
================================================
FILE: mmdet/ops/dcn/__init__.py
================================================
from .functions.deform_conv import deform_conv, modulated_deform_conv
from .functions.deform_pool import deform_roi_pooling
from .modules.deform_conv import (DeformConv, ModulatedDeformConv,
ModulatedDeformConvPack)
from .modules.deform_pool import (DeformRoIPooling, DeformRoIPoolingPack,
ModulatedDeformRoIPoolingPack)
__all__ = [
'DeformConv', 'DeformRoIPooling', 'DeformRoIPoolingPack',
'ModulatedDeformRoIPoolingPack', 'ModulatedDeformConv',
'ModulatedDeformConvPack', 'deform_conv',
'modulated_deform_conv', 'deform_roi_pooling'
]
================================================
FILE: mmdet/ops/dcn/functions/__init__.py
================================================
================================================
FILE: mmdet/ops/dcn/functions/deform_conv.py
================================================
import torch
from torch.autograd import Function
from torch.nn.modules.utils import _pair
from .. import deform_conv_cuda
class DeformConvFunction(Function):
@staticmethod
def forward(ctx,
input,
offset,
weight,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
im2col_step=64):
if input is not None and input.dim() != 4:
raise ValueError(
"Expected 4D tensor as input, got {}D tensor instead.".format(
input.dim()))
ctx.stride = _pair(stride)
ctx.padding = _pair(padding)
ctx.dilation = _pair(dilation)
ctx.groups = groups
ctx.deformable_groups = deformable_groups
ctx.im2col_step = im2col_step
ctx.save_for_backward(input, offset, weight)
output = input.new_empty(
DeformConvFunction._output_size(input, weight, ctx.padding,
ctx.dilation, ctx.stride))
ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones
if not input.is_cuda:
raise NotImplementedError
else:
cur_im2col_step = min(ctx.im2col_step, input.shape[0])
assert (input.shape[0] %
cur_im2col_step) == 0, 'im2col step must divide batchsize'
deform_conv_cuda.deform_conv_forward_cuda(
input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1],
weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0],
ctx.padding[1], ctx.padding[0], ctx.dilation[1],
ctx.dilation[0], ctx.groups, ctx.deformable_groups,
cur_im2col_step)
return output
@staticmethod
def backward(ctx, grad_output):
input, offset, weight = ctx.saved_tensors
grad_input = grad_offset = grad_weight = None
if not grad_output.is_cuda:
raise NotImplementedError
else:
cur_im2col_step = min(ctx.im2col_step, input.shape[0])
assert (input.shape[0] %
cur_im2col_step) == 0, 'im2col step must divide batchsize'
if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
deform_conv_cuda.deform_conv_backward_input_cuda(
input, offset, grad_output, grad_input,
grad_offset, weight, ctx.bufs_[0], weight.size(3),
weight.size(2), ctx.stride[1], ctx.stride[0],
ctx.padding[1], ctx.padding[0], ctx.dilation[1],
ctx.dilation[0], ctx.groups, ctx.deformable_groups,
cur_im2col_step)
if ctx.needs_input_grad[2]:
grad_weight = torch.zeros_like(weight)
deform_conv_cuda.deform_conv_backward_parameters_cuda(
input, offset, grad_output,
grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3),
weight.size(2), ctx.stride[1], ctx.stride[0],
ctx.padding[1], ctx.padding[0], ctx.dilation[1],
ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1,
cur_im2col_step)
return (grad_input, grad_offset, grad_weight, None, None, None, None,
None)
@staticmethod
def _output_size(input, weight, padding, dilation, stride):
channels = weight.size(0)
output_size = (input.size(0), channels)
for d in range(input.dim() - 2):
in_size = input.size(d + 2)
pad = padding[d]
kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
stride_ = stride[d]
output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
if not all(map(lambda s: s > 0, output_size)):
raise ValueError(
"convolution input is too small (output would be {})".format(
'x'.join(map(str, output_size))))
return output_size
class ModulatedDeformConvFunction(Function):
@staticmethod
def forward(ctx,
input,
offset,
mask,
weight,
bias=None,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1):
ctx.stride = stride
ctx.padding = padding
ctx.dilation = dilation
ctx.groups = groups
ctx.deformable_groups = deformable_groups
ctx.with_bias = bias is not None
if not ctx.with_bias:
bias = input.new_empty(1) # fake tensor
if not input.is_cuda:
raise NotImplementedError
if weight.requires_grad or mask.requires_grad or offset.requires_grad \
or input.requires_grad:
ctx.save_for_backward(input, offset, mask, weight, bias)
output = input.new_empty(
ModulatedDeformConvFunction._infer_shape(ctx, input, weight))
ctx._bufs = [input.new_empty(0), input.new_empty(0)]
deform_conv_cuda.modulated_deform_conv_cuda_forward(
input, weight, bias, ctx._bufs[0], offset, mask, output,
ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride,
ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation,
ctx.groups, ctx.deformable_groups, ctx.with_bias)
return output
@staticmethod
def backward(ctx, grad_output):
if not grad_output.is_cuda:
raise NotImplementedError
input, offset, mask, weight, bias = ctx.saved_tensors
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
grad_mask = torch.zeros_like(mask)
grad_weight = torch.zeros_like(weight)
grad_bias = torch.zeros_like(bias)
deform_conv_cuda.modulated_deform_conv_cuda_backward(
input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1],
grad_input, grad_weight, grad_bias, grad_offset, grad_mask,
grad_output, weight.shape[2], weight.shape[3], ctx.stride,
ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation,
ctx.groups, ctx.deformable_groups, ctx.with_bias)
if not ctx.with_bias:
grad_bias = None
return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,
None, None, None, None, None)
@staticmethod
def _infer_shape(ctx, input, weight):
n = input.size(0)
channels_out = weight.size(0)
height, width = input.shape[2:4]
kernel_h, kernel_w = weight.shape[2:4]
height_out = (height + 2 * ctx.padding -
(ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1
width_out = (width + 2 * ctx.padding -
(ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1
return n, channels_out, height_out, width_out
deform_conv = DeformConvFunction.apply
modulated_deform_conv = ModulatedDeformConvFunction.apply
================================================
FILE: mmdet/ops/dcn/functions/deform_pool.py
================================================
import torch
from torch.autograd import Function
from .. import deform_pool_cuda
class DeformRoIPoolingFunction(Function):
@staticmethod
def forward(ctx,
data,
rois,
offset,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0):
ctx.spatial_scale = spatial_scale
ctx.out_size = out_size
ctx.out_channels = out_channels
ctx.no_trans = no_trans
ctx.group_size = group_size
ctx.part_size = out_size if part_size is None else part_size
ctx.sample_per_part = sample_per_part
ctx.trans_std = trans_std
assert 0.0 <= ctx.trans_std <= 1.0
if not data.is_cuda:
raise NotImplementedError
n = rois.shape[0]
output = data.new_empty(n, out_channels, out_size, out_size)
output_count = data.new_empty(n, out_channels, out_size, out_size)
deform_pool_cuda.deform_psroi_pooling_cuda_forward(
data, rois, offset, output, output_count, ctx.no_trans,
ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size,
ctx.part_size, ctx.sample_per_part, ctx.trans_std)
if data.requires_grad or rois.requires_grad or offset.requires_grad:
ctx.save_for_backward(data, rois, offset)
ctx.output_count = output_count
return output
@staticmethod
def backward(ctx, grad_output):
if not grad_output.is_cuda:
raise NotImplementedError
data, rois, offset = ctx.saved_tensors
output_count = ctx.output_count
grad_input = torch.zeros_like(data)
grad_rois = None
grad_offset = torch.zeros_like(offset)
deform_pool_cuda.deform_psroi_pooling_cuda_backward(
grad_output, data, rois, offset, output_count, grad_input,
grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels,
ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part,
ctx.trans_std)
return (grad_input, grad_rois, grad_offset, None, None, None, None,
None, None, None, None)
deform_roi_pooling = DeformRoIPoolingFunction.apply
================================================
FILE: mmdet/ops/dcn/modules/__init__.py
================================================
================================================
FILE: mmdet/ops/dcn/modules/deform_conv.py
================================================
import math
import torch
import torch.nn as nn
from torch.nn.modules.utils import _pair
from ..functions.deform_conv import deform_conv, modulated_deform_conv
class DeformConv(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
bias=False):
assert not bias
super(DeformConv, self).__init__()
assert in_channels % groups == 0, \
'in_channels {} cannot be divisible by groups {}'.format(
in_channels, groups)
assert out_channels % groups == 0, \
'out_channels {} cannot be divisible by groups {}'.format(
out_channels, groups)
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _pair(kernel_size)
self.stride = _pair(stride)
self.padding = _pair(padding)
self.dilation = _pair(dilation)
self.groups = groups
self.deformable_groups = deformable_groups
self.weight = nn.Parameter(
torch.Tensor(out_channels, in_channels // self.groups,
*self.kernel_size))
self.reset_parameters()
def reset_parameters(self):
n = self.in_channels
for k in self.kernel_size:
n *= k
stdv = 1. / math.sqrt(n)
self.weight.data.uniform_(-stdv, stdv)
def forward(self, input, offset):
return deform_conv(input, offset, self.weight, self.stride,
self.padding, self.dilation, self.groups,
self.deformable_groups)
class ModulatedDeformConv(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
bias=True):
super(ModulatedDeformConv, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _pair(kernel_size)
self.stride = stride
self.padding = padding
self.dilation = dilation
self.groups = groups
self.deformable_groups = deformable_groups
self.with_bias = bias
self.weight = nn.Parameter(
torch.Tensor(out_channels, in_channels // groups,
*self.kernel_size))
if bias:
self.bias = nn.Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
n = self.in_channels
for k in self.kernel_size:
n *= k
stdv = 1. / math.sqrt(n)
self.weight.data.uniform_(-stdv, stdv)
if self.bias is not None:
self.bias.data.zero_()
def forward(self, input, offset, mask):
return modulated_deform_conv(
input, offset, mask, self.weight, self.bias, self.stride,
self.padding, self.dilation, self.groups, self.deformable_groups)
class ModulatedDeformConvPack(ModulatedDeformConv):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
bias=True):
super(ModulatedDeformConvPack, self).__init__(
in_channels, out_channels, kernel_size, stride, padding, dilation,
groups, deformable_groups, bias)
self.conv_offset_mask = nn.Conv2d(
self.in_channels // self.groups,
self.deformable_groups * 3 * self.kernel_size[0] *
self.kernel_size[1],
kernel_size=self.kernel_size,
stride=_pair(self.stride),
padding=_pair(self.padding),
bias=True)
self.init_offset()
def init_offset(self):
self.conv_offset_mask.weight.data.zero_()
self.conv_offset_mask.bias.data.zero_()
def forward(self, input):
out = self.conv_offset_mask(input)
o1, o2, mask = torch.chunk(out, 3, dim=1)
offset = torch.cat((o1, o2), dim=1)
mask = torch.sigmoid(mask)
return modulated_deform_conv(
input, offset, mask, self.weight, self.bias, self.stride,
self.padding, self.dilation, self.groups, self.deformable_groups)
================================================
FILE: mmdet/ops/dcn/modules/deform_pool.py
================================================
from torch import nn
from ..functions.deform_pool import deform_roi_pooling
class DeformRoIPooling(nn.Module):
def __init__(self,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0):
super(DeformRoIPooling, self).__init__()
self.spatial_scale = spatial_scale
self.out_size = out_size
self.out_channels = out_channels
self.no_trans = no_trans
self.group_size = group_size
self.part_size = out_size if part_size is None else part_size
self.sample_per_part = sample_per_part
self.trans_std = trans_std
def forward(self, data, rois, offset):
if self.no_trans:
offset = data.new_empty(0)
return deform_roi_pooling(
data, rois, offset, self.spatial_scale, self.out_size,
self.out_channels, self.no_trans, self.group_size, self.part_size,
self.sample_per_part, self.trans_std)
class DeformRoIPoolingPack(DeformRoIPooling):
def __init__(self,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0,
deform_fc_channels=1024):
super(DeformRoIPoolingPack,
self).__init__(spatial_scale, out_size, out_channels, no_trans,
group_size, part_size, sample_per_part, trans_std)
self.deform_fc_channels = deform_fc_channels
if not no_trans:
self.offset_fc = nn.Sequential(
nn.Linear(self.out_size * self.out_size * self.out_channels,
self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels,
self.out_size * self.out_size * 2))
self.offset_fc[-1].weight.data.zero_()
self.offset_fc[-1].bias.data.zero_()
def forward(self, data, rois):
assert data.size(1) == self.out_channels
if self.no_trans:
offset = data.new_empty(0)
return deform_roi_pooling(
data, rois, offset, self.spatial_scale, self.out_size,
self.out_channels, self.no_trans, self.group_size,
self.part_size, self.sample_per_part, self.trans_std)
else:
n = rois.shape[0]
offset = data.new_empty(0)
x = deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels, True,
self.group_size, self.part_size,
self.sample_per_part, self.trans_std)
offset = self.offset_fc(x.view(n, -1))
offset = offset.view(n, 2, self.out_size, self.out_size)
return deform_roi_pooling(
data, rois, offset, self.spatial_scale, self.out_size,
self.out_channels, self.no_trans, self.group_size,
self.part_size, self.sample_per_part, self.trans_std)
class ModulatedDeformRoIPoolingPack(DeformRoIPooling):
def __init__(self,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0,
deform_fc_channels=1024):
super(ModulatedDeformRoIPoolingPack, self).__init__(
spatial_scale, out_size, out_channels, no_trans, group_size,
part_size, sample_per_part, trans_std)
self.deform_fc_channels = deform_fc_channels
if not no_trans:
self.offset_fc = nn.Sequential(
nn.Linear(self.out_size * self.out_size * self.out_channels,
self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels,
self.out_size * self.out_size * 2))
self.offset_fc[-1].weight.data.zero_()
self.offset_fc[-1].bias.data.zero_()
self.mask_fc = nn.Sequential(
nn.Linear(self.out_size * self.out_size * self.out_channels,
self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels,
self.out_size * self.out_size * 1),
nn.Sigmoid())
self.mask_fc[2].weight.data.zero_()
self.mask_fc[2].bias.data.zero_()
def forward(self, data, rois):
assert data.size(1) == self.out_channels
if self.no_trans:
offset = data.new_empty(0)
return deform_roi_pooling(
data, rois, offset, self.spatial_scale, self.out_size,
self.out_channels, self.no_trans, self.group_size,
self.part_size, self.sample_per_part, self.trans_std)
else:
n = rois.shape[0]
offset = data.new_empty(0)
x = deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels, True,
self.group_size, self.part_size,
self.sample_per_part, self.trans_std)
offset = self.offset_fc(x.view(n, -1))
offset = offset.view(n, 2, self.out_size, self.out_size)
mask = self.mask_fc(x.view(n, -1))
mask = mask.view(n, 1, self.out_size, self.out_size)
return deform_roi_pooling(
data, rois, offset, self.spatial_scale, self.out_size,
self.out_channels, self.no_trans, self.group_size,
self.part_size, self.sample_per_part, self.trans_std) * mask
================================================
FILE: mmdet/ops/dcn/setup.py
================================================
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='deform_conv',
ext_modules=[
CUDAExtension('deform_conv_cuda', [
'src/deform_conv_cuda.cpp',
'src/deform_conv_cuda_kernel.cu',
]),
CUDAExtension('deform_pool_cuda', [
'src/deform_pool_cuda.cpp', 'src/deform_pool_cuda_kernel.cu'
]),
],
cmdclass={'build_ext': BuildExtension})
================================================
FILE: mmdet/ops/dcn/src/deform_conv_cuda.cpp
================================================
// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
#include
#include
#include
void deformable_im2col(const at::Tensor data_im,
const at::Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs,
const int deformable_group, at::Tensor data_col);
void deformable_col2im(const at::Tensor data_col,
const at::Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs,
const int deformable_group, at::Tensor grad_im);
void deformable_col2im_coord(const at::Tensor data_col,
const at::Tensor data_im, const at::Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h,
const int pad_w, const int stride_h,
const int stride_w, const int dilation_h,
const int dilation_w, const int parallel_imgs,
const int deformable_group, at::Tensor grad_offset);
void modulated_deformable_im2col_cuda(const at::Tensor data_im, const at::Tensor data_offset,
const at::Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kenerl_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int deformable_group, at::Tensor data_col);
void modulated_deformable_col2im_cuda(const at::Tensor data_col, const at::Tensor data_offset,
const at::Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kenerl_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int deformable_group, at::Tensor grad_im);
void modulated_deformable_col2im_coord_cuda(const at::Tensor data_col, const at::Tensor data_im,
const at::Tensor data_offset, const at::Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int deformable_group, at::Tensor grad_offset,
at::Tensor grad_mask);
void shape_check(at::Tensor input, at::Tensor offset,
at::Tensor *gradOutput, at::Tensor weight, int kH, int kW,
int dH, int dW, int padH, int padW, int dilationH,
int dilationW, int group, int deformable_group)
{
AT_CHECK(weight.ndimension() == 4,
"4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
"but got: %s",
weight.ndimension());
AT_CHECK(weight.is_contiguous(),
"weight tensor has to be contiguous");
AT_CHECK(kW > 0 && kH > 0,
"kernel size should be greater than zero, but got kH: %d kW: %d",
kH, kW);
AT_CHECK((weight.size(2) == kH &&
weight.size(3) == kW),
"kernel size should be consistent with weight, ",
"but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
kW, weight.size(2), weight.size(3));
AT_CHECK(dW > 0 && dH > 0,
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
AT_CHECK(dilationW > 0 && dilationH > 0,
"dilation should be greater than 0, but got dilationH: %d dilationW: %d",
dilationH, dilationW);
int ndim = input.ndimension();
int dimf = 0;
int dimh = 1;
int dimw = 2;
if (ndim == 4)
{
dimf++;
dimh++;
dimw++;
}
AT_CHECK(ndim == 3 || ndim == 4,
"3D or 4D input tensor expected but got: %s", ndim);
long nInputPlane = weight.size(1) * group;
long inputHeight = input.size(dimh);
long inputWidth = input.size(dimw);
long nOutputPlane = weight.size(0);
long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
AT_CHECK(nInputPlane % deformable_group == 0,
"input channels must divide deformable group size");
if (outputWidth < 1 || outputHeight < 1)
AT_ERROR(
"Given input size: (%ld x %ld x %ld). "
"Calculated output size: (%ld x %ld x %ld). Output size is too small",
nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
outputWidth);
AT_CHECK(input.size(1) == nInputPlane,
"invalid number of input planes, expected: %d, but got: %d",
nInputPlane, input.size(1));
AT_CHECK((inputHeight >= kH && inputWidth >= kW),
"input image is smaller than kernel");
AT_CHECK(
(offset.size(2) == outputHeight && offset.size(3) == outputWidth),
"invalid spatial size of offset, expected height: %d width: %d, but got height: %d width: %d",
outputHeight, outputWidth, offset.size(2), offset.size(3));
AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
"invalid number of channels of offset");
if (gradOutput != NULL)
{
AT_CHECK(gradOutput->size(dimf) == nOutputPlane,
"invalid number of gradOutput planes, expected: %d, but got: %d",
nOutputPlane, gradOutput->size(dimf));
AT_CHECK((gradOutput->size(dimh) == outputHeight &&
gradOutput->size(dimw) == outputWidth),
"invalid size of gradOutput, expected height: %d width: %d , but got height: %d width: %d",
outputHeight, outputWidth, gradOutput->size(dimh), gradOutput->size(dimw));
}
}
int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
at::Tensor offset, at::Tensor output,
at::Tensor columns, at::Tensor ones, int kW,
int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, int im2col_step)
{
// todo: resize columns to include im2col: done
// todo: add im2col_step as input
// todo: add new output buffer and transpose it to output (or directly transpose output)
// todo: possibly change data indexing because of parallel_imgs
shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW,
dilationH, dilationW, group, deformable_group);
input = input.contiguous();
offset = offset.contiguous();
weight = weight.contiguous();
int batch = 1;
if (input.ndimension() == 3)
{
// Force batch
batch = 0;
input.unsqueeze_(0);
offset.unsqueeze_(0);
}
// todo: assert batchsize dividable by im2col_step
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = weight.size(0);
long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth});
columns = at::zeros({nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.type());
if (ones.ndimension() != 2 || ones.size(0) * ones.size(1) < outputHeight * outputWidth)
{
ones = at::ones({outputHeight, outputWidth}, input.type());
}
input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth});
offset = offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth});
at::Tensor output_buffer = at::zeros({batchSize / im2col_step, nOutputPlane, im2col_step * outputHeight, outputWidth}, output.type());
output_buffer = output_buffer.view({output_buffer.size(0), group, output_buffer.size(1) / group, output_buffer.size(2), output_buffer.size(3)});
for (int elt = 0; elt < batchSize / im2col_step; elt++)
{
deformable_im2col(
input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW,
im2col_step, deformable_group, columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)});
for (int g = 0; g < group; g++){
output_buffer[elt][g] =
output_buffer[elt][g].flatten(1).addmm_(weight[g].flatten(1), columns[g]).view_as(output_buffer[elt][g]);
}
}
output_buffer = output_buffer.view({output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), output_buffer.size(3), output_buffer.size(4)});
output_buffer = output_buffer.view(
{batchSize / im2col_step, nOutputPlane, im2col_step, outputHeight, outputWidth});
output_buffer.transpose_(1, 2);
output.copy_(output_buffer);
output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
offset = offset.view({batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0)
{
output = output.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
}
return 1;
}
int deform_conv_backward_input_cuda(
at::Tensor input, at::Tensor offset, at::Tensor gradOutput,
at::Tensor gradInput, at::Tensor gradOffset, at::Tensor weight,
at::Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group, int deformable_group, int im2col_step)
{
shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH,
padW, dilationH, dilationW, group, deformable_group);
input = input.contiguous();
offset = offset.contiguous();
gradOutput = gradOutput.contiguous();
weight = weight.contiguous();
int batch = 1;
if (input.ndimension() == 3)
{
// Force batch
batch = 0;
input = input.view({1, input.size(0), input.size(1), input.size(2)});
offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
gradOutput = gradOutput.view({1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
}
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = weight.size(0);
long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
columns = at::zeros({nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.type());
// change order of grad output
gradOutput = gradOutput.view(
{batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth});
gradOutput.transpose_(1, 2);
gradInput = gradInput.view(
{batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth});
input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth});
gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth});
offset = offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++)
{
// divide into groups
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)});
gradOutput = gradOutput.view({gradOutput.size(0), group, gradOutput.size(1) / group, gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
for (int g = 0; g < group; g++){
columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
}
columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradOutput = gradOutput.view({gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
deformable_col2im_coord(
columns, input[elt], offset[elt],
nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
dilationH, dilationW, im2col_step, deformable_group, gradOffset[elt]);
deformable_col2im(
columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step,
deformable_group, gradInput[elt]);
}
gradOutput.transpose_(1, 2);
gradOutput = gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
gradOffset = gradOffset.view({batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
offset = offset.view({batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0)
{
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
gradOffset = gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
}
return 1;
}
int deform_conv_backward_parameters_cuda(
at::Tensor input, at::Tensor offset, at::Tensor gradOutput,
at::Tensor gradWeight, // at::Tensor gradBias,
at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH,
int padW, int padH, int dilationW, int dilationH, int group, int deformable_group,
float scale, int im2col_step)
{
// todo: transpose and reshape outGrad
// todo: reshape columns
// todo: add im2col_step as input
shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW,
padH, padW, dilationH, dilationW, group, deformable_group);
input = input.contiguous();
offset = offset.contiguous();
gradOutput = gradOutput.contiguous();
int batch = 1;
if (input.ndimension() == 3)
{
// Force batch
batch = 0;
input = input.view(at::IntList({1, input.size(0), input.size(1), input.size(2)}));
gradOutput = gradOutput.view({1, gradOutput.size(0),
gradOutput.size(1), gradOutput.size(2)});
}
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = gradWeight.size(0);
long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
columns = at::zeros({nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.type());
gradOutput = gradOutput.view(
{batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth});
gradOutput.transpose_(1, 2);
at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
gradOutputBuffer = gradOutputBuffer.view(
{batchSize / im2col_step, nOutputPlane, im2col_step, outputHeight, outputWidth});
gradOutputBuffer.copy_(gradOutput);
gradOutputBuffer = gradOutputBuffer.view(
{batchSize / im2col_step, nOutputPlane, im2col_step * outputHeight, outputWidth});
gradOutput.transpose_(1, 2);
gradOutput = gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth});
offset = offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW,
outputHeight, outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++)
{
deformable_im2col(
input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW,
im2col_step, deformable_group, columns);
// divide into group
gradOutputBuffer = gradOutputBuffer.view({gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
gradWeight = gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1), gradWeight.size(2), gradWeight.size(3)});
for (int g = 0; g < group; g++){
gradWeight[g] = gradWeight[g].flatten(1).addmm_(
gradOutputBuffer[elt][g].flatten(1), columns[g].transpose(1, 0), 1.0, scale)
.view_as(gradWeight[g]);
}
gradOutputBuffer = gradOutputBuffer.view({gradOutputBuffer.size(0), gradOutputBuffer.size(1) * gradOutputBuffer.size(2), gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), gradWeight.size(2), gradWeight.size(3), gradWeight.size(4)});
}
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
offset = offset.view({batchSize, deformable_group * 2 * kH * kW,
outputHeight, outputWidth});
if (batch == 0)
{
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
}
return 1;
}
void modulated_deform_conv_cuda_forward(at::Tensor input, at::Tensor weight,
at::Tensor bias, at::Tensor ones,
at::Tensor offset, at::Tensor mask,
at::Tensor output, at::Tensor columns,
int kernel_h, int kernel_w,
const int stride_h, const int stride_w,
const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias)
{
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_out = weight.size(0);
const int channels_kernel = weight.size(1);
const int kernel_h_ = weight.size(2);
const int kernel_w_ = weight.size(3);
if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
kernel_h_, kernel_w, kernel_h_, kernel_w_);
if (channels != channels_kernel * group)
AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
channels, channels_kernel * group);
const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out)
{
// Resize plane and fill with ones...
ones = at::ones({height_out, width_out}, input.type());
}
// resize output
output = output.view({batch, channels_out, height_out, width_out}).zero_();
// resize temporary columns
columns = at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.type());
output = output.view({output.size(0), group, output.size(1) / group, output.size(2), output.size(3)});
for (int b = 0; b < batch; b++)
{
modulated_deformable_im2col_cuda(input[b], offset[b], mask[b],
1, channels, height, width,
height_out, width_out, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
deformable_group, columns);
// divide into group
weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)});
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
for (int g = 0; g < group; g++){
output[b][g] = output[b][g].flatten(1).addmm_(weight[g].flatten(1), columns[g]).view_as(output[b][g]);
}
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), weight.size(3), weight.size(4)});
columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)});
}
output = output.view({output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)});
if (with_bias){
output += bias.view({1, bias.size(0), 1, 1});
}
}
void modulated_deform_conv_cuda_backward(at::Tensor input, at::Tensor weight,
at::Tensor bias, at::Tensor ones,
at::Tensor offset, at::Tensor mask,
at::Tensor columns,
at::Tensor grad_input, at::Tensor grad_weight,
at::Tensor grad_bias, at::Tensor grad_offset,
at::Tensor grad_mask, at::Tensor grad_output,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int pad_h, int pad_w,
int dilation_h, int dilation_w, int group,
int deformable_group, const bool with_bias)
{
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_kernel = weight.size(1);
const int kernel_h_ = weight.size(2);
const int kernel_w_ = weight.size(3);
if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
kernel_h_, kernel_w, kernel_h_, kernel_w_);
if (channels != channels_kernel * group)
AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
channels, channels_kernel * group);
const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out)
{
// Resize plane and fill with ones...
ones = at::ones({height_out, width_out}, input.type());
}
grad_input = grad_input.view({batch, channels, height, width});
columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out}, input.type());
grad_output = grad_output.view({grad_output.size(0), group, grad_output.size(1) / group, grad_output.size(2), grad_output.size(3)});
for (int b = 0; b < batch; b++)
{
// divide int group
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)});
for (int g = 0; g < group; g++){
columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), grad_output[b][g].flatten(1), 0.0f, 1.0f);
}
columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)});
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), weight.size(3), weight.size(4)});
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda(columns, input[b], offset[b], mask[b],
1, channels, height, width,
height_out, width_out, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group,
grad_offset[b], grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cuda(columns, offset[b], mask[b],
1, channels, height, width,
height_out, width_out, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group,
grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and group
modulated_deformable_im2col_cuda(input[b], offset[b], mask[b],
1, channels, height, width,
height_out, width_out, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group,
columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
grad_weight = grad_weight.view({group, grad_weight.size(0) / group, grad_weight.size(1), grad_weight.size(2), grad_weight.size(3)});
if (with_bias)
grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
for (int g = 0; g < group; g++){
grad_weight[g] = grad_weight[g].flatten(1).addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)).view_as(grad_weight[g]);
if (with_bias){
grad_bias[g] = grad_bias[g].view({-1, 1}).addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})).view(-1);
}
}
columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)});
grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), grad_weight.size(2), grad_weight.size(3), grad_weight.size(4)});
if (with_bias)
grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
}
grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), grad_output.size(2), grad_output.size(3), grad_output.size(4)});
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("deform_conv_forward_cuda", &deform_conv_forward_cuda, "deform forward (CUDA)");
m.def("deform_conv_backward_input_cuda", &deform_conv_backward_input_cuda,
"deform_conv_backward_input (CUDA)");
m.def("deform_conv_backward_parameters_cuda", &deform_conv_backward_parameters_cuda,
"deform_conv_backward_parameters (CUDA)");
m.def("modulated_deform_conv_cuda_forward", &modulated_deform_conv_cuda_forward,
"modulated deform conv forward (CUDA)");
m.def("modulated_deform_conv_cuda_backward", &modulated_deform_conv_cuda_backward,
"modulated deform conv backward (CUDA)");
}
================================================
FILE: mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
================================================
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer ********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
#include
#include
#include
#include
#include
using namespace at;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N)
{
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
template
__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
const int height, const int width, scalar_t h, scalar_t w)
{
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
scalar_t lh = h - h_low;
scalar_t lw = w - w_low;
scalar_t hh = 1 - lh, hw = 1 - lw;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
v1 = bottom_data[h_low * data_width + w_low];
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = bottom_data[h_low * data_width + w_high];
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = bottom_data[h_high * data_width + w_low];
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = bottom_data[h_high * data_width + w_high];
scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template
__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
const int h, const int w, const int height, const int width)
{
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
{
//empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template
__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
const int height, const int width, const scalar_t *im_data,
const int data_width, const int bp_dir)
{
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
{
//empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (bp_dir == 0)
{
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
}
else if (bp_dir == 1)
{
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template
__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
const int batch_size, const int num_channels, const int deformable_group,
const int height_col, const int width_col,
scalar_t *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
//const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i)
{
for (int j = 0; j < kernel_w; ++j)
{
const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
scalar_t val = static_cast(0);
const scalar_t h_im = h_in + i * dilation_h + offset_h;
const scalar_t w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
{
//const scalar_t map_h = i * dilation_h + offset_h;
//const scalar_t map_w = j * dilation_w + offset_w;
//const int cur_height = height - h_in;
//const int cur_width = width - w_in;
//val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
void deformable_im2col(
const at::Tensor data_im, const at::Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, at::Tensor data_col)
{
// num_axes should be smaller than block size
// todo: check parallel_imgs is correctly passed in
int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.type(), "deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data();
const scalar_t *data_offset_ = data_offset.data();
scalar_t *data_col_ = data_col.data();
deformable_im2col_gpu_kernel<<>>(
num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels, deformable_group,
height_col, width_col, data_col_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in deformable_im2col: %s\n", cudaGetErrorString(err));
}
}
template
__global__ void deformable_col2im_gpu_kernel(
const int n, const scalar_t *data_col, const scalar_t *data_offset,
const int channels, const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group,
const int batch_size, const int deformable_group,
const int height_col, const int width_col,
scalar_t *grad_im)
{
CUDA_KERNEL_LOOP(index, n)
{
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
const scalar_t cur_top_grad = data_col[index];
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++)
{
for (int dx = -2; dx <= 2; dx++)
{
if (cur_h + dy >= 0 && cur_h + dy < height &&
cur_w + dx >= 0 && cur_w + dx < width &&
abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1)
{
int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
scalar_t weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
void deformable_col2im(
const at::Tensor data_col, const at::Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
at::Tensor grad_im)
{
// todo: make sure parallel_imgs is passed in correctly
int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data();
const scalar_t *data_offset_ = data_offset.data();
scalar_t *grad_im_ = grad_im.data();
deformable_col2im_gpu_kernel<<>>(
num_kernels, data_col_, data_offset_, channels, height, width, ksize_h,
ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group,
parallel_imgs, deformable_group, height_col, width_col, grad_im_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in deformable_col2im: %s\n", cudaGetErrorString(err));
}
}
template
__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col,
const scalar_t *data_im, const scalar_t *data_offset,
const int channels, const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group,
const int batch_size, const int offset_channels, const int deformable_group,
const int height_col, const int width_col, scalar_t *grad_offset)
{
CUDA_KERNEL_LOOP(index, n)
{
scalar_t val = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group *
batch_size * width_col * height_col;
const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w * height * width;
const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
{
const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
scalar_t inv_h = h_in + i * dilation_h + offset_h;
scalar_t inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
{
inv_h = inv_w = -2;
}
const scalar_t weight = get_coordinate_weight(
inv_h, inv_w,
height, width, data_im_ptr + cnt * height * width, width, bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[index] = val;
}
}
void deformable_col2im_coord(
const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset,
const int channels, const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, at::Tensor grad_offset)
{
int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs;
int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data();
const scalar_t *data_im_ = data_im.data();
const scalar_t *data_offset_ = data_offset.data();
scalar_t *grad_offset_ = grad_offset.data();
deformable_col2im_coord_gpu_kernel<<>>(
num_kernels, data_col_, data_im_, data_offset_, channels, height, width,
ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group,
parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_);
}));
}
template
__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
const int height, const int width, scalar_t h, scalar_t w)
{
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
scalar_t lh = h - h_low;
scalar_t lw = w - w_low;
scalar_t hh = 1 - lh, hw = 1 - lw;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
v1 = bottom_data[h_low * data_width + w_low];
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = bottom_data[h_low * data_width + w_high];
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = bottom_data[h_high * data_width + w_low];
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = bottom_data[h_high * data_width + w_high];
scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template
__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
const int h, const int w, const int height, const int width)
{
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
{
//empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template
__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
const int height, const int width, const scalar_t *im_data,
const int data_width, const int bp_dir)
{
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
{
//empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (bp_dir == 0)
{
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
}
else if (bp_dir == 1)
{
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template
__global__ void modulated_deformable_im2col_gpu_kernel(const int n,
const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group,
const int batch_size, const int num_channels, const int deformable_group,
const int height_col, const int width_col,
scalar_t *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
//const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i)
{
for (int j = 0; j < kernel_w; ++j)
{
const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
scalar_t val = static_cast(0);
const scalar_t h_im = h_in + i * dilation_h + offset_h;
const scalar_t w_im = w_in + j * dilation_w + offset_w;
//if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
{
//const float map_h = i * dilation_h + offset_h;
//const float map_w = j * dilation_w + offset_w;
//const int cur_height = height - h_in;
//const int cur_width = width - w_in;
//val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
//data_col_ptr += height_col * width_col;
}
}
}
}
template
__global__ void modulated_deformable_col2im_gpu_kernel(const int n,
const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask,
const int channels, const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group,
const int batch_size, const int deformable_group,
const int height_col, const int width_col,
scalar_t *grad_im)
{
CUDA_KERNEL_LOOP(index, n)
{
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
const scalar_t cur_top_grad = data_col[index] * mask;
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++)
{
for (int dx = -2; dx <= 2; dx++)
{
if (cur_h + dy >= 0 && cur_h + dy < height &&
cur_w + dx >= 0 && cur_w + dx < width &&
abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1)
{
int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
template
__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
const scalar_t *data_col, const scalar_t *data_im,
const scalar_t *data_offset, const scalar_t *data_mask,
const int channels, const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group,
const int batch_size, const int offset_channels, const int deformable_group,
const int height_col, const int width_col,
scalar_t *grad_offset, scalar_t *grad_mask)
{
CUDA_KERNEL_LOOP(index, n)
{
scalar_t val = 0, mval = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
{
const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
scalar_t inv_h = h_in + i * dilation_h + offset_h;
scalar_t inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
{
inv_h = inv_w = -2;
}
else
{
mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
}
const scalar_t weight = dmcn_get_coordinate_weight(
inv_h, inv_w,
height, width, data_im_ptr + cnt * height * width, width, bp_dir);
val += weight * data_col_ptr[col_pos] * mask;
cnt += 1;
}
// KERNEL_ASSIGN(grad_offset[index], offset_req, val);
grad_offset[index] = val;
if (offset_c % 2 == 0)
// KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
}
}
void modulated_deformable_im2col_cuda(
const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
const int batch_size, const int channels, const int height_im, const int width_im,
const int height_col, const int width_col, const int kernel_h, const int kenerl_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int deformable_group, at::Tensor data_col)
{
// num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.type(), "modulated_deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data();
const scalar_t *data_offset_ = data_offset.data();
const scalar_t *data_mask_ = data_mask.data();
scalar_t *data_col_ = data_col.data();
modulated_deformable_im2col_gpu_kernel<<>>(
num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
batch_size, channels, deformable_group, height_col, width_col, data_col_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
}
}
void modulated_deformable_col2im_cuda(
const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask,
const int batch_size, const int channels, const int height_im, const int width_im,
const int height_col, const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int deformable_group, at::Tensor grad_im)
{
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "modulated_deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data();
const scalar_t *data_offset_ = data_offset.data();
const scalar_t *data_mask_ = data_mask.data();
scalar_t *grad_im_ = grad_im.data();
modulated_deformable_col2im_gpu_kernel<<>>(
num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im,
kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group,
batch_size, deformable_group, height_col, width_col, grad_im_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
}
}
void modulated_deformable_col2im_coord_cuda(
const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
const int batch_size, const int channels, const int height_im, const int width_im,
const int height_col, const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int deformable_group,
at::Tensor grad_offset, at::Tensor grad_mask)
{
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;
const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data();
const scalar_t *data_im_ = data_im.data();
const scalar_t *data_offset_ = data_offset.data();
const scalar_t *data_mask_ = data_mask.data();
scalar_t *grad_offset_ = grad_offset.data();
scalar_t *grad_mask_ = grad_mask.data();
modulated_deformable_col2im_coord_gpu_kernel<<>>(
num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im,
kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group,
batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col,
grad_offset_, grad_mask_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err));
}
}
================================================
FILE: mmdet/ops/dcn/src/deform_pool_cuda.cpp
================================================
// author: Charles Shang
// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob /mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c
#include
#include
#include
void DeformablePSROIPoolForward(const at::Tensor data,
const at::Tensor bbox,
const at::Tensor trans,
at::Tensor out,
at::Tensor top_count,
const int batch,
const int channels,
const int height,
const int width,
const int num_bbox,
const int channels_trans,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std);
void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
const at::Tensor data,
const at::Tensor bbox,
const at::Tensor trans,
const at::Tensor top_count,
at::Tensor in_grad,
at::Tensor trans_grad,
const int batch,
const int channels,
const int height,
const int width,
const int num_bbox,
const int channels_trans,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std);
void deform_psroi_pooling_cuda_forward(at::Tensor input, at::Tensor bbox,
at::Tensor trans,
at::Tensor out, at::Tensor top_count,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std)
{
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_trans = no_trans ? 2 : trans.size(1);
const int num_bbox = bbox.size(0);
if (num_bbox != out.size(0))
AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
out.size(0), num_bbox);
DeformablePSROIPoolForward(input, bbox, trans, out, top_count,
batch, channels, height, width,
num_bbox,
channels_trans,
no_trans,
spatial_scale,
output_dim,
group_size,
pooled_size,
part_size,
sample_per_part,
trans_std);
}
void deform_psroi_pooling_cuda_backward(at::Tensor out_grad,
at::Tensor input, at::Tensor bbox,
at::Tensor trans, at::Tensor top_count,
at::Tensor input_grad, at::Tensor trans_grad,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std)
{
AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_trans = no_trans ? 2 : trans.size(1);
const int num_bbox = bbox.size(0);
if (num_bbox != out_grad.size(0))
AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
out_grad.size(0), num_bbox);
DeformablePSROIPoolBackwardAcc(out_grad,
input,
bbox,
trans,
top_count,
input_grad,
trans_grad,
batch, channels, height, width, num_bbox,
channels_trans,
no_trans,
spatial_scale,
output_dim,
group_size,
pooled_size,
part_size,
sample_per_part,
trans_std);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward,
"deform psroi pooling forward(CUDA)");
m.def("deform_psroi_pooling_cuda_backward", &deform_psroi_pooling_cuda_backward,
"deform psroi pooling backward(CUDA)");
}
================================================
FILE: mmdet/ops/dcn/src/deform_pool_cuda_kernel.cu
================================================
/*!
* Copyright (c) 2017 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file deformable_psroi_pooling.cu
* \brief
* \author Yi Li, Guodong Zhang, Jifeng Dai
*/
/***************** Adapted by Charles Shang *********************/
// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu
#include
#include
#include
#include
#include
using namespace at;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N)
{
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
template
__device__ scalar_t bilinear_interp(
const scalar_t *data,
const scalar_t x,
const scalar_t y,
const int width,
const int height)
{
int x1 = floor(x);
int x2 = ceil(x);
int y1 = floor(y);
int y2 = ceil(y);
scalar_t dist_x = (scalar_t)(x - x1);
scalar_t dist_y = (scalar_t)(y - y1);
scalar_t value11 = data[y1 * width + x1];
scalar_t value12 = data[y2 * width + x1];
scalar_t value21 = data[y1 * width + x2];
scalar_t value22 = data[y2 * width + x2];
scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22;
return value;
}
template
__global__ void DeformablePSROIPoolForwardKernel(
const int count,
const scalar_t *bottom_data,
const scalar_t spatial_scale,
const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
const scalar_t *bottom_rois, const scalar_t *bottom_trans,
const int no_trans,
const scalar_t trans_std,
const int sample_per_part,
const int output_dim,
const int group_size,
const int part_size,
const int num_classes,
const int channels_each_class,
scalar_t *top_data,
scalar_t *top_count)
{
CUDA_KERNEL_LOOP(index, count)
{
// The output is in order (n, ctop, ph, pw)
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int ctop = (index / pooled_width / pooled_height) % output_dim;
int n = index / pooled_width / pooled_height / output_dim;
// [start, end) interval for spatial sampling
const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
// Force too small ROIs to be 1x1
scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
// Compute w and h at bottom
scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
int class_id = ctop / channels_each_class;
scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
wstart += trans_x * roi_width;
scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
hstart += trans_y * roi_height;
scalar_t sum = 0;
int count = 0;
int gw = floor((scalar_t)(pw)*group_size / pooled_width);
int gh = floor((scalar_t)(ph)*group_size / pooled_height);
gw = min(max(gw, 0), group_size - 1);
gh = min(max(gh, 0), group_size - 1);
const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
for (int ih = 0; ih < sample_per_part; ih++)
{
for (int iw = 0; iw < sample_per_part; iw++)
{
scalar_t w = wstart + iw * sub_bin_size_w;
scalar_t h = hstart + ih * sub_bin_size_h;
// bilinear interpolation
if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
{
continue;
}
w = min(max(w, 0.), width - 1.);
h = min(max(h, 0.), height - 1.);
int c = (ctop * group_size + gh) * group_size + gw;
scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height);
sum += val;
count++;
}
}
top_data[index] = count == 0 ? (scalar_t)(0) : sum / count;
top_count[index] = count;
}
}
template
__global__ void DeformablePSROIPoolBackwardAccKernel(
const int count,
const scalar_t *top_diff,
const scalar_t *top_count,
const int num_rois,
const scalar_t spatial_scale,
const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
const int output_dim,
scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff,
const scalar_t *bottom_data,
const scalar_t *bottom_rois,
const scalar_t *bottom_trans,
const int no_trans,
const scalar_t trans_std,
const int sample_per_part,
const int group_size,
const int part_size,
const int num_classes,
const int channels_each_class)
{
CUDA_KERNEL_LOOP(index, count)
{
// The output is in order (n, ctop, ph, pw)
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int ctop = (index / pooled_width / pooled_height) % output_dim;
int n = index / pooled_width / pooled_height / output_dim;
// [start, end) interval for spatial sampling
const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
// Force too small ROIs to be 1x1
scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
// Compute w and h at bottom
scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
int class_id = ctop / channels_each_class;
scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
wstart += trans_x * roi_width;
scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
hstart += trans_y * roi_height;
if (top_count[index] <= 0)
{
continue;
}
scalar_t diff_val = top_diff[index] / top_count[index];
const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width;
scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width;
int gw = floor((scalar_t)(pw)*group_size / pooled_width);
int gh = floor((scalar_t)(ph)*group_size / pooled_height);
gw = min(max(gw, 0), group_size - 1);
gh = min(max(gh, 0), group_size - 1);
for (int ih = 0; ih < sample_per_part; ih++)
{
for (int iw = 0; iw < sample_per_part; iw++)
{
scalar_t w = wstart + iw * sub_bin_size_w;
scalar_t h = hstart + ih * sub_bin_size_h;
// bilinear interpolation
if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
{
continue;
}
w = min(max(w, 0.), width - 1.);
h = min(max(h, 0.), height - 1.);
int c = (ctop * group_size + gh) * group_size + gw;
// backward on feature
int x0 = floor(w);
int x1 = ceil(w);
int y0 = floor(h);
int y1 = ceil(h);
scalar_t dist_x = w - x0, dist_y = h - y0;
scalar_t q00 = (1 - dist_x) * (1 - dist_y);
scalar_t q01 = (1 - dist_x) * dist_y;
scalar_t q10 = dist_x * (1 - dist_y);
scalar_t q11 = dist_x * dist_y;
int bottom_index_base = c * height * width;
atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val);
atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val);
atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val);
atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);
if (no_trans)
{
continue;
}
scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val;
diff_x *= roi_width;
scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val;
diff_y *= roi_height;
atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x);
atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);
}
}
}
}
void DeformablePSROIPoolForward(const at::Tensor data,
const at::Tensor bbox,
const at::Tensor trans,
at::Tensor out,
at::Tensor top_count,
const int batch,
const int channels,
const int height,
const int width,
const int num_bbox,
const int channels_trans,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std)
{
const int pooled_height = pooled_size;
const int pooled_width = pooled_size;
const int count = num_bbox * output_dim * pooled_height * pooled_width;
const int num_classes = no_trans ? 1 : channels_trans / 2;
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data.type(), "deformable_psroi_pool_forward", ([&] {
const scalar_t *bottom_data = data.data();
const scalar_t *bottom_rois = bbox.data();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data();
scalar_t *top_data = out.data();
scalar_t *top_count_data = top_count.data();
DeformablePSROIPoolForwardKernel<<>>(
count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width,
bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim,
group_size, part_size, num_classes, channels_each_class, top_data, top_count_data);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err));
}
}
void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
const at::Tensor data,
const at::Tensor bbox,
const at::Tensor trans,
const at::Tensor top_count,
at::Tensor in_grad,
at::Tensor trans_grad,
const int batch,
const int channels,
const int height,
const int width,
const int num_bbox,
const int channels_trans,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std)
{
// LOG(INFO) << "DeformablePSROIPoolBackward";
const int num_rois = num_bbox;
const int pooled_height = pooled_size;
const int pooled_width = pooled_size;
const int count = num_bbox * output_dim * pooled_height * pooled_width;
const int num_classes = no_trans ? 1 : channels_trans / 2;
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
out_grad.type(), "deformable_psroi_pool_backward_acc", ([&] {
const scalar_t *top_diff = out_grad.data();
const scalar_t *bottom_data = data.data();
const scalar_t *bottom_rois = bbox.data();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data();
scalar_t *bottom_data_diff = in_grad.data();
scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data();
const scalar_t *top_count_data = top_count.data();
DeformablePSROIPoolBackwardAccKernel<<>>(
count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width,
pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff,
bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part,
group_size, part_size, num_classes, channels_each_class);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err));
}
}
================================================
FILE: mmdet/ops/nms/.gitignore
================================================
*.cpp
================================================
FILE: mmdet/ops/nms/Makefile
================================================
PYTHON=${PYTHON:-python}
all:
echo "Compiling nms kernels..."
$(PYTHON) setup.py build_ext --inplace
clean:
rm -f *.so
================================================
FILE: mmdet/ops/nms/__init__.py
================================================
from .nms_wrapper import nms, soft_nms
__all__ = ['nms', 'soft_nms']
================================================
FILE: mmdet/ops/nms/cpu_nms.pyx
================================================
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
import numpy as np
cimport numpy as np
cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
return a if a >= b else b
cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
return a if a <= b else b
def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]
cdef int ndets = dets.shape[0]
cdef np.ndarray[np.int_t, ndim=1] suppressed = \
np.zeros((ndets), dtype=np.int)
# nominal indices
cdef int _i, _j
# sorted indices
cdef int i, j
# temp variables for box i's (the box currently under consideration)
cdef np.float32_t ix1, iy1, ix2, iy2, iarea
# variables for computing overlap with box j (lower scoring box)
cdef np.float32_t xx1, yy1, xx2, yy2
cdef np.float32_t w, h
cdef np.float32_t inter, ovr
keep = []
for _i in range(ndets):
i = order[_i]
if suppressed[i] == 1:
continue
keep.append(i)
ix1 = x1[i]
iy1 = y1[i]
ix2 = x2[i]
iy2 = y2[i]
iarea = areas[i]
for _j in range(_i + 1, ndets):
j = order[_j]
if suppressed[j] == 1:
continue
xx1 = max(ix1, x1[j])
yy1 = max(iy1, y1[j])
xx2 = min(ix2, x2[j])
yy2 = min(iy2, y2[j])
w = max(0.0, xx2 - xx1 + 1)
h = max(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (iarea + areas[j] - inter)
if ovr >= thresh:
suppressed[j] = 1
return keep
================================================
FILE: mmdet/ops/nms/cpu_soft_nms.pyx
================================================
# ----------------------------------------------------------
# Soft-NMS: Improving Object Detection With One Line of Code
# Copyright (c) University of Maryland, College Park
# Licensed under The MIT License [see LICENSE for details]
# Written by Navaneeth Bodla and Bharat Singh
# Modified by Kai Chen
# ----------------------------------------------------------
import numpy as np
cimport numpy as np
cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
return a if a >= b else b
cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
return a if a <= b else b
def cpu_soft_nms(
np.ndarray[float, ndim=2] boxes_in,
float iou_thr,
unsigned int method=1,
float sigma=0.5,
float min_score=0.001,
):
boxes = boxes_in.copy()
cdef unsigned int N = boxes.shape[0]
cdef float iw, ih, box_area
cdef float ua
cdef int pos = 0
cdef float maxscore = 0
cdef int maxpos = 0
cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov
inds = np.arange(N)
for i in range(N):
maxscore = boxes[i, 4]
maxpos = i
tx1 = boxes[i, 0]
ty1 = boxes[i, 1]
tx2 = boxes[i, 2]
ty2 = boxes[i, 3]
ts = boxes[i, 4]
ti = inds[i]
pos = i + 1
# get max box
while pos < N:
if maxscore < boxes[pos, 4]:
maxscore = boxes[pos, 4]
maxpos = pos
pos = pos + 1
# add max box as a detection
boxes[i, 0] = boxes[maxpos, 0]
boxes[i, 1] = boxes[maxpos, 1]
boxes[i, 2] = boxes[maxpos, 2]
boxes[i, 3] = boxes[maxpos, 3]
boxes[i, 4] = boxes[maxpos, 4]
inds[i] = inds[maxpos]
# swap ith box with position of max box
boxes[maxpos, 0] = tx1
boxes[maxpos, 1] = ty1
boxes[maxpos, 2] = tx2
boxes[maxpos, 3] = ty2
boxes[maxpos, 4] = ts
inds[maxpos] = ti
tx1 = boxes[i, 0]
ty1 = boxes[i, 1]
tx2 = boxes[i, 2]
ty2 = boxes[i, 3]
ts = boxes[i, 4]
pos = i + 1
# NMS iterations, note that N changes if detection boxes fall below
# threshold
while pos < N:
x1 = boxes[pos, 0]
y1 = boxes[pos, 1]
x2 = boxes[pos, 2]
y2 = boxes[pos, 3]
s = boxes[pos, 4]
area = (x2 - x1 + 1) * (y2 - y1 + 1)
iw = (min(tx2, x2) - max(tx1, x1) + 1)
if iw > 0:
ih = (min(ty2, y2) - max(ty1, y1) + 1)
if ih > 0:
ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih)
ov = iw * ih / ua # iou between max box and detection box
if method == 1: # linear
if ov > iou_thr:
weight = 1 - ov
else:
weight = 1
elif method == 2: # gaussian
weight = np.exp(-(ov * ov) / sigma)
else: # original NMS
if ov > iou_thr:
weight = 0
else:
weight = 1
boxes[pos, 4] = weight * boxes[pos, 4]
# if box score falls below threshold, discard the box by
# swapping with last box update N
if boxes[pos, 4] < min_score:
boxes[pos, 0] = boxes[N-1, 0]
boxes[pos, 1] = boxes[N-1, 1]
boxes[pos, 2] = boxes[N-1, 2]
boxes[pos, 3] = boxes[N-1, 3]
boxes[pos, 4] = boxes[N-1, 4]
inds[pos] = inds[N - 1]
N = N - 1
pos = pos - 1
pos = pos + 1
return boxes[:N], inds[:N]
================================================
FILE: mmdet/ops/nms/gpu_nms.hpp
================================================
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id, size_t base);
size_t nms_Malloc();
================================================
FILE: mmdet/ops/nms/gpu_nms.pyx
================================================
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
import numpy as np
cimport numpy as np
assert sizeof(int) == sizeof(np.int32_t)
cdef extern from "gpu_nms.hpp":
void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int, size_t) nogil
size_t nms_Malloc() nogil
memory_pool = {}
def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
np.int32_t device_id=0):
cdef int boxes_num = dets.shape[0]
cdef int boxes_dim = 5
cdef int num_out
cdef size_t base
cdef np.ndarray[np.int32_t, ndim=1] \
keep = np.zeros(boxes_num, dtype=np.int32)
cdef np.ndarray[np.float32_t, ndim=1] \
scores = dets[:, 4]
cdef np.ndarray[np.int_t, ndim=1] \
order = scores.argsort()[::-1]
cdef np.ndarray[np.float32_t, ndim=2] \
sorted_dets = dets[order, :5]
cdef float cthresh = thresh
if device_id not in memory_pool:
with nogil:
base = nms_Malloc()
memory_pool[device_id] = base
# print "malloc", base
base = memory_pool[device_id]
with nogil:
_nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, cthresh, device_id, base)
keep = keep[:num_out]
return list(order[keep])
================================================
FILE: mmdet/ops/nms/nms_kernel.cu
================================================
// ------------------------------------------------------------------
// Faster R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Shaoqing Ren
// ------------------------------------------------------------------
#include
#include
#include
#include "gpu_nms.hpp"
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define MULTIPLIER 16
#define LONGLONG_SIZE 64
int const threadsPerBlock =
sizeof(unsigned long long) * 8 *
MULTIPLIER; // number of bits for a long long variable
__device__ inline float devIoU(float const* const a, float const* const b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f),
height = max(bottom - top + 1, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
const float* dev_boxes,
unsigned long long* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
unsigned long long ts[MULTIPLIER];
if (threadIdx.x < row_size) {
#pragma unroll
for (int i = 0; i < MULTIPLIER; ++i) {
ts[i] = 0;
}
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const float* cur_box = dev_boxes + cur_box_idx * 5;
int i = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
ts[i / LONGLONG_SIZE] |= 1ULL << (i % LONGLONG_SIZE);
}
}
const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
#pragma unroll
for (int i = 0; i < MULTIPLIER; ++i) {
dev_mask[(cur_box_idx * col_blocks + col_start) * MULTIPLIER + i] =
ts[i];
}
}
}
void _set_device(int device_id) {
int current_device;
CUDA_CHECK(cudaGetDevice(¤t_device));
if (current_device == device_id) {
return;
}
// The call to cudaSetDevice must come before any calls to Get, which
// may perform initialization using the GPU.
CUDA_CHECK(cudaSetDevice(device_id));
}
const size_t MEMORY_SIZE = 500000000;
size_t nms_Malloc() {
float* boxes_dev = NULL;
CUDA_CHECK(cudaMalloc(&boxes_dev, MEMORY_SIZE));
return size_t(boxes_dev);
}
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id, size_t base) {
_set_device(device_id);
float* boxes_dev = NULL;
unsigned long long* mask_dev = NULL;
const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
if (base > 0) {
size_t require_mem =
boxes_num * boxes_dim * sizeof(float) +
boxes_num * col_blocks * sizeof(unsigned long long) * MULTIPLIER;
if (require_mem >= MEMORY_SIZE) {
std::cout << "require_mem: " << require_mem << std::endl;
}
boxes_dev = (float*)(base);
mask_dev =
(unsigned long long*)(base +
512 * ((unsigned long long)(boxes_num *
boxes_dim *
sizeof(float) /
512) +
1));
} else {
CUDA_CHECK(
cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float)));
CUDA_CHECK(cudaMalloc(&mask_dev, MULTIPLIER * boxes_num * col_blocks *
sizeof(unsigned long long)));
}
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
boxes_num * boxes_dim * sizeof(float),
cudaMemcpyHostToDevice));
dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
DIVUP(boxes_num, threadsPerBlock));
dim3 threads(threadsPerBlock);
nms_kernel<<>>(boxes_num, nms_overlap_thresh, boxes_dev,
mask_dev);
std::vector mask_host(boxes_num * col_blocks *
MULTIPLIER);
CUDA_CHECK(cudaMemcpy(
&mask_host[0], mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks * MULTIPLIER,
cudaMemcpyDeviceToHost));
std::vector remv(col_blocks * MULTIPLIER);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks * MULTIPLIER);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
int offset = inblock / LONGLONG_SIZE;
int bit_pos = inblock % LONGLONG_SIZE;
if (!(remv[nblock * MULTIPLIER + offset] & (1ULL << bit_pos))) {
keep_out[num_to_keep++] = i;
unsigned long long* p = &mask_host[0] + i * col_blocks * MULTIPLIER;
for (int j = nblock * MULTIPLIER + offset;
j < col_blocks * MULTIPLIER; j++) {
remv[j] |= p[j];
}
}
}
*num_out = num_to_keep;
if (!base) {
CUDA_CHECK(cudaFree(boxes_dev));
CUDA_CHECK(cudaFree(mask_dev));
}
}
================================================
FILE: mmdet/ops/nms/nms_wrapper.py
================================================
import numpy as np
import torch
from .gpu_nms import gpu_nms
from .cpu_nms import cpu_nms
from .cpu_soft_nms import cpu_soft_nms
def nms(dets, iou_thr, device_id=None):
"""Dispatch to either CPU or GPU NMS implementations."""
if isinstance(dets, torch.Tensor):
is_tensor = True
if dets.is_cuda:
device_id = dets.get_device()
dets_np = dets.detach().cpu().numpy()
elif isinstance(dets, np.ndarray):
is_tensor = False
dets_np = dets
else:
raise TypeError(
'dets must be either a Tensor or numpy array, but got {}'.format(
type(dets)))
if dets_np.shape[0] == 0:
inds = []
else:
inds = (gpu_nms(dets_np, iou_thr, device_id=device_id)
if device_id is not None else cpu_nms(dets_np, iou_thr))
if is_tensor:
inds = dets.new_tensor(inds, dtype=torch.long)
else:
inds = np.array(inds, dtype=np.int64)
return dets[inds, :], inds
def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3):
if isinstance(dets, torch.Tensor):
is_tensor = True
dets_np = dets.detach().cpu().numpy()
elif isinstance(dets, np.ndarray):
is_tensor = False
dets_np = dets
else:
raise TypeError(
'dets must be either a Tensor or numpy array, but got {}'.format(
type(dets)))
method_codes = {'linear': 1, 'gaussian': 2}
if method not in method_codes:
raise ValueError('Invalid method for SoftNMS: {}'.format(method))
new_dets, inds = cpu_soft_nms(
dets_np,
iou_thr,
method=method_codes[method],
sigma=sigma,
min_score=min_score)
if is_tensor:
return dets.new_tensor(new_dets), dets.new_tensor(
inds, dtype=torch.long)
else:
return new_dets.astype(np.float32), inds.astype(np.int64)
================================================
FILE: mmdet/ops/nms/setup.py
================================================
import os.path as osp
from distutils.core import setup, Extension
import numpy as np
from Cython.Build import cythonize
from Cython.Distutils import build_ext
# extensions
ext_args = dict(
include_dirs=[np.get_include()],
language='c++',
extra_compile_args={
'cc': ['-Wno-unused-function', '-Wno-write-strings'],
'nvcc': ['-c', '--compiler-options', '-fPIC'],
},
)
extensions = [
Extension('cpu_nms', ['cpu_nms.pyx'], **ext_args),
Extension('cpu_soft_nms', ['cpu_soft_nms.pyx'], **ext_args),
Extension('gpu_nms', ['gpu_nms.pyx', 'nms_kernel.cu'], **ext_args),
]
def customize_compiler_for_nvcc(self):
"""inject deep into distutils to customize how the dispatch
to cc/nvcc works.
If you subclass UnixCCompiler, it's not trivial to get your subclass
injected in, and still have the right customizations (i.e.
distutils.sysconfig.customize_compiler) run on it. So instead of going
the OO route, I have this. Note, it's kindof like a wierd functional
subclassing going on."""
# tell the compiler it can processes .cu
self.src_extensions.append('.cu')
# save references to the default compiler_so and _comple methods
default_compiler_so = self.compiler_so
super = self._compile
# now redefine the _compile method. This gets executed for each
# object but distutils doesn't have the ability to change compilers
# based on source extension: we add it.
def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
if osp.splitext(src)[1] == '.cu':
# use the cuda for .cu files
self.set_executable('compiler_so', 'nvcc')
# use only a subset of the extra_postargs, which are 1-1 translated
# from the extra_compile_args in the Extension class
postargs = extra_postargs['nvcc']
else:
postargs = extra_postargs['cc']
super(obj, src, ext, cc_args, postargs, pp_opts)
# reset the default compiler_so, which we might have changed for cuda
self.compiler_so = default_compiler_so
# inject our redefined _compile method into the class
self._compile = _compile
# run the customize_compiler
class custom_build_ext(build_ext):
def build_extensions(self):
customize_compiler_for_nvcc(self.compiler)
build_ext.build_extensions(self)
setup(
name='nms',
cmdclass={'build_ext': custom_build_ext},
ext_modules=cythonize(extensions),
)
================================================
FILE: mmdet/ops/roi_align/__init__.py
================================================
from .functions.roi_align import roi_align
from .modules.roi_align import RoIAlign
__all__ = ['roi_align', 'RoIAlign']
================================================
FILE: mmdet/ops/roi_align/functions/__init__.py
================================================
================================================
FILE: mmdet/ops/roi_align/functions/roi_align.py
================================================
from torch.autograd import Function, Variable
from .. import roi_align_cuda
class RoIAlignFunction(Function):
@staticmethod
def forward(ctx, features, rois, out_size, spatial_scale, sample_num=0):
if isinstance(out_size, int):
out_h = out_size
out_w = out_size
elif isinstance(out_size, tuple):
assert len(out_size) == 2
assert isinstance(out_size[0], int)
assert isinstance(out_size[1], int)
out_h, out_w = out_size
else:
raise TypeError(
'"out_size" must be an integer or tuple of integers')
ctx.spatial_scale = spatial_scale
ctx.sample_num = sample_num
ctx.save_for_backward(rois)
ctx.feature_size = features.size()
batch_size, num_channels, data_height, data_width = features.size()
num_rois = rois.size(0)
output = features.new_zeros(num_rois, num_channels, out_h, out_w)
if features.is_cuda:
roi_align_cuda.forward(features, rois, out_h, out_w, spatial_scale,
sample_num, output)
else:
raise NotImplementedError
return output
@staticmethod
def backward(ctx, grad_output):
feature_size = ctx.feature_size
spatial_scale = ctx.spatial_scale
sample_num = ctx.sample_num
rois = ctx.saved_tensors[0]
assert (feature_size is not None and grad_output.is_cuda)
batch_size, num_channels, data_height, data_width = feature_size
out_w = grad_output.size(3)
out_h = grad_output.size(2)
grad_input = grad_rois = None
if ctx.needs_input_grad[0]:
grad_input = Variable(
rois.new(batch_size, num_channels, data_height, data_width)
.zero_())
roi_align_cuda.backward(grad_output, rois, out_h, out_w,
spatial_scale, sample_num, grad_input)
return grad_input, grad_rois, None, None, None
roi_align = RoIAlignFunction.apply
================================================
FILE: mmdet/ops/roi_align/gradcheck.py
================================================
import numpy as np
import torch
from torch.autograd import gradcheck
import os.path as osp
import sys
sys.path.append(osp.abspath(osp.join(__file__, '../../')))
from roi_align import RoIAlign # noqa: E402
feat_size = 15
spatial_scale = 1.0 / 8
img_size = feat_size / spatial_scale
num_imgs = 2
num_rois = 20
batch_ind = np.random.randint(num_imgs, size=(num_rois, 1))
rois = np.random.rand(num_rois, 4) * img_size * 0.5
rois[:, 2:] += img_size * 0.5
rois = np.hstack((batch_ind, rois))
feat = torch.randn(
num_imgs, 16, feat_size, feat_size, requires_grad=True, device='cuda:0')
rois = torch.from_numpy(rois).float().cuda()
inputs = (feat, rois)
print('Gradcheck for roi align...')
test = gradcheck(RoIAlign(3, spatial_scale), inputs, atol=1e-3, eps=1e-3)
print(test)
test = gradcheck(RoIAlign(3, spatial_scale, 2), inputs, atol=1e-3, eps=1e-3)
print(test)
================================================
FILE: mmdet/ops/roi_align/modules/__init__.py
================================================
================================================
FILE: mmdet/ops/roi_align/modules/roi_align.py
================================================
from torch.nn.modules.module import Module
from ..functions.roi_align import RoIAlignFunction
class RoIAlign(Module):
def __init__(self, out_size, spatial_scale, sample_num=0):
super(RoIAlign, self).__init__()
self.out_size = out_size
self.spatial_scale = float(spatial_scale)
self.sample_num = int(sample_num)
def forward(self, features, rois):
return RoIAlignFunction.apply(features, rois, self.out_size,
self.spatial_scale, self.sample_num)
================================================
FILE: mmdet/ops/roi_align/setup.py
================================================
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='roi_align_cuda',
ext_modules=[
CUDAExtension('roi_align_cuda', [
'src/roi_align_cuda.cpp',
'src/roi_align_kernel.cu',
]),
],
cmdclass={'build_ext': BuildExtension})
================================================
FILE: mmdet/ops/roi_align/src/roi_align_cuda.cpp
================================================
#include
#include
#include
int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
const float spatial_scale, const int sample_num,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
at::Tensor output);
int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
const float spatial_scale, const int sample_num,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
at::Tensor bottom_grad);
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
int roi_align_forward_cuda(at::Tensor features, at::Tensor rois,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
at::Tensor output) {
CHECK_INPUT(features);
CHECK_INPUT(rois);
CHECK_INPUT(output);
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 5) {
printf("wrong roi size\n");
return 0;
}
int num_channels = features.size(1);
int data_height = features.size(2);
int data_width = features.size(3);
ROIAlignForwardLaucher(features, rois, spatial_scale, sample_num,
num_channels, data_height, data_width, num_rois,
pooled_height, pooled_width, output);
return 1;
}
int roi_align_backward_cuda(at::Tensor top_grad, at::Tensor rois,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
at::Tensor bottom_grad) {
CHECK_INPUT(top_grad);
CHECK_INPUT(rois);
CHECK_INPUT(bottom_grad);
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 5) {
printf("wrong roi size\n");
return 0;
}
int num_channels = bottom_grad.size(1);
int data_height = bottom_grad.size(2);
int data_width = bottom_grad.size(3);
ROIAlignBackwardLaucher(top_grad, rois, spatial_scale, sample_num,
num_channels, data_height, data_width, num_rois,
pooled_height, pooled_width, bottom_grad);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roi_align_forward_cuda, "Roi_Align forward (CUDA)");
m.def("backward", &roi_align_backward_cuda, "Roi_Align backward (CUDA)");
}
================================================
FILE: mmdet/ops/roi_align/src/roi_align_kernel.cu
================================================
#include
#include
using namespace at; // temporal fix for pytorch<=0.4.1 (see #9848)
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
#define THREADS_PER_BLOCK 1024
inline int GET_BLOCKS(const int N) {
int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
int max_block_num = 65000;
return min(optimal_block_num, max_block_num);
}
template
__device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
const int height, const int width,
scalar_t y, scalar_t x) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
return 0;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (scalar_t)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (scalar_t)x_low;
} else {
x_high = x_low + 1;
}
scalar_t ly = y - y_low;
scalar_t lx = x - x_low;
scalar_t hy = 1. - ly;
scalar_t hx = 1. - lx;
// do bilinear interpolation
scalar_t lt = bottom_data[y_low * width + x_low];
scalar_t rt = bottom_data[y_low * width + x_high];
scalar_t lb = bottom_data[y_high * width + x_low];
scalar_t rb = bottom_data[y_high * width + x_high];
scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb);
return val;
}
template
__global__ void ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
const scalar_t *bottom_rois,
const scalar_t spatial_scale,
const int sample_num, const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
scalar_t *top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the aligned output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale;
scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale;
scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale;
scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale;
// Force malformed ROIs to be 1x1
scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.);
scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.);
scalar_t bin_size_h = roi_height / pooled_height;
scalar_t bin_size_w = roi_width / pooled_width;
const scalar_t *offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
int sample_num_h = (sample_num > 0)
? sample_num
: ceil(roi_height / pooled_height); // e.g., = 2
int sample_num_w =
(sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
scalar_t h = (scalar_t)(ph + 0.5) * bin_size_h + roi_start_h;
scalar_t w = (scalar_t)(pw + 0.5) * bin_size_w + roi_start_w;
int hstart = fminf(floor(h), height - 2);
int wstart = fminf(floor(w), width - 2);
scalar_t output_val = 0;
for (int iy = 0; iy < sample_num_h; iy++) {
const scalar_t y = roi_start_h + ph * bin_size_h +
(scalar_t)(iy + scalar_t(.5f)) * bin_size_h /
(scalar_t)(sample_num_h);
for (int ix = 0; ix < sample_num_w; ix++) {
const scalar_t x = roi_start_w + pw * bin_size_w +
(scalar_t)(ix + scalar_t(.5f)) * bin_size_w /
(scalar_t)(sample_num_w);
scalar_t val = bilinear_interpolate(offset_bottom_data,
height, width, y, x);
output_val += val;
}
}
output_val /= (sample_num_h * sample_num_w);
top_data[index] = output_val;
}
}
int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
const float spatial_scale, const int sample_num,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
at::Tensor output) {
const int output_size = num_rois * pooled_height * pooled_width * channels;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.type(), "ROIAlignLaucherForward", ([&] {
const scalar_t *bottom_data = features.data();
const scalar_t *rois_data = rois.data();
scalar_t *top_data = output.data();
ROIAlignForward
<<>>(
output_size, bottom_data, rois_data, scalar_t(spatial_scale),
sample_num, channels, height, width, pooled_height,
pooled_width, top_data);
}));
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
return 1;
}
template
__device__ void bilinear_interpolate_gradient(const int height, const int width,
scalar_t y, scalar_t x,
scalar_t &w1, scalar_t &w2,
scalar_t &w3, scalar_t &w4,
int &x_low, int &x_high,
int &y_low, int &y_high) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (scalar_t)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (scalar_t)x_low;
} else {
x_high = x_low + 1;
}
scalar_t ly = y - y_low;
scalar_t lx = x - x_low;
scalar_t hy = 1. - ly;
scalar_t hx = 1. - lx;
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template
__global__ void ROIAlignBackward(
const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
const scalar_t spatial_scale, const int sample_num, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, scalar_t *bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the aligned output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale;
scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale;
scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale;
scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale;
// Force malformed ROIs to be 1x1
scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.);
scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.);
scalar_t bin_size_h = roi_height / pooled_height;
scalar_t bin_size_w = roi_width / pooled_width;
scalar_t *offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int offset_top = (n * channels + c) * pooled_height * pooled_width +
ph * pooled_width + pw;
scalar_t offset_top_diff = top_diff[offset_top];
int sample_num_h = (sample_num > 0)
? sample_num
: ceil(roi_height / pooled_height); // e.g., = 2
int sample_num_w =
(sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
const scalar_t count = (scalar_t)(sample_num_h * sample_num_w);
scalar_t h = (scalar_t)(ph + 0.5) * bin_size_h + roi_start_h;
scalar_t w = (scalar_t)(pw + 0.5) * bin_size_w + roi_start_w;
int hstart = fminf(floor(h), height - 2);
int wstart = fminf(floor(w), width - 2);
for (int iy = 0; iy < sample_num_h; iy++) {
const scalar_t y =
roi_start_h + ph * bin_size_h +
(scalar_t)(iy + .5f) * bin_size_h / (scalar_t)(sample_num_h);
for (int ix = 0; ix < sample_num_w; ix++) {
const scalar_t x =
roi_start_w + pw * bin_size_w +
(scalar_t)(ix + .5f) * bin_size_w / (scalar_t)(sample_num_w);
scalar_t w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
scalar_t g1 = offset_top_diff * w1 / count;
scalar_t g2 = offset_top_diff * w2 / count;
scalar_t g3 = offset_top_diff * w3 / count;
scalar_t g4 = offset_top_diff * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);
atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);
atomicAdd(offset_bottom_diff + y_high * width + x_low, g3);
atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);
}
}
}
}
}
int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
const float spatial_scale, const int sample_num,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
at::Tensor bottom_grad) {
const int output_size = num_rois * pooled_height * pooled_width * channels;
// TODO: use AT_DISPATCH_FLOATING_TYPES_AND_HALF when atomicAdd is resolved
AT_DISPATCH_FLOATING_TYPES(
top_grad.type(), "ROIAlignLaucherBackward", ([&] {
const scalar_t *top_diff = top_grad.data();
const scalar_t *rois_data = rois.data();
scalar_t *bottom_diff = bottom_grad.data();
if (sizeof(scalar_t) == sizeof(double)) {
fprintf(stderr, "double is not supported\n");
exit(-1);
}
ROIAlignBackward
<<>>(
output_size, top_diff, rois_data, spatial_scale, sample_num,
channels, height, width, pooled_height, pooled_width,
bottom_diff);
}));
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
return 1;
}
================================================
FILE: mmdet/ops/roi_pool/__init__.py
================================================
from .functions.roi_pool import roi_pool
from .modules.roi_pool import RoIPool
__all__ = ['roi_pool', 'RoIPool']
================================================
FILE: mmdet/ops/roi_pool/functions/__init__.py
================================================
================================================
FILE: mmdet/ops/roi_pool/functions/roi_pool.py
================================================
import torch
from torch.autograd import Function
from .. import roi_pool_cuda
class RoIPoolFunction(Function):
@staticmethod
def forward(ctx, features, rois, out_size, spatial_scale):
if isinstance(out_size, int):
out_h = out_size
out_w = out_size
elif isinstance(out_size, tuple):
assert len(out_size) == 2
assert isinstance(out_size[0], int)
assert isinstance(out_size[1], int)
out_h, out_w = out_size
else:
raise TypeError(
'"out_size" must be an integer or tuple of integers')
assert features.is_cuda
ctx.save_for_backward(rois)
num_channels = features.size(1)
num_rois = rois.size(0)
out_size = (num_rois, num_channels, out_h, out_w)
output = features.new_zeros(*out_size)
argmax = features.new_zeros(*out_size, dtype=torch.int)
roi_pool_cuda.forward(features, rois, out_h, out_w, spatial_scale,
output, argmax)
ctx.spatial_scale = spatial_scale
ctx.feature_size = features.size()
ctx.argmax = argmax
return output
@staticmethod
def backward(ctx, grad_output):
assert grad_output.is_cuda
spatial_scale = ctx.spatial_scale
feature_size = ctx.feature_size
argmax = ctx.argmax
rois = ctx.saved_tensors[0]
assert feature_size is not None
grad_input = grad_rois = None
if ctx.needs_input_grad[0]:
grad_input = grad_output.new(feature_size).zero_()
roi_pool_cuda.backward(grad_output, rois, argmax, spatial_scale,
grad_input)
return grad_input, grad_rois, None, None
roi_pool = RoIPoolFunction.apply
================================================
FILE: mmdet/ops/roi_pool/gradcheck.py
================================================
import torch
from torch.autograd import gradcheck
import os.path as osp
import sys
sys.path.append(osp.abspath(osp.join(__file__, '../../')))
from roi_pool import RoIPool # noqa: E402
feat = torch.randn(4, 16, 15, 15, requires_grad=True).cuda()
rois = torch.Tensor([[0, 0, 0, 50, 50], [0, 10, 30, 43, 55],
[1, 67, 40, 110, 120]]).cuda()
inputs = (feat, rois)
print('Gradcheck for roi pooling...')
test = gradcheck(RoIPool(4, 1.0 / 8), inputs, eps=1e-5, atol=1e-3)
print(test)
================================================
FILE: mmdet/ops/roi_pool/modules/__init__.py
================================================
================================================
FILE: mmdet/ops/roi_pool/modules/roi_pool.py
================================================
from torch.nn.modules.module import Module
from ..functions.roi_pool import roi_pool
class RoIPool(Module):
def __init__(self, out_size, spatial_scale):
super(RoIPool, self).__init__()
self.out_size = out_size
self.spatial_scale = float(spatial_scale)
def forward(self, features, rois):
return roi_pool(features, rois, self.out_size, self.spatial_scale)
================================================
FILE: mmdet/ops/roi_pool/setup.py
================================================
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='roi_pool',
ext_modules=[
CUDAExtension('roi_pool_cuda', [
'src/roi_pool_cuda.cpp',
'src/roi_pool_kernel.cu',
])
],
cmdclass={'build_ext': BuildExtension})
================================================
FILE: mmdet/ops/roi_pool/src/roi_pool_cuda.cpp
================================================
#include
#include
#include
int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois,
const float spatial_scale, const int channels,
const int height, const int width, const int num_rois,
const int pooled_h, const int pooled_w,
at::Tensor output, at::Tensor argmax);
int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
const at::Tensor argmax, const float spatial_scale,
const int batch_size, const int channels,
const int height, const int width,
const int num_rois, const int pooled_h,
const int pooled_w, at::Tensor bottom_grad);
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
int roi_pooling_forward_cuda(at::Tensor features, at::Tensor rois,
int pooled_height, int pooled_width,
float spatial_scale, at::Tensor output,
at::Tensor argmax) {
CHECK_INPUT(features);
CHECK_INPUT(rois);
CHECK_INPUT(output);
CHECK_INPUT(argmax);
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 5) {
printf("wrong roi size\n");
return 0;
}
int channels = features.size(1);
int height = features.size(2);
int width = features.size(3);
ROIPoolForwardLaucher(features, rois, spatial_scale, channels, height, width,
num_rois, pooled_height, pooled_width, output, argmax);
return 1;
}
int roi_pooling_backward_cuda(at::Tensor top_grad, at::Tensor rois,
at::Tensor argmax, float spatial_scale,
at::Tensor bottom_grad) {
CHECK_INPUT(top_grad);
CHECK_INPUT(rois);
CHECK_INPUT(argmax);
CHECK_INPUT(bottom_grad);
int pooled_height = top_grad.size(2);
int pooled_width = top_grad.size(3);
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 5) {
printf("wrong roi size\n");
return 0;
}
int batch_size = bottom_grad.size(0);
int channels = bottom_grad.size(1);
int height = bottom_grad.size(2);
int width = bottom_grad.size(3);
ROIPoolBackwardLaucher(top_grad, rois, argmax, spatial_scale, batch_size,
channels, height, width, num_rois, pooled_height,
pooled_width, bottom_grad);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roi_pooling_forward_cuda, "Roi_Pooling forward (CUDA)");
m.def("backward", &roi_pooling_backward_cuda, "Roi_Pooling backward (CUDA)");
}
================================================
FILE: mmdet/ops/roi_pool/src/roi_pool_kernel.cu
================================================
#include
#include
using namespace at; // temporal fix for pytorch<=0.4.1 (see #9848)
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
#define THREADS_PER_BLOCK 1024
inline int GET_BLOCKS(const int N) {
int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
int max_block_num = 65000;
return min(optimal_block_num, max_block_num);
}
template
__global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data,
const scalar_t *rois,
const scalar_t spatial_scale, const int channels,
const int height, const int width,
const int pooled_h, const int pooled_w,
scalar_t *top_data, int *argmax_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_w;
int ph = (index / pooled_w) % pooled_h;
int c = (index / pooled_w / pooled_h) % channels;
int n = index / pooled_w / pooled_h / channels;
const scalar_t *offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// calculate the roi region on feature maps
scalar_t roi_x1 = offset_rois[1] * spatial_scale;
scalar_t roi_y1 = offset_rois[2] * spatial_scale;
scalar_t roi_x2 = (offset_rois[3] + 1) * spatial_scale;
scalar_t roi_y2 = (offset_rois[4] + 1) * spatial_scale;
// force malformed rois to be 1x1
scalar_t roi_w = roi_x2 - roi_x1;
scalar_t roi_h = roi_y2 - roi_y1;
if (roi_w <= 0 || roi_h <= 0) continue;
scalar_t bin_size_w = roi_w / static_cast(pooled_w);
scalar_t bin_size_h = roi_h / static_cast(pooled_h);
// the corresponding bin region
int bin_x1 = floor(static_cast(pw) * bin_size_w + roi_x1);
int bin_y1 = floor(static_cast(ph) * bin_size_h + roi_y1);
int bin_x2 = ceil(static_cast(pw + 1) * bin_size_w + roi_x1);
int bin_y2 = ceil(static_cast(ph + 1) * bin_size_h + roi_y1);
// add roi offsets and clip to input boundaries
bin_x1 = min(max(bin_x1, 0), width);
bin_y1 = min(max(bin_y1, 0), height);
bin_x2 = min(max(bin_x2, 0), width);
bin_y2 = min(max(bin_y2, 0), height);
bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1);
// If nothing is pooled, argmax = -1 causes nothing to be backprop'd
int max_idx = -1;
bottom_data += (roi_batch_ind * channels + c) * height * width;
// Define an empty pooling region to be zero
scalar_t max_val = is_empty ? static_cast(0)
: bottom_data[bin_y1 * width + bin_x1] - 1;
for (int h = bin_y1; h < bin_y2; ++h) {
for (int w = bin_x1; w < bin_x2; ++w) {
int offset = h * width + w;
if (bottom_data[offset] > max_val) {
max_val = bottom_data[offset];
max_idx = offset;
}
}
}
top_data[index] = max_val;
if (argmax_data != NULL) argmax_data[index] = max_idx;
}
}
int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois,
const float spatial_scale, const int channels,
const int height, const int width, const int num_rois,
const int pooled_h, const int pooled_w,
at::Tensor output, at::Tensor argmax) {
const int output_size = num_rois * channels * pooled_h * pooled_w;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.type(), "ROIPoolLaucherForward", ([&] {
const scalar_t *bottom_data = features.data();
const scalar_t *rois_data = rois.data();
scalar_t *top_data = output.data();
int *argmax_data = argmax.data();
ROIPoolForward
<<>>(
output_size, bottom_data, rois_data, scalar_t(spatial_scale),
channels, height, width, pooled_h, pooled_w, top_data,
argmax_data);
}));
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
return 1;
}
template
__global__ void ROIPoolBackward(const int nthreads, const scalar_t *top_diff,
const scalar_t *rois, const int *argmax_data,
const scalar_t spatial_scale,
const int channels, const int height,
const int width, const int pooled_h,
const int pooled_w, scalar_t *bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int pw = index % pooled_w;
int ph = (index / pooled_w) % pooled_h;
int c = (index / pooled_w / pooled_h) % channels;
int n = index / pooled_w / pooled_h / channels;
int roi_batch_ind = rois[n * 5];
int bottom_index = argmax_data[(n * channels + c) * pooled_h * pooled_w +
ph * pooled_w + pw];
atomicAdd(bottom_diff + (roi_batch_ind * channels + c) * height * width +
bottom_index,
top_diff[index]);
}
}
int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
const at::Tensor argmax, const float spatial_scale,
const int batch_size, const int channels,
const int height, const int width,
const int num_rois, const int pooled_h,
const int pooled_w, at::Tensor bottom_grad) {
const int output_size = num_rois * pooled_h * pooled_w * channels;
// TODO: use AT_DISPATCH_FLOATING_TYPES_AND_HALF when atomicAdd is resolved
AT_DISPATCH_FLOATING_TYPES(
top_grad.type(), "ROIPoolLaucherBackward", ([&] {
const scalar_t *top_diff = top_grad.data();
const scalar_t *rois_data = rois.data();
const int *argmax_data = argmax.data();
scalar_t *bottom_diff = bottom_grad.data();
if (sizeof(scalar_t) == sizeof(double)) {
fprintf(stderr, "double is not supported\n");
exit(-1);
}
ROIPoolBackward
<<>>(
output_size, top_diff, rois_data, argmax_data,
scalar_t(spatial_scale), channels, height, width, pooled_h,
pooled_w, bottom_diff);
}));
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
return 1;
}
================================================
FILE: setup.py
================================================
import os
import subprocess
import time
from setuptools import find_packages, setup
def readme():
with open('README.md', encoding='utf-8') as f:
content = f.read()
return content
MAJOR = 0
MINOR = 5
PATCH = 7
SUFFIX = ''
SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX)
version_file = 'mmdet/version.py'
def get_git_hash():
def _minimal_ext_cmd(cmd):
# construct minimal environment
env = {}
for k in ['SYSTEMROOT', 'PATH', 'HOME']:
v = os.environ.get(k)
if v is not None:
env[k] = v
# LANGUAGE is used on win32
env['LANGUAGE'] = 'C'
env['LANG'] = 'C'
env['LC_ALL'] = 'C'
out = subprocess.Popen(
cmd, stdout=subprocess.PIPE, env=env).communicate()[0]
return out
try:
out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
sha = out.strip().decode('ascii')
except OSError:
sha = 'unknown'
return sha
def get_hash():
if os.path.exists('.git'):
sha = get_git_hash()[:7]
elif os.path.exists(version_file):
try:
from mmdet.version import __version__
sha = __version__.split('+')[-1]
except ImportError:
raise ImportError('Unable to get git version')
else:
sha = 'unknown'
return sha
def write_version_py():
content = """# GENERATED VERSION FILE
# TIME: {}
__version__ = '{}'
short_version = '{}'
"""
sha = get_hash()
VERSION = SHORT_VERSION + '+' + sha
with open(version_file, 'w') as f:
f.write(content.format(time.asctime(), VERSION, SHORT_VERSION))
def get_version():
with open(version_file, 'r') as f:
exec(compile(f.read(), version_file, 'exec'))
return locals()['__version__']
if __name__ == '__main__':
write_version_py()
setup(
name='mmdet',
version=get_version(),
description='Open MMLab Detection Toolbox',
long_description=readme(),
keywords='computer vision, object detection',
url='https://github.com/open-mmlab/mmdetection',
packages=find_packages(exclude=('configs', 'tools', 'demo')),
package_data={'mmdet.ops': ['*/*.so']},
classifiers=[
'Development Status :: 4 - Beta',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
],
license='GPLv3',
setup_requires=['pytest-runner'],
tests_require=['pytest'],
install_requires=[
'mmcv', 'numpy', 'matplotlib', 'six', 'terminaltables',
'pycocotools'
],
zip_safe=False)
================================================
FILE: tools/coco_eval.py
================================================
from argparse import ArgumentParser
from mmdet.core import coco_eval
def main():
parser = ArgumentParser(description='COCO Evaluation')
parser.add_argument('result', help='result file path')
parser.add_argument('--ann', help='annotation file path')
parser.add_argument(
'--types',
type=str,
nargs='+',
choices=['proposal_fast', 'proposal', 'bbox', 'segm', 'keypoint'],
default=['bbox'],
help='result types')
parser.add_argument(
'--max-dets',
type=int,
nargs='+',
default=[100, 300, 1000],
help='proposal numbers, only used for recall evaluation')
args = parser.parse_args()
coco_eval(args.result, args.types, args.ann, args.max_dets)
if __name__ == '__main__':
main()
================================================
FILE: tools/convert_datasets/pascal_voc.py
================================================
import argparse
import os.path as osp
import xml.etree.ElementTree as ET
import mmcv
import numpy as np
from mmdet.core import voc_classes
label_ids = {name: i + 1 for i, name in enumerate(voc_classes())}
def parse_xml(args):
xml_path, img_path = args
tree = ET.parse(xml_path)
root = tree.getroot()
size = root.find('size')
w = int(size.find('width').text)
h = int(size.find('height').text)
bboxes = []
labels = []
bboxes_ignore = []
labels_ignore = []
for obj in root.findall('object'):
name = obj.find('name').text
label = label_ids[name]
difficult = int(obj.find('difficult').text)
bnd_box = obj.find('bndbox')
bbox = [
int(bnd_box.find('xmin').text),
int(bnd_box.find('ymin').text),
int(bnd_box.find('xmax').text),
int(bnd_box.find('ymax').text)
]
if difficult:
bboxes_ignore.append(bbox)
labels_ignore.append(label)
else:
bboxes.append(bbox)
labels.append(label)
if not bboxes:
bboxes = np.zeros((0, 4))
labels = np.zeros((0, ))
else:
bboxes = np.array(bboxes, ndmin=2) - 1
labels = np.array(labels)
if not bboxes_ignore:
bboxes_ignore = np.zeros((0, 4))
labels_ignore = np.zeros((0, ))
else:
bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1
labels_ignore = np.array(labels_ignore)
annotation = {
'filename': img_path,
'width': w,
'height': h,
'ann': {
'bboxes': bboxes.astype(np.float32),
'labels': labels.astype(np.int64),
'bboxes_ignore': bboxes_ignore.astype(np.float32),
'labels_ignore': labels_ignore.astype(np.int64)
}
}
return annotation
def cvt_annotations(devkit_path, years, split, out_file):
if not isinstance(years, list):
years = [years]
annotations = []
for year in years:
filelist = osp.join(devkit_path, 'VOC{}/ImageSets/Main/{}.txt'.format(
year, split))
if not osp.isfile(filelist):
print('filelist does not exist: {}, skip voc{} {}'.format(
filelist, year, split))
return
img_names = mmcv.list_from_file(filelist)
xml_paths = [
osp.join(devkit_path, 'VOC{}/Annotations/{}.xml'.format(
year, img_name)) for img_name in img_names
]
img_paths = [
'VOC{}/JPEGImages/{}.jpg'.format(year, img_name)
for img_name in img_names
]
part_annotations = mmcv.track_progress(parse_xml,
list(zip(xml_paths, img_paths)))
annotations.extend(part_annotations)
mmcv.dump(annotations, out_file)
return annotations
def parse_args():
parser = argparse.ArgumentParser(
description='Convert PASCAL VOC annotations to mmdetection format')
parser.add_argument('devkit_path', help='pascal voc devkit path')
parser.add_argument('-o', '--out-dir', help='output path')
args = parser.parse_args()
return args
def main():
args = parse_args()
devkit_path = args.devkit_path
out_dir = args.out_dir if args.out_dir else devkit_path
mmcv.mkdir_or_exist(out_dir)
years = []
if osp.isdir(osp.join(devkit_path, 'VOC2007')):
years.append('2007')
if osp.isdir(osp.join(devkit_path, 'VOC2012')):
years.append('2012')
if '2007' in years and '2012' in years:
years.append(['2007', '2012'])
if not years:
raise IOError('The devkit path {} contains neither "VOC2007" nor '
'"VOC2012" subfolder'.format(devkit_path))
for year in years:
if year == '2007':
prefix = 'voc07'
elif year == '2012':
prefix = 'voc12'
elif year == ['2007', '2012']:
prefix = 'voc0712'
for split in ['train', 'val', 'trainval']:
dataset_name = prefix + '_' + split
print('processing {} ...'.format(dataset_name))
cvt_annotations(devkit_path, year, split,
osp.join(out_dir, dataset_name + '.pkl'))
if not isinstance(year, list):
dataset_name = prefix + '_test'
print('processing {} ...'.format(dataset_name))
cvt_annotations(devkit_path, year, 'test',
osp.join(out_dir, dataset_name + '.pkl'))
print('Done!')
if __name__ == '__main__':
main()
================================================
FILE: tools/dist_train.sh
================================================
#!/usr/bin/env bash
PYTHON=${PYTHON:-"python"}
$PYTHON -m torch.distributed.launch --nproc_per_node=$2 $(dirname "$0")/train.py $1 --launcher pytorch ${@:3}
================================================
FILE: tools/graph/new_vg_big_graph_a.pkl
================================================
[File too large to display: 68.7 MB]
================================================
FILE: tools/graph/new_vg_big_graph_r.pkl
================================================
[File too large to display: 68.7 MB]
================================================
FILE: tools/test.py
================================================
import argparse
import torch
import mmcv
from mmcv.runner import load_checkpoint, parallel_test, obj_from_dict
from mmcv.parallel import scatter, collate, MMDataParallel
from mmdet import datasets
from mmdet.core import results2json, coco_eval
from mmdet.datasets import build_dataloader
from mmdet.models import build_detector, detectors
def single_test(model, data_loader, show=False):
model.eval()
results = []
dataset = data_loader.dataset
prog_bar = mmcv.ProgressBar(len(dataset))
for i, data in enumerate(data_loader):
with torch.no_grad():
result = model(return_loss=False, rescale=not show, **data)
results.append(result)
if show:
model.module.show_result(data, result, dataset.img_norm_cfg,
dataset='vg', score_thr=0.4, save_num='work_dirs/fpn_hkrm/0.3_vghkrm_%08d'%i + '.jpg')
# dataset=dataset.CLASSES)
batch_size = data['img'][0].size(0)
for _ in range(batch_size):
prog_bar.update()
return results
def _data_func(data, device_id):
data = scatter(collate([data], samples_per_gpu=1), [device_id])[0]
return dict(return_loss=False, rescale=True, **data)
def parse_args():
parser = argparse.ArgumentParser(description='MMDet test detector')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument(
'--gpus', default=1, type=int, help='GPU number used for testing')
parser.add_argument(
'--proc_per_gpu',
default=1,
type=int,
help='Number of processes per GPU')
parser.add_argument('--out', help='output result file')
parser.add_argument(
'--eval',
type=str,
nargs='+',
choices=['proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'],
help='eval types')
parser.add_argument('--show', action='store_true', help='show results')
args = parser.parse_args()
return args
def main():
args = parse_args()
if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
raise ValueError('The output file must be a pkl file.')
cfg = mmcv.Config.fromfile(args.config)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
cfg.data.test.test_mode = True
dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True))
if args.gpus == 1:
model = build_detector(
cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
load_checkpoint(model, args.checkpoint, strict=True)
model = MMDataParallel(model, device_ids=[0])
data_loader = build_dataloader(
dataset,
imgs_per_gpu=1,
workers_per_gpu=cfg.data.workers_per_gpu,
num_gpus=1,
dist=False,
shuffle=False)
outputs = single_test(model, data_loader, args.show)
else:
model_args = cfg.model.copy()
model_args.update(train_cfg=None, test_cfg=cfg.test_cfg)
model_type = getattr(detectors, model_args.pop('type'))
outputs = parallel_test(
model_type,
model_args,
args.checkpoint,
dataset,
_data_func,
range(args.gpus),
workers_per_gpu=args.proc_per_gpu)
if args.out:
print('writing results to {}'.format(args.out))
mmcv.dump(outputs, args.out)
eval_types = args.eval
if eval_types:
print('Starting evaluate {}'.format(' and '.join(eval_types)))
if eval_types == ['proposal_fast']:
result_file = args.out
coco_eval(result_file, eval_types, dataset.coco)
else:
if not isinstance(outputs[0], dict):
result_file = args.out + '.json'
results2json(dataset, outputs, result_file)
coco_eval(result_file, eval_types, dataset.coco)
else:
for name in outputs[0]:
print('\nEvaluating {}'.format(name))
outputs_ = [out[name] for out in outputs]
result_file = args.out + '.{}.json'.format(name)
results2json(dataset, outputs_, result_file)
coco_eval(result_file, eval_types, dataset.coco)
if __name__ == '__main__':
main()
================================================
FILE: tools/train.py
================================================
from __future__ import division
import argparse
from mmcv import Config
from mmdet import __version__
from mmdet.datasets import get_dataset
from mmdet.apis import (train_detector, init_dist, get_root_logger,
set_random_seed)
from mmdet.models import build_detector
import torch
def parse_args():
parser = argparse.ArgumentParser(description='Train a detector')
parser.add_argument('config', help='train config file path')
parser.add_argument('--work_dir', help='the dir to save logs and models')
parser.add_argument(
'--resume_from', help='the checkpoint file to resume from')
parser.add_argument(
'--validate',
action='store_true',
help='whether to evaluate the checkpoint during training')
parser.add_argument(
'--gpus',
type=int,
default=1,
help='number of gpus to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
# update configs according to CLI args
if args.work_dir is not None:
cfg.work_dir = args.work_dir
if args.resume_from is not None:
cfg.resume_from = args.resume_from
cfg.gpus = args.gpus
if cfg.checkpoint_config is not None:
# save mmdet version in checkpoints as meta data
cfg.checkpoint_config.meta = dict(
mmdet_version=__version__, config=cfg.text)
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# init logger before other steps
logger = get_root_logger(cfg.log_level)
logger.info('Distributed training: {}'.format(distributed))
# set random seeds
if args.seed is not None:
logger.info('Set random seed to {}'.format(args.seed))
set_random_seed(args.seed)
model = build_detector(
cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
train_dataset = get_dataset(cfg.data.train)
train_detector(
model,
train_dataset,
cfg,
distributed=distributed,
validate=args.validate,
logger=logger)
if __name__ == '__main__':
main()
================================================
FILE: tools/vis_subgraph.py
================================================
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
sns.set(font_scale=1.2)
graph_a = pickle.load(open('/home/cyan/code/mmdetection/tools/graph/new_COCO_graph_a.pkl', 'rb'))
graph_r = pickle.load(open('/home/cyan/code/mmdetection/tools/graph/new_COCO_graph_r.pkl', 'rb'))
graph_a = np.float32(graph_a)
graph_r = np.float32(graph_r)
CLASSES = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic_light', 'fire_hydrant',
'stop_sign', 'parking_meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports_ball', 'kite', 'baseball_bat',
'baseball_glove', 'skateboard', 'surfboard', 'tennis_racket',
'bottle', 'wine_glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'hot_dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave',
'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
'vase', 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush']
#graph_r = 1 - graph_r
#graph_a = 1 - graph_a
start = 43
end = 53
f, (ax1,ax2) = plt.subplots(figsize=(10,5), ncols=2)
sns.heatmap(graph_r[start:end,start:end], cmap=cm.Blues, annot=True, annot_kws={'size':8}, cbar_kws={"shrink":0.2}, ax=ax1, linewidths = 0.02, xticklabels=CLASSES[start:end], yticklabels=CLASSES[start:end], square=True)
labelx = ax1.get_xticklabels()
plt.setp(labelx, rotation=30, horizontalalignment='right')
ax1.set_title("Visualization of Relation Subgraph")
sns.heatmap(graph_a[start:end,start:end], cmap=cm.Blues, annot=True, annot_kws={'size':8}, cbar_kws={"shrink":0.2}, ax=ax2, linewidths = 0.02, xticklabels=CLASSES[start:end], yticklabels=CLASSES[start:end], square=True)
labelx = ax2.get_xticklabels()
plt.setp(labelx, rotation=30, horizontalalignment='right')
ax2.set_title("Visualization of Attribute Subgraph")
plt.savefig('./work_dirs/vis/subgraph2.png')
plt.show()
================================================
FILE: tools/voc_eval.py
================================================
from argparse import ArgumentParser
import mmcv
import numpy as np
from mmdet import datasets
from mmdet.core import eval_map
def voc_eval(result_file, dataset, iou_thr=0.5):
det_results = mmcv.load(result_file)
gt_bboxes = []
gt_labels = []
gt_ignore = []
for i in range(len(dataset)):
ann = dataset.get_ann_info(i)
bboxes = ann['bboxes']
labels = ann['labels']
if 'bboxes_ignore' in ann:
ignore = np.concatenate([
np.zeros(bboxes.shape[0], dtype=np.bool),
np.ones(ann['bboxes_ignore'].shape[0], dtype=np.bool)
])
gt_ignore.append(ignore)
bboxes = np.vstack([bboxes, ann['bboxes_ignore']])
labels = np.concatenate([labels, ann['labels_ignore']])
gt_bboxes.append(bboxes)
gt_labels.append(labels)
if not gt_ignore:
gt_ignore = gt_ignore
if hasattr(dataset, 'year') and dataset.year == 2007:
dataset_name = 'voc07'
else:
dataset_name = dataset.CLASSES
eval_map(
det_results,
gt_bboxes,
gt_labels,
gt_ignore=gt_ignore,
scale_ranges=None,
iou_thr=iou_thr,
dataset=dataset_name,
print_summary=True)
def main():
parser = ArgumentParser(description='VOC Evaluation')
parser.add_argument('result', help='result file path')
parser.add_argument('config', help='config file path')
parser.add_argument(
'--iou-thr',
type=float,
default=0.5,
help='IoU threshold for evaluation')
args = parser.parse_args()
cfg = mmcv.Config.fromfile(args.config)
test_dataset = mmcv.runner.obj_from_dict(cfg.data.test, datasets)
voc_eval(args.result, test_dataset, args.iou_thr)
if __name__ == '__main__':
main()